## Wie funktioniert MapReduce?

In [1]:
def find_longest_string(list_of_strings):
    """Return the longest string in ``list_of_strings``.

    The input is scanned once from left to right. When several strings share
    the maximum length, the first one encountered is returned.

    Args:
        list_of_strings: An iterable of strings (any objects supporting
            ``len`` work).

    Returns:
        The longest element, or ``None`` if the iterable is empty.
    """
    longest_string = None
    # Start below any real length so that empty strings can be selected too;
    # starting at 0 would make [''] return None instead of ''.
    longest_string_len = -1
    for s in list_of_strings:
        s_len = len(s)
        if s_len > longest_string_len:
            longest_string_len = s_len
            longest_string = s
    return longest_string

In [2]:
list_of_strings = ['abc', 'python', 'dima']
%time max_length = print(find_longest_string(list_of_strings))

python
Wall time: 305 µs


In [3]:
# Scale the input up by repeating the sample list 1000 times.
large_list_of_strings = list_of_strings*1000
# Time the hand-written linear scan on the larger input.
%time print(find_longest_string(large_list_of_strings))

python
Wall time: 980 µs


In [4]:
large_list_of_strings = list_of_strings*100000000
%time max_length = print(max(large_list_of_strings, key=len))

python
Wall time: 11.4 s


In [5]:
%%time

# step 1 ("map"): compute the length of every string, then pair each string
# with its length. Note that the name is reused: it first holds the list of
# lengths and afterwards the zip iterator of (string, length) tuples.
# This cell works on the small list_of_strings, not on the large list.
list_of_string_lens = [len(s) for s in list_of_strings]
list_of_string_lens = zip(list_of_strings, list_of_string_lens)

# step 2 ("reduce"): pick the tuple whose second field (the length) is largest.
# Despite its name, max_len holds the whole (string, length) tuple.
max_len = max(list_of_string_lens, key=lambda t: t[1])
print(max_len)

('python', 6)
Wall time: 0 ns


In [6]:
# The "map" step of this example: every string is mapped to its length.
mapper = len

def reducer(p, c):
    """Return whichever of two ``(string, length)`` pairs has the greater length.

    Intended as the folding function for ``functools.reduce``: ``p`` is the
    best pair seen so far and ``c`` is the next candidate.

    On a tie the earlier pair ``p`` is kept, so a reduce over the data returns
    the first longest string, the same tie-breaking as find_longest_string
    and the built-in ``max``.

    Args:
        p: The accumulated pair; index 1 is compared.
        c: The candidate pair; index 1 is compared.

    Returns:
        ``c`` if its second field is strictly greater than that of ``p``,
        otherwise ``p``.
    """
    if c[1] > p[1]:
        return c
    return p

In [7]:
from functools import reduce

In [8]:
%%time

# Step 1: preprocess (map) the data with the configured mapper
# (in this example the built-in len function).

mapped = map(mapper, list_of_strings)
mapped = zip(list_of_strings, mapped)

# Step 2: run the actual task on the preprocessed data (reduce).
#         (In this example: finding the maximum value.)

reduced = reduce(reducer, mapped)
print(reduced)

('python', 6)
Wall time: 0 ns


## Exkurs: Wie genau funktionieren Map und Reduce?

In [9]:
#Wie genau funktioniert map?
#map??

# An arbitrary example function: upper-case whatever is passed in.
def to_upper_case(s):
    """Convert ``s`` to its string form and return it in upper case."""
    text = str(s)
    return text.upper()

# Helper that makes visible what an iterator such as the one from map yields.
def print_iterator(it):
    """Print every item of ``it`` on one line, each followed by a space.

    The line is terminated with a newline once the iterator is exhausted.
    """
    for item in it:
        print(item, end=' ')
    # Finish the line.
    print()

In [10]:
# How exactly does map work? (part 2)

# A string is iterable, so map applies to_upper_case to each character.
map_iterator = map(to_upper_case, 'abc')
# map returns an iterator object of type map, not a list.
print(type(map_iterator))
print_iterator(map_iterator)

<class 'map'>
A B C 


In [11]:
# How exactly does map work? (part 3)

# The elements need not be strings: to_upper_case converts each one with
# str() before upper-casing it.
map_iterator = map(to_upper_case, (1, 'a', 'abc'))
print_iterator(map_iterator)

1 A ABC 


In [12]:
# How exactly does reduce work?

liste = [ 1 , 3, 5, 6, 2, ]
# The label is printed without a trailing newline so the sum follows on the same line.
print ("Die Summe aller Elemente ist: ", end="") 
# reduce folds the list from the left with the lambda: ((((1+3)+5)+6)+2) = 17.
print (reduce(lambda a,b : a+b, liste)) 

Die Summe aller Elemente ist: 17


## Exkurs endet hier

In [13]:
# Split a list of objects into n parts of (nearly) equal size.
def chunks(l, n):
    """Split ``l`` into at most ``n`` consecutive, non-empty chunks.

    The chunk sizes differ by at most one element; when ``len(l)`` is not
    divisible by ``n`` the leading chunks receive the extra elements. If
    ``l`` has fewer than ``n`` elements, one chunk per element is returned,
    so no chunk is ever empty.

    Args:
        l: A sequence supporting ``len`` and slicing.
        n: Requested number of chunks; values below 1 are treated as 1.

    Returns:
        A list of slices of ``l`` that concatenate back to ``l``. An empty
        ``l`` yields an empty list.
    """
    n = max(1, int(n))
    total = len(l)
    # Integer arithmetic only: base size per chunk plus how many chunks
    # must take one extra element to cover the remainder.
    base, extra = divmod(total, n)
    result = []
    start = 0
    # min() avoids producing empty chunks when there are fewer items than n.
    for index in range(min(n, total)):
        size = base + (1 if index < extra else 0)
        result.append(l[start:start + size])
        start += size
    return result

In [14]:
# Split the small sample list into 3 chunks.
data_chunks = chunks(list_of_strings, 3)

In [15]:
# Show the current value of data_chunks as the cell output.
data_chunks

[['abc'], ['python'], ['dima']]

In [16]:
# Ask for 30 chunks. The sample list only has three strings and chunks()
# never makes a chunk smaller than one element, so this yields three
# one-element chunks.
data_chunks = chunks(list_of_strings, 30)

# step 1: map + reduce every chunk independently, collecting one
# (string, length) winner per chunk.
reduced_all = []
for chunk in data_chunks:
    mapped_chunk = map(mapper, chunk)
    mapped_chunk = zip(chunk, mapped_chunk)
    
    reduced_chunk = reduce(reducer, mapped_chunk)
    reduced_all.append(reduced_chunk)
    
# step 2: reduce the per-chunk winners to the overall result.
reduced = reduce(reducer, reduced_all)
print(reduced)

('python', 6)


In [17]:
def chunks_mapper(chunk):
    """Run the map and the reduce step on a single chunk.

    Every element of ``chunk`` is paired with the value ``mapper`` computes
    for it, and the resulting pairs are folded with ``reducer``.

    Args:
        chunk: A re-iterable collection of items (it is iterated twice).

    Returns:
        The single ``(item, mapped_value)`` pair selected by ``reducer``.

    Raises:
        TypeError: If ``chunk`` is empty, because ``reduce`` is called
            without an initial value.
    """
    pairs = zip(chunk, map(mapper, chunk))
    return reduce(reducer, pairs)

In [18]:
# Request 30 chunks of the large list for the chunked map/reduce run.
data_chunks = chunks(large_list_of_strings, 30)

In [19]:
%%time

# step 1: apply chunks_mapper to every chunk; each call yields that chunk's
# (string, length) winner. The built-in map is lazy and processes the chunks
# one after another, so the actual work happens while step 2 consumes it.
mapped = map(chunks_mapper, data_chunks)

# step 2: reduce the per-chunk winners to the overall result.
reduced = reduce(reducer, mapped)
print(reduced)

('python', 6)
Wall time: 56 s


<img src='mapreduce.png'>