## Project: MapReduce implementation

### Introduction
In this project, a MapReduce implementation is built.

It is used to count lines and to find word occurrences within Wikipedia pages (999 pages).

**Used modules:** os; math; multiprocessing; functools; csv; pandas

In [1]:
import os

# Directory that holds the Wikipedia HTML pages.
urls_path = "./wiki/"
# Entry names (not full paths) found in that directory.
file_names = os.listdir(urls_path)

# How many entries were found.
print(len(file_names))

# First entry as returned by os.listdir; the listing order is arbitrary.
print(file_names[0])

999
Bay_of_ConcepciC3B3n.html


### Create needed functions:
1. Split data into chunks
2. MapReduce itself

In [2]:
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    """Split ``data`` into at most ``num_chunks`` consecutive slices.

    Every chunk has ``ceil(len(data) / num_chunks)`` elements except possibly
    the last one, which holds the remainder. Fewer than ``num_chunks`` chunks
    are returned when ``data`` is too short to fill them all.

    Args:
        data: A sliceable sequence (e.g. a list) supporting ``len()``.
        num_chunks: Desired upper bound on the number of chunks; must be >= 1.

    Returns:
        A list of slices of ``data``; an empty list when ``data`` is empty.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        # A zero count would divide by zero and a negative one would silently
        # produce no chunks at all, dropping every element of ``data``.
        raise ValueError("num_chunks must be a positive integer")

    len_data = len(data)
    if len_data == 0:
        # ceil(0 / n) is 0, and range() rejects a step of 0.
        return []

    chunk_size = math.ceil(len_data / num_chunks)
    return [data[i : i + chunk_size] for i in range(0, len_data, chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    """Apply ``mapper`` to chunks of ``data`` in parallel and fold the results.

    ``data`` is split with ``make_chunks`` into at most ``num_processes``
    chunks, each chunk is handed to ``mapper`` through a process pool of
    ``num_processes`` workers, and the per-chunk results are combined
    left-to-right with ``reducer``.

    Args:
        data: Sequence to process.
        num_processes: Pool size, also used as the requested chunk count.
        mapper: One-argument callable receiving a chunk.
        reducer: Two-argument callable combining two mapper results.

    Returns:
        The value produced by reducing all mapper results.
    """
    pieces = make_chunks(data, num_processes)
    with Pool(num_processes) as workers:
        partial_results = workers.map(mapper, pieces)
    combined = functools.reduce(reducer, partial_results)
    return combined

### Create mapper and reducer functions to count the total number of lines in all files.

In [3]:
def map_count_lines(chunk):
    """Return the total number of lines in the files named in ``chunk``.

    Each name is joined onto the module-level ``urls_path`` and the file is
    read as UTF-8.

    Args:
        chunk: Sequence of file names relative to ``urls_path``.

    Returns:
        The summed line count of all files in the chunk (0 for an empty chunk).
    """
    total = 0
    for name in chunk:
        full_path = os.path.join(urls_path, name)
        with open(full_path, encoding="UTF-8") as handle:
            total += len(handle.readlines())
    return total

def reduce_count_lines(n_lines1, n_lines2):
    """Combine two partial line counts by adding them together."""
    combined = n_lines1 + n_lines2
    return combined

In [4]:
# Count the lines of every file in file_names using 4 worker processes.
n_lines = map_reduce(file_names, 4, map_count_lines, reduce_count_lines)
print(n_lines)

499797


### Implement a mapper that finds all *target_word* occurrences in each HTML page and stores, per page name, the line index and the character index of every occurrence in a dictionary.
### Also implement a reducer for these dictionaries and a function that calls map_reduce.

In [25]:
target_word = "data"

def map_word_lines(file_paths):
    """Locate every occurrence of ``target_word`` in the given files.

    Each line is lower-cased before searching, so matching ignores the case of
    the file content; ``target_word`` itself is used as given and therefore
    has to be lower-case to match anything. Overlapping occurrences are
    reported, because the search resumes one character after each hit.

    Args:
        file_paths: Sequence of paths of the files to scan.

    Returns:
        A dict mapping each path that contains at least one match to a list of
        ``(line_index, char_index)`` tuples, both 0-based, in file order.
        Files without matches are absent from the dict.
    """
    locations = {}

    for fn in file_paths:
        # Decode explicitly as UTF-8, like map_count_lines does, so the
        # reported indices do not depend on the platform's default encoding.
        with open(fn, encoding="UTF-8") as file:
            for j, line in enumerate(file):
                line = line.lower()
                iof = line.find(target_word)
                while iof != -1:
                    locations.setdefault(fn, []).append((j, iof))
                    # Step one character forward so overlapping hits are found.
                    iof = line.find(target_word, iof + 1)
    return locations


def reduce_word_lines(loc1, loc2):
    """Merge two ``{file: [(line, index), ...]}`` dictionaries into a new one.

    When the same file appears in both inputs, its match lists are
    concatenated (``loc1`` entries first) instead of one overwriting the
    other, so no occurrence is lost. The inputs and their lists are left
    untouched; the result owns fresh list objects.

    Args:
        loc1: First mapping of file name to list of match positions.
        loc2: Second mapping of file name to list of match positions.

    Returns:
        A new dict containing every key of both inputs.
    """
    merged = {fn: list(hits) for fn, hits in loc1.items()}
    for fn, hits in loc2.items():
        merged.setdefault(fn, []).extend(hits)
    return merged


def map_reduce_grep(path, num_processes):
    """Search every entry of directory ``path`` for the target word.

    Builds the full path of each directory entry and runs ``map_reduce`` over
    them with ``map_word_lines`` as mapper and ``reduce_word_lines`` as
    reducer.

    Args:
        path: Directory whose entries are scanned.
        num_processes: Number of worker processes to use.

    Returns:
        The merged result of ``map_word_lines`` over all entries.
    """
    file_paths = []
    for entry in os.listdir(path):
        file_paths.append(os.path.join(path, entry))
    return map_reduce(file_paths, num_processes, map_word_lines, reduce_word_lines)
    

# Search every page under urls_path for target_word using 4 worker processes.
word_locations = map_reduce_grep(urls_path, 4)
import json  # NOTE(review): json is not used in the cells visible here
# Show the (line index, character index) pairs found in one sample page.
print(word_locations["./wiki/Bay_of_ConcepciC3B3n.html"])

[(6, 422), (45, 628), (45, 650), (58, 447), (58, 692), (60, 18), (62, 568), (62, 590), (105, 40), (105, 748), (105, 789), (105, 814), (188, 1039), (188, 1088), (188, 1132), (205, 125)]


### Write the gathered data to a CSV file.

In [37]:
import csv

import pandas as pd

# NOTE(review): counter is not read anywhere in the visible cells; it is kept
# only so the notebook's global names stay unchanged.
counter = 0

# Number of characters of context kept on each side of a match.
context_step = 30

# Header row followed by one row per occurrence.
rows = [["File", "Line", "Index", "Context"]]

for fn in word_locations:
    # Decode explicitly as UTF-8 so the character indices do not depend on the
    # platform's default encoding.
    with open(fn, encoding="UTF-8") as file:
        lines = file.readlines()

    for line, iof in word_locations[fn]:
        line_len = len(lines[line])
        context_start = max(iof - context_step, 0)
        context_end = min(iof + len(target_word) + context_step, line_len)
        # The slice ends at context_end + 1, so up to context_step + 1
        # characters follow the matched word.
        rows.append([fn, line, iof, lines[line][context_start:context_end + 1]])

# newline='' lets the csv module control line endings (otherwise blank rows
# appear on platforms that translate "\n"); UTF-8 matches what pandas reads
# by default.
with open('result.csv', mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows(rows)

df = pd.read_csv('result.csv')

df.head(5)

Unnamed: 0,File,Line,Index,Context
0,./wiki/Bay_of_ConcepciC3B3n.html,6,422,"egories"":[""Coordinates on Wikidata"",""All stub ..."
1,./wiki/Bay_of_ConcepciC3B3n.html,45,628,"78-sj18-04-quiriquina.jpg 2x"" data-file-width=..."
2,./wiki/Bay_of_ConcepciC3B3n.html,45,650,"jpg 2x"" data-file-width=""960"" data-file-height..."
3,./wiki/Bay_of_ConcepciC3B3n.html,58,447,"aps, aerial photos, and other data for this lo..."
4,./wiki/Bay_of_ConcepciC3B3n.html,58,692,"aps, aerial photos, and other data for this lo..."
