# Dataquest Project: Analyzing Wikipedia Pages

The goal of this project is to create a simplified version of the grep terminal command to search a collection of articles scraped from Wikipedia. To do this, we will use th map-reduce framework we learned about in the lessons. 

## Exploring the Data

In [3]:
import os

In [4]:
file_names = os.listdir("wiki")

In [5]:
len(file_names)

999

In [6]:
folder_name = "wiki"
file_name = "Dragnet_(franchise).html"
with open(os.path.join(folder_name, file_name)) as f:
    file_one = f.readlines()

In [30]:
file_one[0:10]

['<!DOCTYPE html>\n',
 '<html class="client-nojs" lang="en" dir="ltr">\n',
 '<head>\n',
 '<meta charset="UTF-8"/>\n',
 '<title>Dragnet (franchise) - Wikipedia</title>\n',
 '<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n',
 '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Dragnet_(franchise)","wgTitle":"Dragnet (franchise)","wgCurRevisionId":765947026,"wgRevisionId":765947026,"wgArticleId":113356,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Use mdy dates from June 2013","All articles with peacock terms","Articles with peacock terms from September 2015","All articles with unsourced statements","Articles with unsourced statements from September 2015","Articles to be expanded from January 2016","All articles 

## Setting up the Map-Reduce Framework

In [8]:
import math
import functools
from multiprocessing import Pool

In [9]:
def make_chunks(data, num_chunks):
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    chunks = make_chunks(data, num_processes)
    with Pool(num_processes) as pool:
        chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)

## Counting the Total Number of Lines

In [10]:
def line_mapper(file_names):
    total = 0
    for name in file_names:
        with open(os.path.join('wiki', name)) as f:
            total += len(f.readlines())
    return total

In [11]:
def line_reducer(count1, count2):
    return count1 + count2

In [12]:
total_lines = map_reduce(file_names, 4, line_mapper, line_reducer)

In [13]:
total_lines

499797

## Creating a Grep Exact Match Algorithm 

In [14]:
target_string = 'data'

In [15]:
def grep_mapper(file_names):
    return_dict = {}
    for name in file_names:
        with open(os.path.join('wiki', name)) as f:
            f_contents = f.readlines()
        for line in f_contents:
            if target_string in line:
                return_dict[name] = []
                break
        for i in range(len(f_contents)):
            if target_string in f_contents[i]:
                return_dict[name].append(i)
    return return_dict

In [16]:
def grep_reducer(dic1, dic2):
    dic1.update(dic2)
    return dic1

In [17]:
locations_str_data = map_reduce(file_names, 8, grep_mapper, grep_reducer)

## Making the Grep Algorithm Case Insensitive

In [19]:
def grep_mapper_case(file_names):
    return_dict = {}
    for name in file_names:
        with open(os.path.join('wiki', name)) as f:
            f_contents = [line.lower() for line in f.readlines()]
        for line in f_contents:
            if target_string.lower() in line:
                return_dict[name] = []
                break
        for i in range(len(f_contents)):
            if target_string.lower() in f_contents[i]:
                return_dict[name].append(i)
    return return_dict

def grep_reducer_case(dic1, dic2):
    dic1.update(dic2)
    return dic1

In [20]:
data_locations_case_insensitive = map_reduce(file_names, 8, grep_mapper_case, grep_reducer_case)

### Checking that the Case-Insensitive Version Returns More Results

In [22]:
for key in data_locations_case_insensitive:
    if len(data_locations_case_insensitive[key]) > len(locations_str_data[key]):
        print("{}:{}".format(key, len(data_locations_case_insensitive[key]) - len(locations_str_data[key])))

West_Park_Bridge.html:1
Urs_Burkart.html:1
Harry_Hill_Bandholtz.html:1
Cryptographic_primitive.html:1
Saravan_Gilan.html:1
Precorrin6A_reductase.html:2
Gordon_Bau.html:2
Kim_Yonghwa.html:2
Camp_Nelson_Confederate_Cemetery.html:1
Julien_Boisselier.html:1
Lis_LC3B8wert.html:1
Don_Parsons_(ice_hockey).html:1
Ek_Dil_Sau_Afsane.html:1
Exploratorium_(film).html:1
Morgana_King.html:1
Devil_on_Horseback.html:1
A_Beautiful_Valley.html:1
WLSR.html:1
C389cole_des_Mines_de_Douai.html:1
Gulliver_Mickey.html:1
Battle_of_Wattignies.html:1
Tropical_sprue.html:1
C11orf30.html:1
Taylor_Williamson.html:1
List_of_molecular_graphics_systems.html:4
Companys_procC3A9s_a_Catalunya.html:1
Doumanaba.html:1
Bahmanabade_Olya.html:1
SalemAuburn_Streets_Historic_District.html:1
Wilhelm_Wagenfeld_House.html:1
The_Future_(film).html:1
Peter_Collingwood.html:1
Mudramothiram.html:1
Code_page_1023.html:3
Demographics_of_American_Samoa.html:1
Appa_(film).html:1
Oldfield_Baby_Great_Lakes.html:1
Kattukukke.html:1
Maniitsoq

## Adding Match Positions on Each Line to the Algorithm

In [23]:
import re
def grep_mapper_line_positions(file_names):
    return_dict = {}
    for name in file_names:
        with open(os.path.join('wiki', name)) as f:
            f_contents = [line.lower() for line in f.readlines()]
        for line in f_contents:
            if target_string.lower() in line:
                return_dict[name] = []
                break
        for i in range(len(f_contents)):
            start_indexes = [m.start() for m in re.finditer(target_string.lower(), f_contents[i])]
            for s in start_indexes:
                return_dict[name].append((i, s))
    return return_dict

def grep_reducer_line_positions(dic1, dic2):
    dic1.update(dic2)
    return dic1

In [24]:
data_locations_match_positions = map_reduce(file_names, 8, grep_mapper_line_positions, grep_reducer_line_positions)

## Writing the Results to a CSV File

In [26]:
import csv
import pandas as pd

with open("results-data.csv", "w") as f:
    writer = csv.writer(f)
    rows = [["File", "Line", "Index"]]
    for key in data_locations_match_positions:
        for line, index in data_locations_match_positions[key]:
            rows.append([key, line, index])
    writer.writerows(rows)

In [27]:
results = pd.read_csv('results-data.csv')

In [32]:
results.head(20)

Unnamed: 0,File,Line,Index
0,Torovirinae.html,89,18
1,Torovirinae.html,91,348
2,Torovirinae.html,91,370
3,Torovirinae.html,140,40
4,Torovirinae.html,223,995
5,Torovirinae.html,223,1045
6,Torovirinae.html,223,1089
7,Torovirinae.html,248,124
8,West_Park_Bridge.html,6,420
9,West_Park_Bridge.html,64,658
