# Guided Project: Analyzing Wikipedia Pages

## 1. Introducing Wikipedia Data


In [1]:
import os

# 1. list all of the files in the wiki folder
file_names = os.listdir("wiki")
for file in file_names:
    print(file)


Bay_of_ConcepciC3B3n.html
Bye_My_Boy.html
Valentin_Yanin.html
Kings_XI_Punjab_in_2014.html
William_Harvey_Lillard.html
Radial_Road_3.html
George_Weldrick.html
Zgornji_Otok.html
Blue_Heelers_(season_8).html
Taggen_Nunatak.html
Henri_BraqueniC3A9.html
Vrila.html
William_Henry_Porter.html
Clive_Brown_(footballer).html
Blick_nach_Rechts.html
Central_District_(Rezvanshahr_County).html
Alexios_Aspietes.html
Mei_Lanfang.html
Wangeroogeclass_tug.html
Dowell_Philip_O27Reilly.html
Coalville_Town_railway_station.html
Gennady_Lesun.html
Bartrum_Glacier.html
Victor_S._Mamatey.html
Gottfried_Keller.html
Table_Point_Formation.html
Nobuhiko_Ushiba.html
Master_of_Space_and_Time.html
Early_medieval_states_in_Kazakhstan.html
Eressa_aperiens.html
Myrtle_(sternwheeler).html
Abanycha_bicolor.html
JeecyVea.html
Aubrey_Fair.html
Ingrid_GuimarC3A3es.html
Urban_chicken.html
Elgin_National_Watch_Company.html
AlMidan.html
Antae_temple.html
Metis_Institute_of_Polytechnic.html
Sverre_Solberg.html
John_Reid_(British

In [2]:
# 2. Count and display the number of files in the wiki folder
num_file = len(file_names)
print(num_file)

999


In [3]:
#3. Read the first file in the wiki folder, and print its contents
with open(os.path.join("wiki",file_names[0])) as f:
    lines = [line for line in f.readlines()]
print(lines)


['<!DOCTYPE html>\n', '<html class="client-nojs" lang="en" dir="ltr">\n', '<head>\n', '<meta charset="UTF-8"/>\n', '<title>Bay of Concepción - Wikipedia</title>\n', '<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n', '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgD

## 2. Adding the MapReduce Framework

* Use map_reduce() function we developed throughout this courese


In [4]:
# Implement map_reduce function
import math
import functools
from multiprocessing import Pool

def make_chunks(data, num_chunks):
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[i: i+chunk_size] for i in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    chunks = make_chunks(data, num_processes)
    pool = Pool(num_processes)
    chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)

In [5]:
# Count the total number of lines in all files
# orginal solution
total_lines = 0
file_names = os.listdir("wiki")
for file in file_names:
    with open(os.path.join("wiki",file)) as f:
        lines =  f.readlines()
        total_lines += len(lines)
total_lines


499797

In [6]:
# Use MapReduce for same step
# We should use 4 processes to split number of file
# we should also define a mapper & reducer function
# 1. mapper function that return total number of lines within a chunk

def line_mapper(file_chunk):
    num_lines = 0
    for file in file_chunk:
        with open(os.path.join("wiki", file)) as f:
            lines = f.readlines()
            num_lines += len(lines)
    return num_lines

# 2. reducer function that return total lines of two chunks
def line_reducer(num1, num2):
    return num1+num2

# 3. Implement actuall map_reduce function
total_num_lines = map_reduce(file_names, 4, line_mapper, line_reducer)
total_num_lines

499797

## Observation: We can see there are 499797 total lines within all files

## 3. Grep Exact Match

In [43]:
# set up target word first
target = "data"
file_names = os.listdir("wiki")
# 1. set up mapper function for this search condition
def exact_match_mapper (file_chunk):
    occurrence = {}
    for file_name in file_chunk:
        with open(os.path.join("wiki", file_name)) as f:
            line_occurrence = []
            counter = 0
            lines = f.readlines()
            for line in lines:
                if target in line:
                    line_occurrence.append(counter)
                counter +=1
            if len(line_occurrence) > 0 :
                occurrence[file_name] = line_occurrence
    return occurrence

# 2. set up reducer function for this search condition
def exact_match_reducer (match1, match2):
    merged = {}
    merged.update(match1)
    merged.update(match2)
    return merged

# 3.Implement actuall map_reduce function
final_exact_match = map_reduce(file_names, 8, exact_match_mapper, exact_match_reducer)
final_exact_match

{'Bay_of_ConcepciC3B3n.html': [6, 45, 58, 60, 62, 105, 188, 205],
 'Bye_My_Boy.html': [276, 359, 376],
 'Valentin_Yanin.html': [101, 144, 227, 244],
 'Kings_XI_Punjab_in_2014.html': [221,
  229,
  237,
  245,
  253,
  269,
  277,
  293,
  301,
  317,
  325,
  341,
  374,
  376,
  381,
  383,
  388,
  390,
  395,
  397,
  402,
  564,
  647,
  664],
 'William_Harvey_Lillard.html': [45, 65, 81, 129, 212, 229],
 'Radial_Road_3.html': [52, 103, 301, 505, 588, 605],
 'George_Weldrick.html': [194, 277, 294],
 'Zgornji_Otok.html': [6, 53, 55, 65, 69, 211, 260, 262, 311, 394, 411],
 'Blue_Heelers_(season_8).html': [49,
  79,
  82,
  105,
  107,
  125,
  127,
  133,
  135,
  141,
  143,
  660,
  695,
  730,
  739,
  886,
  969,
  986],
 'Taggen_Nunatak.html': [6, 44, 46, 48, 93, 176, 193],
 'Henri_BraqueniC3A9.html': [43, 46, 92, 175, 192],
 'Vrila.html': [6, 57, 59, 69, 73, 99, 100, 102, 151, 234, 251],
 'William_Henry_Porter.html': [48, 88, 171, 188],
 'Clive_Brown_(footballer).html': [146, 22

## Observation: We have a dictionry contains result "data" appears in all HTML file with in Wiki Folder

* I used 8 processes as I have noticed that 4 processes was a bit slow

## 4. Improve search to be case insensitive

In [44]:
# We should only need to change mapper function and leave rest of mapper_reduce the same
# 1. set up mapper function for this search condition
def improved_exact_match_mapper (file_chunk):
    occurrence = {}
    for file_name in file_chunk:
        with open(os.path.join("wiki", file_name)) as f:
            line_occurrence = []
            counter = 0
            lines = f.readlines()
            for line in lines:
                if target in line.lower():
                    line_occurrence.append(counter)
                counter +=1
            if len(line_occurrence) > 0:
                occurrence[file_name] = line_occurrence
    return occurrence

# Implement actuall map_reduce function
final_improved_exact_match = map_reduce(file_names, 6, improved_exact_match_mapper, exact_match_reducer)
final_improved_exact_match

{'Bay_of_ConcepciC3B3n.html': [6, 45, 58, 60, 62, 105, 188, 205],
 'Bye_My_Boy.html': [276, 359, 376],
 'Valentin_Yanin.html': [101, 144, 227, 244],
 'Kings_XI_Punjab_in_2014.html': [221,
  229,
  237,
  245,
  253,
  269,
  277,
  293,
  301,
  317,
  325,
  341,
  374,
  376,
  381,
  383,
  388,
  390,
  395,
  397,
  402,
  564,
  647,
  664],
 'William_Harvey_Lillard.html': [45, 65, 81, 129, 212, 229],
 'Radial_Road_3.html': [52, 103, 301, 505, 588, 605],
 'George_Weldrick.html': [194, 277, 294],
 'Zgornji_Otok.html': [6, 53, 55, 65, 69, 211, 260, 262, 311, 394, 411],
 'Blue_Heelers_(season_8).html': [49,
  79,
  82,
  105,
  107,
  125,
  127,
  133,
  135,
  141,
  143,
  660,
  695,
  730,
  739,
  886,
  969,
  986],
 'Taggen_Nunatak.html': [6, 44, 46, 48, 93, 176, 193],
 'Henri_BraqueniC3A9.html': [43, 46, 92, 175, 192],
 'Vrila.html': [6, 57, 59, 69, 73, 99, 100, 102, 151, 234, 251],
 'William_Henry_Porter.html': [48, 88, 171, 188],
 'Clive_Brown_(footballer).html': [146, 22

## 5. Checking the implementation

Let's see if we picked up more occurrence.

In [16]:
# Find out exactly new match
diff = {}
for key in final_improved_exact_match:
    temp = []
    if key not in final_exact_match:
        diff[key] = final_improved_exact_match[key]
    else:
        for c in final_improved_exact_match[key]:
            if c not in final_exact_match[key]:
                temp.append(c)
        if len(temp) != 0:
            diff[key] = temp
diff

{'Table_Point_Formation.html': [80],
 'Ingrid_GuimarC3A3es.html': [173],
 'Jules_Verne_ATV.html': [918, 1169],
 'Pictogram.html': [397],
 'Claire_Danes.html': [818, 820],
 'PTPRS.html': [58],
 'A_Beautiful_Valley.html': [177],
 'Mudramothiram.html': [196],
 'Gordon_Bau.html': [131, 148],
 'Embraer_Unidade_GaviC3A3o_Peixoto_Airport.html': [131],
 'Code_page_1023.html': [142, 533, 1315],
 'Cryptographic_primitive.html': [81],
 'Alex_Kurtzman.html': [338],
 'Filip_Pyrochta.html': [88],
 'Morgana_King.html': [587],
 'Don_Parsons_(ice_hockey).html': [168],
 'Bias.html': [971],
 'Tomohiko_ItC58D_(director).html': [107, 108],
 'Imperial_Venus_(film).html': [130],
 'Camp_Nelson_Confederate_Cemetery.html': [145],
 'Benny_Lee.html': [91],
 'Kul_Gul.html': [104],
 'Medicago_murex.html': [107],
 'Oldfield_Baby_Great_Lakes.html': [117],
 'Wilson_Global_Explorer.html': [120],
 'Craig_Chester.html': [278],
 'Derek_Acorah.html': [141],
 'Jack_Goes_Home.html': [166],
 'Morning_Glory_(2010_film).html': 

In [29]:
# Display file_name and number of new matches
for item in diff:
    print(item + ':--' + str(len(diff[item])))

Table_Point_Formation.html:--1
Ingrid_GuimarC3A3es.html:--1
Jules_Verne_ATV.html:--2
Pictogram.html:--1
Claire_Danes.html:--2
PTPRS.html:--1
A_Beautiful_Valley.html:--1
Mudramothiram.html:--1
Gordon_Bau.html:--2
Embraer_Unidade_GaviC3A3o_Peixoto_Airport.html:--1
Code_page_1023.html:--3
Cryptographic_primitive.html:--1
Alex_Kurtzman.html:--1
Filip_Pyrochta.html:--1
Morgana_King.html:--1
Don_Parsons_(ice_hockey).html:--1
Bias.html:--1
Tomohiko_ItC58D_(director).html:--2
Imperial_Venus_(film).html:--1
Camp_Nelson_Confederate_Cemetery.html:--1
Benny_Lee.html:--1
Kul_Gul.html:--1
Medicago_murex.html:--1
Oldfield_Baby_Great_Lakes.html:--1
Wilson_Global_Explorer.html:--1
Craig_Chester.html:--1
Derek_Acorah.html:--1
Jack_Goes_Home.html:--1
Morning_Glory_(2010_film).html:--1
Tim_Spencer_(singer).html:--1
Lower_Blackburn_Grade_Bridge.html:--1
1953E2809354_FA_Cup_qualifying_rounds.html:--1
Sol_Eclipse.html:--1
Jonathan_A._Goldstein.html:--1
83_(number).html:--1
Devil_on_Horseback.html:--1
Harry_H

## Observation: We have picked up more occurrence

1. New file(key)

2. Same file(key), new location

## 6. Finding Match Positions on Lines

* I don't think this change in requirement will need to modify reducer function.

* Only need to update Mapper Function to return indexs within lines

* Let's add a context around the match


In [54]:
# We should only need to change mapper function and leave rest of mapper_reduce the same
# 1. set up mapper function for this search condition
target = d
def pairs_exact_match_mapper (file_chunk):
    occurrence = {}
    for file_name in file_chunk:
        with open(os.path.join("wiki", file_name)) as f:
            line_occurrence = []
            counter = 0
            lines = f.readlines()
            for line in lines:
                if target in line.lower():
                    start_index = line.lower().find(target)
                    while start_index != -1:
                        context = line[max(start_index-25,0) : min(start_index +25, len(line))]
                        line_occurrence.append((counter, start_index, context))
                        start_index = line.lower().find(target,start_index + 1)
                counter +=1
            if len(line_occurrence) > 0:
                occurrence[file_name] = line_occurrence
    return occurrence

# Implement actuall map_reduce function
final_pairs_exact_match = map_reduce(file_names, 6, pairs_exact_match_mapper, exact_match_reducer)
final_pairs_exact_match

{'Valentin_Yanin.html': [(6,
   840,
   's of the USSR Academy of Sciences","Full Members o'),
  (6, 890, 'f the Russian Academy of Sciences","Demidov Prize '),
  (66, 90, '"/wiki/Soviet_Academy_of_Sciences" class="mw-redir'),
  (66, 145, 'title="Soviet Academy of Sciences">Soviet Academy '),
  (66, 173, 'ences">Soviet Academy of Sciences</a>; he became a'),
  (144, 1440, 's_of_the_USSR_Academy_of_Sciences" title="Category'),
  (144, 1502, 's of the USSR Academy of Sciences">Full Members of'),
  (144, 1548, 's of the USSR Academy of Sciences</a></li><li><a h'),
  (144, 1632, 'f_the_Russian_Academy_of_Sciences" title="Category'),
  (144, 1697, 'f the Russian Academy of Sciences">Full Members of'),
  (144, 1746, 'f the Russian Academy of Sciences</a></li><li><a h')],
 'William_Harvey_Lillard.html': [(80,
   166,
   'lmer, D.D. (1910) <i>The Science, Art and Philosop')],
 'Victor_S._Mamatey.html': [(48,
   682,
   'klin_College_of_Arts_and_Sciences" title="Franklin'),
  (48, 728, 'klin Co

## Observation: We were able to modifry our mapper function to return the correct form for result.

## 7. Displaying the Results

We should consider four columns:

1. File (given in our dict as key) --Given

2. Line (first portion of our tuple) --Given

3. Index (2nd portion of our tuple) -- Given

4. Context (We should consider display +- 5 words from index) -- Need to find

In [55]:
# Update the Dictionary with new target word = "science"
target = "science"
final_pairs_exact_match = map_reduce(file_names, 6, pairs_exact_match_mapper, exact_match_reducer)
final_pairs_exact_match

{'Valentin_Yanin.html': [(6,
   840,
   's of the USSR Academy of Sciences","Full Members o'),
  (6, 890, 'f the Russian Academy of Sciences","Demidov Prize '),
  (66, 90, '"/wiki/Soviet_Academy_of_Sciences" class="mw-redir'),
  (66, 145, 'title="Soviet Academy of Sciences">Soviet Academy '),
  (66, 173, 'ences">Soviet Academy of Sciences</a>; he became a'),
  (144, 1440, 's_of_the_USSR_Academy_of_Sciences" title="Category'),
  (144, 1502, 's of the USSR Academy of Sciences">Full Members of'),
  (144, 1548, 's of the USSR Academy of Sciences</a></li><li><a h'),
  (144, 1632, 'f_the_Russian_Academy_of_Sciences" title="Category'),
  (144, 1697, 'f the Russian Academy of Sciences">Full Members of'),
  (144, 1746, 'f the Russian Academy of Sciences</a></li><li><a h')],
 'William_Harvey_Lillard.html': [(80,
   166,
   'lmer, D.D. (1910) <i>The Science, Art and Philosop')],
 'Victor_S._Mamatey.html': [(48,
   682,
   'klin_College_of_Arts_and_Sciences" title="Franklin'),
  (48, 728, 'klin Co

In [56]:
import csv

with open('wiki_page_grep.csv', mode='w') as csv_file:
    fieldnames = ['File', 'Line', 'Index', 'Context']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    for key in final_pairs_exact_match:
        for item in final_pairs_exact_match[key]:
            writer.writerow({'File':key, 'Line': item[0],  'Index': item[1], 'Context': item[2]})


In [57]:
# let's display the result
import pandas as pd

result = pd.read_csv('wiki_page_grep.csv')
result

Unnamed: 0,File,Line,Index,Context
0,Valentin_Yanin.html,6,840,"s of the USSR Academy of Sciences"",""Full Membe..."
1,Valentin_Yanin.html,6,890,"f the Russian Academy of Sciences"",""Demidov Pr..."
2,Valentin_Yanin.html,66,90,"""/wiki/Soviet_Academy_of_Sciences"" class=""mw-r..."
3,Valentin_Yanin.html,66,145,"title=""Soviet Academy of Sciences"">Soviet Acad..."
4,Valentin_Yanin.html,66,173,"ences"">Soviet Academy of Sciences</a>; he beca..."
...,...,...,...,...
1265,Imperial_amazon.html,201,418,<i>Caribbean Journal of Science</i>. <b>44</b...
1266,Imperial_amazon.html,201,929,tle=Caribbean+Journal+of+Science&amp;rft.pages...
1267,North_Coast_(RTA_Rapid_Transit_station).html,169,31,"href=""/wiki/Great_Lakes_Science_Center"" title..."
1268,North_Coast_(RTA_Rapid_Transit_station).html,169,66,"nter"" title=""Great Lakes Science Center"">Great..."
