In [18]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import requests
import string

from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

In [19]:
# Extract "Article Name" from all "Article Name.html" for a list of article names
file_names = [name[:-5] for name in listdir('../wikiarticles') if isfile(join('../wikiarticles', name))]
try: file_names.remove(".DS_")
except: pass
wikiarticles_list = file_names

In [20]:
def createrow(article):
    '''
    Finds the html file with the title 'article'
    Then tokenizes and cleans up body of article to return bag of words
    Input: name of the article
    Output: single pandas row with bag of words
    '''
    filename = "../wikiarticles/" + article + ".html"
    html_handle = open(filename)
    html_text = html_handle.read()
    soup = BeautifulSoup(html_text, 'lxml')

    #delete all tables within article (adds noise)
    for table in soup.find_all('table', {"id":"mw-content-text"}):
        table.decompose()

    #flatten html and extract text from body
    body_text = soup.find("div", {"id":"mw-content-text"}).text 
    body_lower = body_text.lower()

    #tokenize
    bag_raw = word_tokenize(body_lower)

    # remove english stopwords, punctuation, numbers, and single letters
    stop = stopwords.words('english')
    bag_alpha = [w for w in bag_raw if w.isalpha()]
    bag_no_stop = [w for w in bag_alpha if not w in stop]
    bag_short = [w for w in bag_no_stop if len(w) > 2]
    
    # lemmatize
    bag_lem = [lemmatizer.lemmatize(w) for w in bag_short]

    # make dict with freq
    wordfreq = defaultdict(int)
    for word in bag_lem:
        wordfreq[word] += 1
    freq = pd.DataFrame.from_dict(wordfreq, orient='index')
    freq.drop(freq[freq[0] < 3].index, inplace = True) 
    freq = freq.transpose()
    freq.insert(0,'this_article_unique_name', article)

    return freq

In [54]:
def generate_subdf(start, stop):
    '''
    The function 'create_row' creates a single df row from an html file with tally of words
    We can concat every row to the df before. However, this is computationally very expensive
    Thus, dividing up the load.
    Generate 4 dataframes separately, then concat.
    Inputs: start and stop index of 'wikiarticles_list'
    Output: dataframe
    '''
    wiki_nlp = pd.DataFrame(columns=['this_article_unique_name']) #init empty df

    exec_counter = 0
    for article in wikiarticles_list[start : stop]:
        newrow = createrow(article)
        wiki_nlp = pd.concat([newrow, wiki_nlp], sort=False, ignore_index=True)

        exec_counter += 1
        print(str(exec_counter) + " generating index " + str(start) + " " + str(stop))
        print(wiki_nlp.shape)
        
    
    return wiki_nlp

In [55]:
# separating wikiarticles_list into 4
index1 = len(wikiarticles_list)//4 * 1
index2 = len(wikiarticles_list)//4 * 2
index3 = len(wikiarticles_list)//4 * 3

nlp_1 = generate_subdf(0, index1)
nlp_2 = generate_subdf(index1, index2)
nlp_3 = generate_subdf(index2, index3)
nlp_4 = generate_subdf(index3, len(wikiarticles_list))

 7986)
457 generating index 2739 3653
(457, 7989)
458 generating index 2739 3653
(458, 8044)
459 generating index 2739 3653
(459, 8047)
460 generating index 2739 3653
(460, 8060)
461 generating index 2739 3653
(461, 8077)
462 generating index 2739 3653
(462, 8087)
463 generating index 2739 3653
(463, 8092)
464 generating index 2739 3653
(464, 8110)
465 generating index 2739 3653
(465, 8115)
466 generating index 2739 3653
(466, 8121)
467 generating index 2739 3653
(467, 8127)
468 generating index 2739 3653
(468, 8127)
469 generating index 2739 3653
(469, 8181)
470 generating index 2739 3653
(470, 8188)
471 generating index 2739 3653
(471, 8204)
472 generating index 2739 3653
(472, 8212)
473 generating index 2739 3653
(473, 8216)
474 generating index 2739 3653
(474, 8216)
475 generating index 2739 3653
(475, 8219)
476 generating index 2739 3653
(476, 8235)
477 generating index 2739 3653
(477, 8292)
478 generating index 2739 3653
(478, 8295)
479 generating index 2739 3653
(479, 8304)
480 

In [56]:
wiki_nlp = pd.concat([nlp_1, nlp_2, nlp_3, nlp_4], sort=False, ignore_index=True)

In [58]:
wiki_nlp.to_csv(r'wiki_nlp.csv')