In [72]:
import pandas as pd
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import requests
import string

from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [78]:
# Extract "Article Name" from all "Article Name.html" for a list of article names
file_names = [name[:-5] for name in listdir('../wikiarticles30') if isfile(join('../wikiarticles30', name))]
try: file_names.remove(".DS_")
except: pass
wikiarticles30_list = file_names
# turn list into df
# wikiarticles30 = pd.DataFrame(file_names,columns=['article_name'])
# wikiarticles = wikiarticles.set_index('article_name') #set article name as index

In [79]:
def createrow(article):
    '''
    Finds the html file with the title 'article'
    Then tokenizes and cleans up body of article to return bag of words
    Input: name of the article
    Output: single pandas row with bag of words
    '''
    filename = "../wikiarticles30/" + article + ".html"
    html_handle = open(filename)
    html_text = html_handle.read()
    soup = BeautifulSoup(html_text, 'lxml')

    #delete all tables within article (adds noise)
    for table in soup.find_all('table', {"id":"mw-content-text"}):
        table.decompose()

    #flatten html and extract text from body
    body_text = soup.find("div", {"id":"mw-content-text"}).text 
    body_lower = body_text.lower()

    #tokenize
    bag_raw = word_tokenize(body_lower)

    # remove english stopwords, punctuation, numbers, and single letters
    stop = stopwords.words('english')
    bag_alpha = [w for w in bag_raw if w.isalpha()]
    bag_no_stop = [w for w in bag_alpha if not w in stop]
    bag_clean = [w for w in bag_no_stop if len(w) > 2]

    # make dict with freq
    wordfreq = defaultdict(int)
    for word in bag_clean:
        wordfreq[word] += 1
    freq = pd.DataFrame.from_dict(wordfreq, orient='index')
    freq.drop(freq[freq[0] < 3].index, inplace = True) 
    freq = freq.transpose()
    freq.insert(0,'this_article_unique_name', article)

    return freq

In [100]:
wiki_nlp = pd.DataFrame(columns=['this_article_unique_name']) #init empty df
for article in wikiarticles30_list:
    newrow = createrow(article)
    wiki_nlp = pd.concat([newrow, wiki_nlp], sort=False, ignore_index=True)
wiki_nlp.fillna(0)

Unnamed: 0,this_article_unique_name,article,multiple,please,improve,learn,remove,template,references,message,...,individual,person,cultural,european,change,celtic,brimscombe,port,macdougall,wdh
0,Albumin transport function analysis by EPR spe...,4.0,8.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Akito Tachibana,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alan Longo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Akiva Librecht,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Albano Cathedral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Akiaga Station,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Akletos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Al-Arabi (magazine),7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Aksel Berget Skjølsvik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Albanyà,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
