# Downloading documents from Wikipedia

In [9]:
import requests
import bs4
import re
import tqdm
from collections import deque
from time import sleep
import random
import pandas as pd
import networkx as nx

In [10]:
def generator(texts):
  while len(texts) < 1500:
    yield


In [11]:
def bfs(link):
    response = requests.get(link)
    parsed = bs4.BeautifulSoup(response.text)
    links = parsed.find_all(
        'a', attrs={'href': re.compile(r'^/wiki')})  # english only
    texts = {link: parsed}
    q = deque()
    for a in links:
        q.append("https://en.wikipedia.org" + a['href'])
    for _ in (pbar := tqdm.tqdm(generator(texts))):
        pbar.set_description(f'{len(texts)} sites already')
        v = q.popleft()
        response = requests.get(v)
        if response.status_code != 200:
            continue
        parsed = bs4.BeautifulSoup(response.text)
        links = parsed.find_all(
            'a', attrs={'href': re.compile(r'^/wiki')})
        texts[v] = parsed
        for a in links:
            q.append("https://en.wikipedia.org" + a['href'])
        sleep(random.random()*3)
    return texts


In [12]:
texts = bfs('https://en.wikipedia.org/wiki/Jazz')


1499 sites already: : 2169it [1:01:08,  1.69s/it]


In [13]:
df = pd.DataFrame([texts]).transpose()
df.to_csv('raw_texts.csv')
df.head()

Unnamed: 0,0
https://en.wikipedia.org/wiki/Jazz,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
https://en.wikipedia.org/wiki/Jazz_(disambiguation),"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
https://en.wikipedia.org/wiki/Blues,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."
https://en.wikipedia.org/wiki/Ragtime,"[html, [\n, [\n, <meta charset=""utf-8""/>, \n, ..."


In [14]:
print(len(set(texts.values())))

1468


In [19]:
contents = ["".join([p.getText() for p in parsed.select('p')]) for parsed in texts.values()]

In [21]:
print(contents[0][:100])



Jazz is a music genre that originated in the African-American communities of New Orleans, Louisiana


In [25]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from string import ascii_letters


In [60]:

wordnet = WordNetLemmatizer()


In [56]:
def custom_stemmer(string):
    lemmatized = wordnet.lemmatize(string)
    words = word_tokenize(lemmatized)
    final_words = [] # can't use set to preserve order
    for word in words:
        if word[0] in ascii_letters:  # heuristic about non-words
            final_words.append(word)
    return [word for word in final_words if word not in stopwords.words('english')]


In [61]:
preprocessed = [custom_stemmer(text) for text in contents]


In [64]:
final = {}

In [72]:
for key, text in zip(texts.keys(), preprocessed):
    final[key] = " ".join(text)

In [77]:
df_final = pd.DataFrame(final, index=[0]).transpose()
df_final.to_csv('preprocessed.csv')
df_final.head()

Unnamed: 0,0
https://en.wikipedia.org/wiki/Jazz,Jazz music genre originated African-American c...
https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi,In circumstances pages may need protected modi...
https://en.wikipedia.org/wiki/Jazz_(disambiguation),Jazz style music subgenres Jazz may also refer
https://en.wikipedia.org/wiki/Blues,Blues music genre musical form originated Deep...
https://en.wikipedia.org/wiki/Ragtime,Ragtime also spelled rag-time rag time musical...


All files available [here](https://drive.google.com/drive/folders/1FkuFF7tCvBj8pTVDtOtFXtfSUOH7a2vw?usp=sharing), as git doesn't support so large files 