# Downloading documents from Wikipedia

In [4]:
import requests
import bs4
import re
import tqdm
from collections import deque
from time import sleep
import random
import pandas as pd
import numpy as np

# Seed NumPy's global RNG: BFSScraper.get_unique_n_links picks neighbour links with np.random.shuffle.
np.random.seed(0)

In [5]:
# Load page texts that were already preprocessed and saved to disk.
# NOTE(review): no cell visible in this notebook writes "preprocessed.csv" (CustomStemmer.generate_csv
# writes "processed.csv"), so this file presumably comes from an earlier run — confirm its origin.
df = pd.read_csv("preprocessed.csv")

In [6]:
# Preview the first rows; going by the output, "Unnamed: 0" holds the page URL and "0" the cleaned text.
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,https://en.wikipedia.org/wiki/Jazz,Jazz music genre originated African-American c...
1,https://en.wikipedia.org/wiki/Wikipedia:Protec...,In circumstances pages may need protected modi...
2,https://en.wikipedia.org/wiki/Jazz_(disambigua...,Jazz style music subgenres Jazz may also refer
3,https://en.wikipedia.org/wiki/Blues,Blues music genre musical form originated Deep...
4,https://en.wikipedia.org/wiki/Ragtime,Ragtime also spelled rag-time rag time musical...


In [7]:
# Show the cleaned text stored for a File: page (single boolean-mask .loc lookup; .item() expects exactly one match).
df.loc[df["Unnamed: 0"] == "https://en.wikipedia.org/wiki/File:John_Coltrane_1963.jpg", "0"].item()

'Original file pixels file size MB MIME type image/jpeg http Commons Zero Public Domain Dedicationfalsefalse Click date/time view file appeared time The following wikis use file View global usage file This file contains additional information probably added digital camera scanner used create digitize If file modified original state details may fully reflect modified file'

In [8]:
def generator(texts, limit=1500):
    """Yield ``None`` for as long as ``texts`` holds fewer than ``limit`` items.

    The length of ``texts`` is re-checked before every yield, so the generator
    stops once the caller has grown the collection to ``limit`` items. It is
    meant to drive an open-ended loop (e.g. a progress bar) whose number of
    iterations is not known up front.

    Args:
        texts: Any sized collection that the caller fills while iterating.
        limit: Number of items at which iteration stops (default 1500).

    Yields:
        None on every iteration.
    """
    while len(texts) < limit:
        yield


In [48]:
from collections import OrderedDict
class BFSScraper():
    """Breadth-first crawler that collects the paragraph text of Wikipedia articles.

    Starting from one URL, every scraped page contributes up to ``n_to_visit``
    randomly chosen article links to a FIFO queue, and pages are scraped in
    queue order until the requested number of pages has been collected.

    Attributes:
        n_to_visit: Maximum number of neighbour links taken from each page.
        already_visited: URLs that were scraped successfully.
        q: FIFO queue of URLs waiting to be scraped.
        pages_with_error_response: URL -> HTTP status code (or the repr of the
            request exception) for pages that could not be fetched.
        pages: URL -> {"content", "num_of_links", "selected_links"}, in visiting order.
    """

    # Seconds to wait for a response before giving up on a page.
    REQUEST_TIMEOUT = 10

    def __init__(self, n_to_visit):
        self.n_to_visit = n_to_visit
        self.already_visited = set()
        self.q = deque()
        self.pages_with_error_response = {}
        self.pages = OrderedDict() # We want to reproduce the order of visiting pages

    def get_unique_n_links(self, links):
        """Return at most ``n_to_visit`` distinct, not yet visited URLs picked at random.

        Args:
            links: Sequence of anchor-like objects supporting ``link['href']``,
                where the href is a site-relative path such as ``/wiki/Jazz``.

        Returns:
            List of absolute ``https://en.wikipedia.org`` URLs. It may be shorter
            than ``n_to_visit`` when there are not enough unvisited candidates.
        """
        new_links = []
        chosen = set()  # guards against the same target appearing twice in `links`
        candidates_ids = np.arange(len(links))
        np.random.shuffle(candidates_ids) # To walk randomly
        for candidate_id in candidates_ids:
            # `>=` so that exactly n_to_visit links are returned at most (`>` let n + 1 through).
            if len(new_links) >= self.n_to_visit:
                break
            link = "https://en.wikipedia.org" + links[candidate_id]['href']
            if link not in self.already_visited and link not in chosen:
                chosen.add(link)
                new_links.append(link)

        return new_links

    def find_links(self, parsed_page):
        """Return the article anchors of a parsed page, one per distinct href, in document order.

        Only hrefs of the form ``/wiki/<word characters>`` are kept, so anything
        containing ``:``, ``(`` etc. is skipped, as are targets starting with
        ``File`` or ``Main_Page``.
        """
        anchors = parsed_page.find_all(
            'a', attrs={'href': re.compile(r'^\/wiki\/(?!File)(?!Main_Page)\w*$')})

        # De-duplicate by target instead of by tag: two anchors with different text can point
        # to the same article, and a dict keeps a deterministic (document) order, unlike set().
        unique_by_href = {}
        for anchor in anchors:
            unique_by_href.setdefault(anchor['href'], anchor)

        return list(unique_by_href.values())

    def process_one_link(self, link):
        """Fetch one page, store its text and return the neighbour links selected from it.

        Returns:
            The list of selected neighbour URLs, or None when the page could not
            be fetched (the failure is recorded in ``pages_with_error_response``).
        """
        try:
            response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network failure on one page should not abort the whole crawl.
            self.pages_with_error_response[link] = repr(exc)
            return None
        if response.status_code != 200:
            self.pages_with_error_response[link] = response.status_code
            return None

        # Name the parser explicitly so the result does not depend on which parsers are installed.
        parsed = bs4.BeautifulSoup(response.text, "html.parser")

        # Mark the page before choosing neighbours so that a self-link is never selected.
        self.already_visited.add(link)
        found_links = self.find_links(parsed)
        n_not_visited_links = self.get_unique_n_links(found_links)

        content_root = parsed.find(id="mw-content-text")
        paragraphs = content_root.select('p') if content_root is not None else []
        content = "".join(p.getText() for p in paragraphs)

        self.pages[link] = {
            "content": content,
            "num_of_links": len(found_links),
            "selected_links": n_not_visited_links,
        }
        return n_not_visited_links

    def dummy_generator(self, n):
        """Yield None while fewer than ``n`` pages are collected (drives the progress bar in ``bfs``)."""
        while len(self.pages) < n:
            yield

    def generate_summary(self):
        """Write ``summary.txt``: every visited page, its link count and its selected neighbours."""
        with open("summary.txt" , 'w', encoding="utf-8") as f:
            for link, page in self.pages.items():
                f.write(f"{link} number of reasonable links: {page['num_of_links']}\n")
                f.write("Visited neighbours: \n")
                for neighbour in page["selected_links"]:
                    f.write(f"\t\t{neighbour}\n")
                f.write("\n\n")

    def generate_csv(self):
        """Write the collected pages to ``text.csv``.

        The frame has one column per page URL and one row per field ("content",
        "num_of_links", "selected_links"), so readers that want one row per page
        have to transpose it after loading.
        """
        df = pd.DataFrame(self.pages)
        df.to_csv('text.csv')

    def bfs(self, starting_link, n = 1000):
        """Crawl breadth-first from ``starting_link`` until ``n`` pages are collected.

        The crawl also stops early when the queue runs empty. Between requests it
        sleeps for a random time of up to three seconds.
        """
        self.q.append(starting_link)
        for _ in (pbar := tqdm.tqdm(self.dummy_generator(n))):
            if not self.q:
                # Nothing left to visit: stop instead of raising IndexError on popleft().
                break
            link_to_scrap = self.q.popleft()
            if link_to_scrap in self.already_visited or link_to_scrap in self.pages_with_error_response:
                # The same URL can be enqueued by several pages; request it only once.
                continue
            links_to_visit = self.process_one_link(link_to_scrap)
            if links_to_visit is not None: # Successful scraping of this particular page, neighbours gathered
                pbar.set_description(f'{len(self.pages)} sites already collected')
                self.q.extend(links_to_visit)

            sleep(random.random()*3)


In [50]:
# Small demo crawl: start at the Jazz article, take up to n_to_visit neighbours per page,
# and stop once 2 pages have been collected.
bfs_scrap = BFSScraper(n_to_visit=3)
bfs_scrap.bfs('https://en.wikipedia.org/wiki/Jazz', 2)

2 sites already collected: : 2it [00:02,  1.18s/it]


In [51]:
# Write summary.txt: each visited page with its link count and selected neighbours.
bfs_scrap.generate_summary()

In [52]:
# Write the scraped pages to text.csv (one column per page URL).
bfs_scrap.generate_csv()

In [135]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize


In [198]:
# Stand-alone stemmer / lemmatizer instances for ad-hoc experiments.
# CustomStemmer below builds its own instances from the classes it is given and does not use these.
porter = PorterStemmer()
lancaster = LancasterStemmer()
wordnet = WordNetLemmatizer()

class CustomStemmer():
    """Clean a corpus of page texts: tokenize, keep alphabetic tokens, drop stopwords, then stem or lemmatize.

    Args:
        data: DataFrame with a ``content`` column; its index labels become the
            keys of the processed corpus.
        stemmer: Stemmer class (not instance) exposing ``stem(word)``.
        tokenizer: Callable mapping a string to a list of tokens.
        lemmatizer: Lemmatizer class (not instance) exposing ``lemmatize(word)``.
        custom_stopwords: Iterable of words to drop, matched case-insensitively.
            Defaults to NLTK's English stopword list, loaded when the instance is created.
    """

    def __init__(self, data, stemmer=LancasterStemmer, tokenizer=word_tokenize, lemmatizer=WordNetLemmatizer, custom_stopwords=None):
        if custom_stopwords is None:
            # Resolved per instance rather than as a default argument, which would be
            # evaluated once at definition time and shared by every instance.
            custom_stopwords = stopwords.words('english')
        self.data = data
        self.stemmer = stemmer()
        self.tokenizer = tokenizer
        self.lemmatizer = lemmatizer()
        # Lower-cased set: constant-time lookups and case-insensitive matching.
        self.custom_stopwords = {word.lower() for word in custom_stopwords}
        self.processed = {}

    def process_text(self, text, stem = False, lemmatize = True):
        """Return the cleaned version of ``text`` as a single space-separated string.

        Tokens that are not purely ASCII letters and tokens that are stopwords
        (ignoring case) are dropped; token order is preserved. The remaining
        tokens are then stemmed when ``stem`` is true, otherwise lemmatized when
        ``lemmatize`` is true, otherwise left as they are.

        A non-string ``text`` (e.g. the NaN pandas produces for an empty CSV
        cell) yields an empty string.
        """
        if not isinstance(text, str):
            return ''

        kept = [
            word for word in self.tokenizer(text)
            if re.fullmatch(r'[A-Za-z]+', word) and word.lower() not in self.custom_stopwords
        ]

        # Stemmers and lemmatizers work on single words, so apply them token by token;
        # passing the whole joined text treats it as one long "word".
        if stem:
            kept = [self.stemmer.stem(word) for word in kept]
        elif lemmatize:
            kept = [self.lemmatizer.lemmatize(word) for word in kept]

        return ' '.join(kept)

    def process_corpus(self):
        """Process the ``content`` of every row of ``data``; return and store {index label: cleaned text}."""
        self.processed = {index: self.process_text(element['content']) for index, element in self.data.iterrows()}
        return self.processed

    def generate_csv(self):
        """Write the processed corpus to ``processed.csv`` with a single ``text`` column, indexed by row label."""
        df = pd.Series(self.processed, name='text', dtype=object).to_frame()
        df.to_csv('processed.csv', columns=['text'])


In [199]:
# BFSScraper.generate_csv writes one column per page, so transpose to get one row per page
# with the page URL as the index.
texts = pd.read_csv('text.csv', index_col=0).T
texts.head()

Unnamed: 0,content,num_of_links,selected_links
https://en.wikipedia.org/wiki/Jazz,\nJazz is a music genre that originated in the...,863,"['https://en.wikipedia.org/wiki/Danish_jazz', ..."
https://en.wikipedia.org/wiki/Danish_jazz,Danish jazz dates back to 1923 when Valdemar E...,284,"['https://en.wikipedia.org/wiki/Big_band', 'ht..."


In [200]:
# Build the preprocessor and clean every page's "content".
# Note: process_corpus calls process_text with its defaults (stem=False, lemmatize=True), so the
# lemmatizer branch runs and the PorterStemmer passed here is instantiated but not applied.
stemmer = CustomStemmer(texts, PorterStemmer, word_tokenize)
preprocessed = stemmer.process_corpus()

In [201]:
# Write the cleaned texts to processed.csv (a single "text" column, indexed by page URL).
stemmer.generate_csv()

All files are available [here](https://drive.google.com/drive/folders/1FkuFF7tCvBj8pTVDtOtFXtfSUOH7a2vw?usp=sharing), as git doesn't support such large files.