In [3]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json
import pymongo

from nltk.corpus import stopwords
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS
nlp = English()
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [4]:
client = pymongo.MongoClient('localhost', 27017)

wiki_db = client.wikipedia

wiki_col = wiki_db.my_collection

## Create functions to get data from Wiki API

In [5]:
def category_request(category):
    """
    Scrape a category page from Wikipedia API.
    
    Params:
    --------
    category: str
        The name of the category to be scraped.
        
    Returns:
    --------
    DataFrame
        Pandas DataFrame containing categories 
        
    """
    my_params = {
        'action':'query',
        'format':'json',
        'list':'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    page = requests.get('http://en.wikipedia.org/w/api.php', params=my_params)
    return pd.DataFrame(page.json()['query']['categorymembers'])


In [6]:
def get_content(title):
    """
    Scrape a page from Wikipedia API to get the content.
    
    Params:
    --------
    title: str
        The name of the page to be scraped.
        
    Returns:
    --------
    List of the contents of the page
        
    """
    my_params = {
        'action':'query',
        'format':'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    content = requests.get('http://en.wikipedia.org/w/api.php', params=my_params)
    return list(content.json()['query']['pages'].values())[0]['revisions'][0]['*']


In [7]:
def get_cats_and_pages(category):
    """
    Returns the pages and subcategories of a category
    
    Params
    ------
    category : str
        Name of a category
    
    Returns
    -------
    children: list 
        list of sub categories 
    pages: list
        list of pages on the category
    
    page_id = list of page_ids for each page
        
    """
    cats = pd.DataFrame(category_request(category))
    cats['title'] = cats.title.astype(str) 
    #returns a boolean mask of all titles with 'category' in the str
    subs_mask = cats['title'].str.contains('Category:')
    
    #creates list of new sub catagories
    children = list(cats['title'][subs_mask].str.replace('Category:', ""))
    pages = list(cats['title'][~cats.title.str.contains('Category:')])
    page_id = list(cats['pageid'][~cats.title.str.contains('Category:')])
    return page_id, pages, children

#sub_categories, pages

In [8]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]',' ',text)
    text = re.sub(u'<.*>','',text)
    text = re.sub(u'[^a-z\s]',' ',text)
    text = re.sub("\\s+", " ", text)
    text = nlp(text)
    text = [str(i.lemma_) for i in text if str(i.orth_) not in stop]
    text = ' '.join(text)

    return text

In [None]:

def wiki_traverse(main_cat, category, max_depth=-1):
    """ 
    
    Params
    ------
    category : str
        Name of a category
    
    Returns
    -------
    Does not return anything, function automatically feeds dictionaries of category, articles/
    and content into Mongo database. 
        
    """
    
    if max_depth != 0:

        page_id, pages, children = get_cats_and_pages(category)
        
        for index, article in enumerate(pages):    
            article_dict = {}
            article_dict['main_cat'] = main_cat
            article_dict['sub_cat'] = category
            article_dict['article'] = article
            article_dict['page_id'] = str(page_id[index])  
            article_dict['content'] = cleaner(get_content(article))
            
            #this line adds each article onto mongo database as each article is being called 
            wiki_col.insert_one(article_dict)
            
        for child in children:
            wiki_traverse(main_cat, child, max_depth-1)
            

### Chose to only collect a max depth of 3 for the categories because subcategories beyond depth 4 did not seem relevant to the topic.

In [10]:
wiki_traverse('Business software', 'Business software', max_depth=3)

In [11]:
wiki_traverse('Machine learning', 'Machine learning', max_depth=3)           


In [12]:
wiki_col.count()

5785