In [4]:
# Install third-party packages and download the NLTK / spaCy language data.
# NOTE(review): none of the installs is version-pinned, so re-running this cell
# later may pull different releases than the ones the notebook was written with.
!python -m pip install pymongo
!pip install --quiet nltk
# NOTE(review): bs4 is not imported in the cells visible here - presumably it
# is used elsewhere; confirm before keeping this install.
!pip install --quiet bs4
!pip install --quiet spacy
# Downloads the complete NLTK data collection. The cells visible here only read
# the 'english' stopword list - TODO confirm whether the rest is needed.
!python -m nltk.downloader all
# Downloads the English spaCy model. NOTE(review): the visible cells only
# instantiate `English()` directly - confirm the model download is required.
!python -m spacy download en

Collecting pymongo
  Downloading https://files.pythonhosted.org/packages/db/5a/77060da2196471c8c47eeed6526029bd35cb2f10b1e4fc0e5e5234ca1aa0/pymongo-3.6.1-cp27-cp27mu-manylinux1_x86_64.whl (381kB)
[K    100% |████████████████████████████████| 389kB 2.4MB/s ta 0:00:011
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.6.1
[33mYou are using pip version 9.0.1, however version 10.0.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading collection u'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/ith/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /home/ith/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Down

[nltk_data]    |   Package semcor is already up-to-date!
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package senseval is already up-to-date!
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package sentiwordnet is already up-to-date!
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package sentence_polarity is already up-to-date!
[nltk_data]    | Downloading package shakespeare to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package shakespeare is already up-to-date!
[nltk_data]    | Downloading package sinica_treebank to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package sinica_treebank is already up-to-date!
[nltk_data]    | Downloading package smultron to
[nltk_data]    |     /home/ith/nltk_data...
[nltk_data]    |   Package smultron is alrea

In [2]:
import re
import requests
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import pymongo
import json
import pymongo
import sklearn

from nltk.corpus import stopwords
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS
# spaCy English language object; `cleaner` calls it to split text into tokens.
nlp = English()
# NLTK English stopword list as a set; `cleaner` drops tokens found in it.
stop = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [3]:
# Open a client for the remote MongoDB server, then select the database and
# the collection that `wiki_traverse` inserts scraped articles into.
client = pymongo.MongoClient(host='35.174.105.75', port=27016)
wiki_db = client['wikipedia']
wiki_col = wiki_db['my_collection']

# Create functions to get data from Wiki API

In [4]:
def category_request(category):
    """
    Fetch every member of a Wikipedia category through the MediaWiki API.

    A single response holds at most `cmlimit` members, so the request is
    repeated with the `continue` tokens of each response until the API stops
    returning them. Without this, large categories are silently truncated.

    Params:
    --------
    category: str
        The name of the category to be scraped, without the 'Category:' prefix.

    Returns:
    --------
    DataFrame
        One row per category member (pages and sub-categories alike), built
        from the 'categorymembers' records of the responses. An empty
        category yields an empty frame that still has the 'pageid', 'ns' and
        'title' columns, so callers can index those columns safely.

    Raises:
    -------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If a response has no 'query' / 'categorymembers' payload.

    """
    my_params = {
        'action':'query',
        'format':'json',
        'list':'categorymembers',
        'cmtitle': 'Category:{}'.format(category),
        'cmlimit': 'max'
        }
    members = []
    while True:
        page = requests.get('https://en.wikipedia.org/w/api.php', params=my_params)
        # Fail on HTTP errors here instead of on a confusing JSON/KeyError later.
        page.raise_for_status()
        payload = page.json()
        members.extend(payload['query']['categorymembers'])
        if 'continue' not in payload:
            break
        # Feed the continuation tokens back to request the next batch.
        my_params.update(payload['continue'])

    if not members:
        return pd.DataFrame(columns=['pageid', 'ns', 'title'])
    return pd.DataFrame(members)

In [5]:
def get_content(title):
    """
    Fetch the revision content of a Wikipedia page through the MediaWiki API.

    Params:
    --------
    title: str
        The name of the page to be scraped.

    Returns:
    --------
    The content of the first revision entry returned for the page (the raw
    page text, which `wiki_traverse` passes on to `cleaner`).

    Raises:
    -------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response holds no revisions for `title` (e.g. the page does
        not exist).

    """
    my_params = {
        'action':'query',
        'format':'json',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'content'
    }
    content = requests.get('https://en.wikipedia.org/w/api.php', params=my_params)
    # Fail on HTTP errors here instead of on a confusing JSON/KeyError later.
    content.raise_for_status()
    # 'pages' is keyed by page id; a single title was requested, so take the
    # only entry.
    page = list(content.json()['query']['pages'].values())[0]
    if 'revisions' not in page:
        raise KeyError('no revisions returned for page {!r}'.format(title))
    return page['revisions'][0]['*']

In [6]:
def get_cats_and_pages(category):
    """
    Split the members of a category into article pages and sub-categories.

    Params
    ------
    category : str
        Name of a category

    Returns
    -------
    page_id : list
        page ids of the pages, aligned index-for-index with `pages`
    pages : list
        titles of the pages in the category
    children : list
        titles of the sub-categories, with the 'Category:' prefix removed

    All three lists are empty when the category has no members.

    """
    cats = pd.DataFrame(category_request(category))
    if cats.empty:
        return [], [], []

    titles = cats['title'].astype(str)
    # Sub-categories are the titles that START with the namespace prefix; a
    # page whose title merely contains 'Category:' further in is still a page.
    is_subcat = titles.str.startswith('Category:')

    # Slice the prefix off rather than replacing every occurrence of it.
    children = list(titles[is_subcat].str[len('Category:'):])
    pages = list(titles[~is_subcat])
    page_id = list(cats['pageid'][~is_subcat])
    return page_id, pages, children

In [7]:
def cleaner(text):
    """
    Normalise raw page text into a space-separated string of lemmas.

    Steps: lower-case the text, strip markup tags, drop digits, replace
    every remaining character outside a-z with a space, collapse whitespace,
    tokenise with the module-level `nlp` object and drop tokens found in the
    module-level `stop` set.

    Params
    ------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces.

    """
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    # Remove each <tag>...</tag> pair with its content. The match is
    # non-greedy and tied to the same tag name, so ordinary text between two
    # separate pairs on a line survives. Self-closing tags (ending in '/>')
    # are excluded so they cannot open a pair.
    text = re.sub(r'<([a-z][a-z0-9]*)\b[^>]*(?<!/)>.*?</\1>', '', text)
    # Strip leftover tags while '<' and '>' still exist; after the character
    # filter below they are gone and tag names would leak into the tokens.
    text = re.sub(r'<[^>]*>', '', text)
    # Digits are deleted outright (not replaced by a space).
    text = re.sub(r'\d', '', text)
    text = re.sub(r'[^a-z ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)
    lemmas = [str(tok.lemma_) for tok in doc if str(tok.orth_) not in stop]
    return ' '.join(lemmas)

In [8]:
def wiki_traverse(main_cat, category, max_depth=-1, _visited=None):
    """
    Recursively crawl a category tree and store every article in MongoDB.

    Params
    ------
    main_cat : str
        Name of the root category of the crawl; stored on every document.
    category : str
        Name of the category to process at this level.
    max_depth : int
        Remaining depth budget. The category itself consumes one level, so
        0 does nothing; a negative value never reaches 0 and therefore means
        "no depth limit".
    _visited : dict or None
        Internal bookkeeping shared across the recursion (category -> largest
        depth budget it was explored with). Callers should leave it unset.

    Returns
    -------
    Does not return anything, function automatically feeds dictionaries of category, articles/
    and content into Mongo database.

    """
    if max_depth == 0:
        return
    if _visited is None:
        _visited = {}

    # Category graphs can contain cycles and shared sub-categories. Remember
    # how deep each category was already explored so the crawl terminates and
    # articles are not inserted twice for the same category.
    budget = float('inf') if max_depth < 0 else max_depth
    previous = _visited.get(category)
    if previous is not None and previous >= budget:
        return
    _visited[category] = budget

    page_id, pages, children = get_cats_and_pages(category)

    # On a revisit with a larger budget only the children need exploring; the
    # articles of this category were stored on the first visit.
    if previous is None:
        for article_id, article in zip(page_id, pages):
            article_dict = {
                'main_cat': main_cat,
                'sub_cat': category,
                'article': article,
                'page_id': str(article_id),
                'content': cleaner(get_content(article)),
            }
            # insert each article as soon as it is fetched
            wiki_col.insert_one(article_dict)

    for child in children:
        wiki_traverse(main_cat, child, max_depth-1, _visited)

In [None]:
# Crawl 'Business software' with a depth budget of 3 (the root category plus
# two levels of sub-categories) and store the articles in MongoDB.
wiki_traverse(main_cat='Business software', category='Business software', max_depth=3)

In [None]:
# Crawl 'Machine learning' with a depth budget of 3 (the root category plus
# two levels of sub-categories) and store the articles in MongoDB.
wiki_traverse(main_cat='Machine learning', category='Machine learning', max_depth=3)

In [None]:
# Show how many documents the collection holds after the crawls.
# NOTE(review): Collection.count() is deprecated in later PyMongo releases in
# favour of count_documents({}) - revisit when upgrading from the 3.6.1
# installed by this notebook.
wiki_col.count()