In [2]:
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests

In [3]:
def generate_category(category):
    '''
    Format a category name for insertion into a Wikipedia API call.

    Every whitespace character becomes ``+`` (the query-string encoding of
    a space) and every other reserved character is percent-encoded, so a
    name containing ``&``, ``=``, ``#`` or a literal ``+`` cannot corrupt
    the surrounding query string.

    parameters:
        category - the category name without the "Category:" prefix
    returns: the URL-safe category name
    '''
    # Normalise tabs/newlines to plain spaces first so quote_plus turns each
    # of them into '+', exactly like an ordinary space.
    normalised = re.sub(r'\s', ' ', category)
    return quote_plus(normalised)

def generate_query(category):
    '''
    Build the Wikipedia API URL that lists the members of a category.

    parameters:
        category - the category name without the "Category:" prefix
    returns: the request URL as a single string
    '''
    # Assemble the query string from its parts instead of stripping
    # whitespace out of a multi-line template with a regex.
    params = [
        'action=query',
        'format=json',
        'list=categorymembers',
        'cmtitle=Category:{}'.format(generate_category(category)),
        'cmlimit=max',
    ]
    # Use https directly rather than relying on a redirect from plain http.
    return 'https://en.wikipedia.org/w/api.php?' + '&'.join(params)

def execute_category_query(category, timeout=30):
    '''
    Execute a category query and return a DataFrame of the category members.

    The API caps the number of members per response; when more are
    available the payload carries a ``continue`` object whose values must
    be sent back to fetch the next batch. All batches are collected here so
    large categories are not silently truncated.

    parameters:
        category - the category name without the "Category:" prefix
        timeout - seconds to wait for each HTTP response
    returns: a dataframe with columns pageid, ns and title (zero rows when
             the category has no members)
    raises: requests.HTTPError on an unsuccessful HTTP status; KeyError when
            the payload has no query/categorymembers section
    '''
    url = generate_query(category)
    members = []
    continuation = {}
    while True:
        r = requests.get(url, params=continuation, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        members.extend(response['query']['categorymembers'])
        # A missing 'continue' key means this was the last batch.
        continuation = response.get('continue')
        if not continuation:
            break
    # Naming the columns keeps 'title' present even for an empty category.
    return pd.DataFrame(members, columns=['pageid', 'ns', 'title'])

def remove_category(category):
    '''Drop every "Category:" marker so a subcategory title can be queried.'''
    marker = re.compile('Category:')
    return marker.sub('', category)

In [4]:
# Recursive crawl; an empty category or exhausted depth yields an empty frame
def get_all_pages_rec(category, max_depth=3, exclude=('club software',)):
    '''
    Collect the pages of a category and, recursively, of its subcategories.

    parameters:
        category - the category to be searched
        max_depth - the level of subcategories (i.e. max_depth=2 returns pages for category and pages for
                    its first set of subcategories)
        exclude - case-insensitive substrings; a subcategory whose name contains any of them is skipped
    returns: a dataframe of pageid, ns, and title (zero rows when max_depth < 1 or nothing is found).
             A page reachable through several subcategories appears once per path.
    '''
    columns = ['pageid', 'ns', 'title']
    if max_depth <= 0:
        return pd.DataFrame(columns=columns)

    category_df = execute_category_query(category)
    # An empty result may come back without a 'title' column at all.
    if category_df.empty or 'title' not in category_df.columns:
        return pd.DataFrame(columns=columns)

    # Subcategory titles carry the "Category:" prefix; anything else is a page.
    is_subcategory = category_df['title'].str.startswith('Category:')
    frames = [category_df[~is_subcategory]]

    # Descend only while depth remains, so no call is made just to return nothing.
    if max_depth > 1:
        skip_terms = [term.lower() for term in exclude]
        for title in category_df.loc[is_subcategory, 'title']:
            subcategory = title[len('Category:'):]
            if any(term in subcategory.lower() for term in skip_terms):
                continue
            sub_pages = get_all_pages_rec(subcategory, max_depth - 1, exclude)
            # Leave empty results out so they cannot influence the column
            # dtypes of the concatenated frame.
            if not sub_pages.empty:
                frames.append(sub_pages)

    return pd.concat(frames).reset_index(drop=True)

def get_whole_category(category, max_depth=3):
    '''
    Fetch every page under a category and label each row with that category.

    parameters:
        category - the category to be searched
        max_depth - the level of subcategories (i.e. max_depth=2 returns pages for category and pages for
                    its first set of subcategories)
    returns: a dataframe of pageid, title and category, de-duplicated and
             indexed 0..n-1 (zero rows when nothing was collected)
    '''
    pages = get_all_pages_rec(category, max_depth)
    # The crawl may hand back nothing at all when no depth is available;
    # substitute an empty frame so the pipeline below still applies.
    if pages is None:
        pages = pd.DataFrame(columns=['pageid', 'ns', 'title'])
    return (
        pages
        .drop_duplicates()
        .reset_index(drop=True)
        .assign(category=category)
        .drop(columns='ns', errors='ignore')
    )

In [5]:
# max_depth=5: the category itself plus four levels of subcategories.
ml_df = get_whole_category(category='machine learning', max_depth=5)

In [6]:
# max_depth=2: the category itself plus its direct subcategories.
bs_df = get_whole_category(category='business software', max_depth=2)

In [9]:
# ignore_index gives the combined frame one continuous 0..n-1 index; without
# it the row labels of ml_df and bs_df both start at 0 and collide.
big_df = pd.concat([ml_df, bs_df], ignore_index=True)

In [10]:
# Quick sanity check on the combined frame: (rows, columns).
big_df.shape

(2658, 3)

In [11]:
# Persist the labelled pages so later steps can load them without re-crawling.
big_df.to_pickle(path='../data/pages-df.p')