In [1]:
import re
from urllib.parse import quote_plus

import pandas as pd
import requests

In [2]:
def generate_category(category):
    '''
    Format a category name for insertion into a Wikipedia API call.

    Every whitespace character becomes a ``+`` and every other character
    that is not safe inside a query string (``&``, ``=``, ``#``, ``+``,
    non-ASCII text, ...) is percent-encoded, so the name cannot spill into
    the neighbouring parameters of the URL.

    Parameters
    ----------
    category : str
        Category name without the ``Category:`` prefix.

    Returns
    -------
    str
        The encoded name, e.g. ``'machine learning'`` -> ``'machine+learning'``.
    '''
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Tabs/newlines are folded to plain spaces first so that each of them
    # still maps to a single '+' rather than to '%09' / '%0A'.
    spaced = re.sub(r'\s', ' ', category)
    return quote_plus(spaced)

In [3]:
# Quick check of the formatter: the space should come back as '+'.
generate_category('machine learning')

'machine+learning'

In [4]:
def generate_query(category):
    '''
    Build the Wikipedia API URL that lists the members of a category.

    Parameters
    ----------
    category : str
        Category name without the ``Category:`` prefix; it is formatted
        with ``generate_category`` before being placed in the URL.

    Returns
    -------
    str
        A ``list=categorymembers`` query URL requesting JSON output and
        ``cmlimit=max``.
    '''
    # Assemble the parameters explicitly instead of writing an indented
    # multi-line string and stripping its whitespace with a regex (the old
    # pattern '\s' was also an invalid escape sequence in a plain literal).
    parameters = (
        'action=query',
        'format=json',
        'list=categorymembers',
        'cmtitle=Category:{}'.format(generate_category(category)),
        'cmlimit=max',
    )
    # https avoids sending the request in clear text and the extra
    # http -> https redirect round trip.
    return 'https://en.wikipedia.org/w/api.php?' + '&'.join(parameters)

In [5]:
def execute_category_query(category, timeout=30):
    '''
    Execute a category query and return a DataFrame of the category members.

    The API delivers members in batches and announces further batches with
    a top-level ``continue`` object; its key/value pairs are sent back as
    extra parameters until no ``continue`` object is returned, so large
    categories are not silently cut off after the first batch.

    Parameters
    ----------
    category : str
        Category name without the ``Category:`` prefix.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per category member, built from the ``categorymembers``
        records of the response(s). Empty (no columns) when the category
        has no members.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    KeyError
        If the JSON payload has no ``query`` / ``categorymembers`` entry.
    '''
    url = generate_query(category)
    members = []
    continuation = {}
    while True:
        # `params` is merged into the query string already present in `url`.
        r = requests.get(url, params=continuation, timeout=timeout)
        r.raise_for_status()
        response = r.json()
        members.extend(response['query']['categorymembers'])
        continuation = response.get('continue')
        if not continuation:
            break
    return pd.DataFrame(members)

In [6]:
# Fetch the direct members of the 'machine learning' category as a DataFrame.
test = execute_category_query('machine learning')

In [7]:
# Preview the first rows of the query result.
test.head()

Unnamed: 0,ns,pageid,title
0,0,43385931,Data exploration
1,0,49082762,List of datasets for machine learning research
2,0,233488,Machine learning
3,0,53587467,Outline of machine learning
4,0,53198248,Singular statistical model


In [8]:
# Boolean mask: True for members whose title contains 'Category:',
# i.e. the sub-category entries rather than ordinary pages.
category_mask = test['title'].str.contains('Category:')

In [9]:
# Show only the rows selected by the mask.
test[category_mask]

Unnamed: 0,ns,pageid,title
198,14,33547387,Category:Applied machine learning
199,14,42936114,Category:Artificial neural networks
200,14,1718975,Category:Bayesian networks
201,14,1991254,Category:Classification algorithms
202,14,22532673,Category:Cluster analysis
203,14,34310097,Category:Computational learning theory
204,14,12932492,Category:Artificial intelligence conferences
205,14,33542714,Category:Data mining and machine learning soft...
206,14,42320378,Category:Datasets in machine learning
207,14,29549713,Category:Dimension reduction


In [10]:
def remove_category(category):
    '''
    Return `category` with every occurrence of the literal text
    'Category:' removed.
    '''
    return category.replace('Category:', '')

In [11]:
# Sub-category names with the 'Category:' text stripped, plus an empty list
# that will hold one DataFrame per sub-category.
categories_to_query = [
    remove_category(title) for title in test.loc[category_mask, 'title']
]
subcat_df_list = []

In [12]:
# Inspect the sub-category names collected above.
categories_to_query

['Applied machine learning',
 'Artificial neural networks',
 'Bayesian networks',
 'Classification algorithms',
 'Cluster analysis',
 'Computational learning theory',
 'Artificial intelligence conferences',
 'Data mining and machine learning software',
 'Datasets in machine learning',
 'Dimension reduction',
 'Ensemble learning',
 'Evolutionary algorithms',
 'Genetic programming',
 'Inductive logic programming',
 'Kernel methods for machine learning',
 'Latent variable models',
 'Learning in computer vision',
 'Log-linear models',
 'Loss functions',
 'Machine learning algorithms',
 'Machine learning portal',
 'Machine learning task',
 'Markov models',
 'Machine learning researchers',
 'Semisupervised learning',
 'Statistical natural language processing',
 'Structured prediction',
 'Supervised learning',
 'Support vector machines',
 'Unsupervised learning']

In [13]:
# One DataFrame of members per sub-category. The list is rebuilt here rather
# than appended to, so re-running this cell does not duplicate its entries.
subcat_df_list = [execute_category_query(cat) for cat in categories_to_query]

In [14]:
def _category_key(name):
    '''Return the form of a category name used to detect repeat visits.'''
    # The notebook queries 'machine learning' while the API reports titles
    # with a capital first letter, so names are compared with the first
    # letter upper-cased. NOTE(review): confirm this matches the wiki's
    # title normalisation.
    name = name.strip()
    return name[:1].upper() + name[1:]


def get_all_pages_rec(category, _visited=None):
    '''
    Recursively collect the pages of a category and of all its sub-categories.

    Parameters
    ----------
    category : str
        Category name without the ``Category:`` prefix.
    _visited : set, optional
        Internal bookkeeping shared between recursive calls: the categories
        already expanded. Callers should leave it unset.

    Returns
    -------
    pandas.DataFrame
        The non-category members of `category` followed by those of every
        sub-category reached from it, with a fresh 0..n-1 index. A page that
        belongs to several categories appears once per category. Empty when
        the category has no members.
    '''
    prefix = 'Category:'
    if _visited is None:
        _visited = set()
    _visited.add(_category_key(category))

    category_df = execute_category_query(category)
    if 'title' not in category_df.columns:
        # No members at all: the frame has no columns to filter on.
        return category_df

    # Only a leading 'Category:' marks a sub-category; a page that merely
    # contains that text elsewhere in its title is still a page.
    is_subcategory = category_df['title'].str.startswith(prefix)
    pages_list = [category_df[~is_subcategory]]

    for title in category_df.loc[is_subcategory, 'title']:
        subcategory = title[len(prefix):]
        # Categories can be reachable through several parents (or through a
        # cycle); expanding each one only once keeps the recursion finite.
        if _category_key(subcategory) in _visited:
            continue
        pages_list.append(get_all_pages_rec(subcategory, _visited))

    # ignore_index replaces the old `pages_df.reset_index()` call, whose
    # result was thrown away and therefore never reset anything.
    return pd.concat(pages_list, ignore_index=True)

In [16]:
# Gather the pages of 'machine learning' together with those of its sub-categories.
rec_test = get_all_pages_rec('machine learning')

In [17]:
def get_whole_category(category):
    '''
    Return the de-duplicated pages found under `category`.

    The frame produced by `get_all_pages_rec` has its duplicate rows
    dropped and its index renumbered, and gains a 'category' column
    holding the `category` argument on every row.
    '''
    pages = (
        get_all_pages_rec(category)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    pages['category'] = category
    return pages

In [18]:
# De-duplicated pages for 'machine learning', tagged with the category name.
df = get_whole_category('machine learning')

In [19]:
# (rows, columns) of the de-duplicated result.
df.shape

(1104, 4)