In [1]:
import json
import pandas as pd
from oresapi import Session
import requests

import matplotlib.pyplot as plt
import matplotlib

# Notebook-wide display settings: ggplot styling for every matplotlib figure,
# and up to 100 rows shown when pandas prints a frame.
matplotlib.style.use("ggplot")
pd.options.display.max_rows = 100

In [None]:
# Example request:
#   https://en.wikipedia.org/w/api.php?action=query&titles=Albert%20Einstein

# MediaWiki API endpoint that every title lookup is sent to.
BASE_URL = "https://en.wikipedia.org/w/api.php"

# Batch size handed to getPageIdFromName as its ``chunk_size`` argument.
CHUNK_SIZE = 50

def getPageIdFromName(df, chunk_size):
    """Look up a revision ID for every page title in ``df``.

    The titles are sent to the MediaWiki API at ``BASE_URL`` in batches of
    ``chunk_size`` rows, joined with ``|`` into a single ``titles`` parameter.

    Args:
        df: DataFrame with a ``title`` column holding page-title strings.
        chunk_size: Number of rows (titles) sent per request.
            NOTE(review): the API caps the number of titles per request;
            confirm the limit before raising this above 50.

    Returns:
        DataFrame with the columns ``rev_id`` (first revision ID returned by
        the API for the page) and ``label`` (the page title as reported by the
        API). Pages for which the API returns no revisions are skipped. The
        two columns are present even when no row was resolved.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    columns = ['rev_id', 'label']
    rows = []
    total_rows = df.shape[0]

    for start in range(0, total_rows, chunk_size):
        batch = df.iloc[start:start + chunk_size]
        params = {
            'action': 'query',
            'titles': '|'.join(batch['title']),
            'format': 'json',
            'prop': 'revisions',
        }

        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(BASE_URL, params=params, timeout=30)
        # Raises requests.HTTPError (a RequestException) carrying the status.
        r.raise_for_status()
        pages = r.json()['query']['pages']

        for page in pages.values():
            revisions = page.get('revisions')
            if not revisions:
                # No revision data came back for this page (e.g. the title
                # does not exist); skip it instead of aborting the batch.
                continue
            rows.append({
                'rev_id': revisions[0]['revid'],
                'label': page['title'],
            })

        requested = start + chunk_size
        if requested % 1000 == 0:
            print('fetched {0} rev IDs'.format(requested))

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=columns)


def _getTopPrediction(result):
    """Return the first ``drafttopic`` prediction in ``result``, or None.

    None is returned when the model reported an error for the revision or
    when its prediction list is empty.
    """
    model_result = result['drafttopic']
    if 'error' in model_result:
        return None
    predictions = model_result['score']['prediction']
    return predictions[0] if predictions else None


def getTopics(df):
    """Score every distinct revision in ``df`` with the ``drafttopic`` model.

    Args:
        df: DataFrame with a ``rev_id`` column.

    Returns:
        DataFrame with one row per distinct revision ID and the columns
        ``rev_id`` and ``dt_pred``. ``dt_pred`` is the first predicted topic,
        or None when scoring failed or no topic was predicted. The two
        columns are present even when ``df`` holds no revisions.
    """
    rev_id_list = df['rev_id'].unique().tolist()

    # NOTE(review): an empty user agent is sent; consider identifying the
    # client here -- confirm against the service's usage policy.
    session = Session("https://ores.wikimedia.org", user_agent="")
    results = session.score("enwiki", ["drafttopic"], rev_id_list)

    topic_list = []
    for count, (rev_id, result) in enumerate(zip(rev_id_list, results), start=1):
        try:
            prediction = _getTopPrediction(result)
        except (KeyError, IndexError, TypeError):
            # Show the unexpected payload before propagating the failure.
            print(result)
            raise
        # Store the topic itself; the original trailing comma wrapped it in a
        # one-element tuple.
        topic_list.append({'rev_id': rev_id, 'dt_pred': prediction})
        if count % 1000 == 0:
            print('predicted topics for {0} page IDs'.format(count))

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(topic_list, columns=['rev_id', 'dt_pred'])

# Resolve the first 1000 rows of ``df`` to revision IDs, then predict a topic
# for each of them. NOTE(review): ``df`` is not defined in this cell --
# presumably it is loaded earlier in the notebook; confirm.
name_id_map = getPageIdFromName(df=df.head(1000), chunk_size=CHUNK_SIZE)
topic_df = getTopics(df=name_id_map)