In [1]:
import pandas as pd
import numpy as np
import requests
import json
# Let pandas render up to 999 rows before truncating a displayed frame.
pd.options.display.max_rows = 999

In [5]:
# Input files. Both are absolute paths on the original author's machine and
# have to be adjusted before the notebook can run anywhere else.
PAGE_DATA_CSV = '/Users/allen/Downloads/country/data/page_data.csv'
WPDS_CSV = '/Users/allen/Downloads/WPDS_2018_data.csv'

# Article listing; displayed further down with columns page / country / rev_id.
df = pd.read_csv(PAGE_DATA_CSV)
# Population table; displayed further down with a Geography column and a
# 'Population mid-2018 (millions)' column.
pop = pd.read_csv(WPDS_CSV)

In [6]:
# Pages whose names start with 'Template' are not Wikipedia articles, so they
# are removed here.
# The test is anchored to the start of the title: a plain substring search
# would also discard genuine articles that merely contain the word 'Template'
# somewhere in their name.
# na=True treats a missing title as "not an article", so such rows are dropped
# as well (the previous `find(...) == -1` comparison also excluded them).
is_template = df['page'].str.startswith('Template', na=True)
df = df[~is_template].reset_index(drop=True)

In [7]:
# HTTP headers that identify the caller; intended to accompany each ORES request.
headers = {
    'User-Agent': 'https://github.com/liuy379',
    'From': 'liuy379@uw.edu',
}
# define function to get ores data
def get_ores_data(revision_ids, headers, timeout=None):
    """Request ORES ``wp10`` scores for a batch of English Wikipedia revisions.

    Parameters
    ----------
    revision_ids : iterable
        Revision ids to score; each is converted with ``str`` and the batch is
        joined with ``|`` into a single query parameter.
    headers : dict
        HTTP headers sent with the request (e.g. ``User-Agent`` / ``From``).
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. The default ``None`` waits indefinitely,
        which matches the behaviour of calling ``requests.get`` without it.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    # Actually send the caller-supplied headers: they were previously accepted
    # as an argument but never attached to the request.
    api_call = requests.get(endpoint.format(**params), headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error payload to the
    # caller, where it would only surface later as an unrelated KeyError.
    api_call.raise_for_status()

    return api_call.json()

In [8]:
def extract_prediction(df, batch_size=100):
    """Return a dataframe of rev_ids and their ORES quality predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``rev_id`` column.
    batch_size : int, optional
        Number of revision ids sent per ``get_ores_data`` call (default 100).

    Returns
    -------
    pandas.DataFrame
        Columns ``rev_id`` and ``prediction``, one row per scored revision.
        ``rev_id`` holds the keys of the JSON ``scores`` mapping exactly as
        returned, so callers cast it before joining on a numeric id.
        ``prediction`` is the predicted class, or the sentinel ``'ERROR'``
        when the ``wp10`` entry for that revision contains an ``error`` key.
        The two columns are always present, even when ``df`` is empty.
    """
    # Build the id list once instead of once per batch.
    rev_ids = df['rev_id'].tolist()
    output = []
    for start in range(0, len(rev_ids), batch_size):
        batch = rev_ids[start:start + batch_size]
        # `headers` is the notebook-level dict of identifying HTTP headers.
        ores_data = get_ores_data(batch, headers)
        for rev_id, item in ores_data['enwiki']['scores'].items():
            model_result = item['wp10']
            if 'error' in model_result:
                prediction = 'ERROR'
            else:
                prediction = model_result['score']['prediction']
            output.append({'rev_id': rev_id, 'prediction': prediction})

    # Naming the columns explicitly keeps the schema stable for empty results,
    # so downstream filters on `prediction` do not fail.
    return pd.DataFrame(output, columns=['rev_id', 'prediction'])

In [9]:
pred = extract_prediction(df)

# Revisions flagged with the 'ERROR' sentinel have no usable prediction;
# record their ids in a separate file.
no_prediction = pred['prediction'] == 'ERROR'
pred.loc[no_prediction, ['rev_id']].to_csv('revid_no_prediction.csv', index=False)

# Carry on with the successfully scored revisions only.
pred = pred[~no_prediction]

In [10]:
# Preview the first rows of the prediction table (rev_id, prediction).
pred.head()

Unnamed: 0,rev_id,prediction
0,355319463,Stub
1,393276188,Stub
2,393822005,Stub
3,395521877,Stub
4,395526568,Stub


In [11]:
# Preview the first rows of the filtered article table (page, country, rev_id).
df.head()

Unnamed: 0,page,country,rev_id
0,Bir I of Kanem,Chad,355319463
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
2,Yos Por,Cambodia,393822005
3,Julius Gregr,Czech Republic,395521877
4,Edvard Gregr,Czech Republic,395526568


In [31]:
# Give the prediction ids the int64 dtype so they can later be joined against
# the article table's rev_id column.
pred = pred.assign(rev_id=pred['rev_id'].astype('int64'))

# Attach population figures to every article by country name. A left join
# keeps articles whose country has no counterpart in the population table;
# those rows end up with empty Geography / population cells.
target = pd.merge(df, pop, how='left', left_on='country', right_on='Geography')

In [32]:
# Preview the merged table; rows whose country found no match in the
# population data show empty Geography / population cells.
target.head()

Unnamed: 0,page,country,rev_id,Geography,Population mid-2018 (millions)
0,Bir I of Kanem,Chad,355319463,Chad,15.4
1,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,,
2,Yos Por,Cambodia,393822005,Cambodia,16.0
3,Julius Gregr,Czech Republic,395521877,,
4,Edvard Gregr,Czech Republic,395526568,,


In [35]:
# Articles whose country found no partner in the population table are written
# to their own file.
unmatched = target['Geography'].isnull()
target[unmatched].to_csv('wp_wpds_countries-no_match.csv', index=False)

# Keep working only with complete rows (any row containing a missing value is
# dropped, not just the unmatched ones).
target = target.dropna()

In [36]:
# Attach the quality prediction to each article by revision id, then discard
# every row that still has a missing value (e.g. revisions without a prediction).
target = target.merge(pred, how='left', on='rev_id').dropna()

In [43]:
# Shape the merged frame into the final schema and write it out.
# Columns are renamed by name rather than by position, so a change in the
# upstream column order cannot silently mislabel the data, and nothing is
# mutated in place.
target = (
    target
    .drop(columns=['Geography'])
    .rename(columns={
        'page': 'article_name',
        'rev_id': 'revision_id',
        'Population mid-2018 (millions)': 'population',
        'prediction': 'article_quality',
    })
    .loc[:, ['country', 'article_name', 'revision_id', 'article_quality', 'population']]
)
# NOTE(review): index=True writes the frame's row index as an extra unnamed
# first column, while the other exports in this notebook use index=False —
# confirm this is intended.
target.to_csv('wp_wpds_politicians_by_country.csv', index=True)

In [49]:
# Preview the final table in its exported column order.
target.head()

Unnamed: 0,country,article_name,revision_id,article_quality,population
0,Chad,Bir I of Kanem,355319463,Stub,15.4
1,Cambodia,Yos Por,393822005,Stub,16.0
2,Canada,Robert Douglas Cook,401577829,Stub,37.2
3,Egypt,List of Grand Viziers of Egypt,442937236,Stub,97.0
4,Pakistan,Sehba Musharraf,448555418,Stub,200.6


In [109]:
# Preview the population table. Note the first displayed row (AFRICA) is an
# upper-case label that appears to be a regional aggregate, not a country.
pop.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


In [153]:
# "10 highest-ranked countries in terms of number of politician articles as a proportion of country population"
POPULATION_COL = 'Population mid-2018 (millions)'

# Count articles per country. reset_index() turns the group key into a regular
# column so the join below uses an explicit column instead of depending on how
# merge treats an index level.
a1 = (
    target.groupby('country')
    .agg(num_article=('revision_id', 'count'))
    .reset_index()
    .merge(pop, how='left', left_on='country', right_on='Geography')
)

# The population column may hold strings with thousands separators ("1,284")
# or already-numeric values, depending on how `pop` was loaded or processed.
# Going through str first makes the conversion work in both cases; calling
# .str.replace directly on a numeric column raises an AttributeError.
population_millions = pd.to_numeric(
    a1[POPULATION_COL].astype(str).str.replace(',', '', regex=False)
)

# Population is expressed in millions, hence the extra division by 1,000,000
# to obtain articles per person.
a1['article_proportion'] = (a1['num_article'] / population_millions) / 1000000
a1 = (
    a1[['country', 'article_proportion']]
    .sort_values('article_proportion', ascending=False)
)

In [None]:
# Show the ten countries with the highest article-to-population proportion
# (a1 is sorted in descending order above).
a1.head(10)

In [201]:
# List the rows whose Geography label is entirely upper-case; in the output
# below these are the continent/region aggregates rather than countries.
pop[pop['Geography'].str.isupper()]

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284
56,NORTHERN AMERICA,365
59,LATIN AMERICA AND THE CARIBBEAN,649
95,ASIA,4536
144,EUROPE,746
189,OCEANIA,41
