This notebook calculates the ratio of Wikipedia politician article counts to country/region population, as well as the ratio of "high-quality" (FA- or GA-rated) politician article counts to the total politician article counts for each country/region.

In [1]:
import pandas as pd
import numpy as np
import requests
import json
pd.set_option('display.max_rows', 999)

In [2]:
# Load the politician article list and the WPDS 2018 population table.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
df = pd.read_csv('/Users/allen/Downloads/country/data/page_data.csv')
pop = pd.read_csv('/Users/allen/Downloads/WPDS_2018_data.csv')

# Page names that start with 'Template' are not Wikipedia articles, so drop them.
# A prefix test is used instead of a substring search so that a real article
# whose title merely contains the word "Template" is kept. `na=True` flags rows
# with a missing page name as well, so those rows are still removed.
is_template = df['page'].str.startswith('Template', na=True)
df = df[~is_template].reset_index(drop=True)

# Identification headers to send with each ORES API request.
headers = {'User-Agent' : 'https://github.com/liuy379', 'From' : 'liuy379@uw.edu'}

In [3]:
def get_ores_data(revision_ids, headers):
    """Request ORES `wp10` quality scores for a batch of revision ids.

    Parameters
    ----------
    revision_ids : iterable
        Revision ids to score; they are stringified and joined with '|'
        into a single request.
    headers : dict
        HTTP headers sent with the request (e.g. User-Agent / From, so the
        API operator can identify the caller).

    Returns
    -------
    dict
        The decoded JSON body of the API response.
    """
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    # Pass the caller's headers along; previously they were accepted but never sent.
    api_call = requests.get(endpoint.format(**params), headers=headers)
    response = api_call.json()

    return response

In [4]:
def extract_prediction(df, batch_size=100):
    """Return a dataframe of rev_ids and their ORES quality predictions.

    The revision ids in ``df['rev_id']`` are sent to ``get_ores_data`` in
    batches, using the notebook-level ``headers``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``rev_id`` column.
    batch_size : int, default 100
        Number of revision ids sent per API request.

    Returns
    -------
    pandas.DataFrame
        Columns ``rev_id`` (the key as returned by the API) and ``prediction``
        (the predicted class, or the string 'ERROR' when the API reported an
        error for that revision). The two columns are present even when
        ``df`` is empty.
    """
    # Materialise the id list once instead of on every batch.
    rev_ids = df['rev_id'].tolist()
    records = []
    for start in range(0, len(rev_ids), batch_size):
        batch = rev_ids[start:start + batch_size]
        ores_data = get_ores_data(batch, headers)
        for rev_id, item in ores_data['enwiki']['scores'].items():
            if 'error' in item['wp10']:
                prediction = 'ERROR'
            else:
                prediction = item['wp10']['score']['prediction']
            records.append({'rev_id': rev_id, 'prediction': prediction})

    # Explicit columns keep the schema stable when there are no records.
    return pd.DataFrame(records, columns=['rev_id', 'prediction'])

In [None]:
# Fetch a quality prediction for every article revision (one API request per
# batch of revision ids, so this cell is slow).
pred = extract_prediction(df)

# output the rev_ids with no prediction available
pred[pred.prediction == 'ERROR'][['rev_id']].to_csv('revid_no_prediction.csv', index=False)

# keep only the revisions that received a prediction, and cast rev_id to int64
# so it can be merged against the article table on rev_id
pred = pred[pred.prediction != 'ERROR'].astype({'rev_id': 'int64'})

# attach the population row whose Geography equals the article's country
target = df.merge(pop, how='left', left_on='country', right_on='Geography')

# output data that does not have country match
target[target.Geography.isnull()].to_csv('wp_wpds_countries-no_match.csv', index=False)

# update working data: drop the unmatched rows (and any other row with a missing value)
target = target.dropna()

# merge with prediction data and drop articles that have no prediction
target = target.merge(pred, how='left', on='rev_id').dropna()

# Rename by column name rather than by position, so a change in the column
# order of either input cannot silently mislabel the output.
# NOTE(review): `population` is still the raw value read from the WPDS file at
# this point; the numeric conversion of `pop` happens in a later cell.
target = target.drop(columns=['Geography']).rename(columns={
    'page': 'article_name',
    'rev_id': 'revision_id',
    'Population mid-2018 (millions)': 'population',
    'prediction': 'article_quality',
})
target = target[['country','article_name','revision_id','article_quality','population']]
target.to_csv('wp_wpds_politicians_by_country.csv', index=True)

In [None]:
# manipulate data type: strip thousands separators and convert population to float.
# Casting to str first makes this cell safe to re-run after the column has
# already been converted (the `.str` accessor fails on a float column).
pop['Population mid-2018 (millions)'] = (
    pop['Population mid-2018 (millions)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('float')
)

In [None]:
# Preview the first five rows of the merged per-article table.
target.head(n=5)

In [None]:
# Preview the first five rows of the population table.
pop.head(n=5)

### 1. "10 highest-ranked countries in terms of number of politician articles as a proportion of country population"

In [11]:
# Count politician articles per country and attach each country's population.
# Only the two needed population columns are merged and the result columns are
# selected by name, so this cell keeps working when `pop` has extra columns
# (for example when it is re-run after a `region` column has been added).
d1 = (
    target.groupby('country')
    .agg(num_article=('revision_id', 'count'))
    .reset_index()
    .merge(
        pop[['Geography', 'Population mid-2018 (millions)']],
        how='left', left_on='country', right_on='Geography',
    )
)

# Population is expressed in millions, so divide by 1,000,000 to get articles per person.
d1['article_proportion'] = (d1['num_article'] / d1['Population mid-2018 (millions)']) / 1000000
d1 = d1[['country', 'article_proportion']]
d1.sort_values('article_proportion', ascending=False).head(10).reset_index(drop=True)

Unnamed: 0,country,article_proportion
0,Tuvalu,0.0054
1,Nauru,0.0052
2,San Marino,0.0027
3,Monaco,0.001
4,Liechtenstein,0.0007
5,Tonga,0.00063
6,Marshall Islands,0.000617
7,Iceland,0.000503
8,Andorra,0.000425
9,Grenada,0.00036


### 2. "10 lowest-ranked countries in terms of number of politician articles as a proportion of country population"

In [12]:
# Ten countries with the fewest politician articles per person (ascending order).
d1.sort_values(by='article_proportion').iloc[:10].reset_index(drop=True)

Unnamed: 0,country,article_proportion
0,India,7.146503e-07
1,Indonesia,7.918552e-07
2,China,8.107332e-07
3,Uzbekistan,8.510638e-07
4,Ethiopia,9.395349e-07
5,"Korea, North",1.40625e-06
6,Zambia,1.412429e-06
7,Thailand,1.691843e-06
8,Mozambique,1.901639e-06
9,Bangladesh,1.917067e-06


### 3. "10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality"

In [13]:
# Per country: number of FA/GA-rated politician articles and total number of articles.
d2 = target.groupby('country').agg(
    num_quality=('article_quality', lambda quality: quality.isin(['FA', 'GA']).sum()),
    num_article=('country', 'count'),
)
# Vectorised column division instead of a row-by-row apply.
d2['proportion'] = d2['num_quality'] / d2['num_article']
d2[['proportion']].sort_values('proportion', ascending=False).head(10)

Unnamed: 0_level_0,proportion
country,Unnamed: 1_level_1
"Korea, North",0.194444
Saudi Arabia,0.127119
Mauritania,0.125
Central African Republic,0.121212
Romania,0.113703
Tuvalu,0.092593
Bhutan,0.090909
Dominica,0.083333
Syria,0.078125
Benin,0.076923


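### 4. "10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality"

Many countries have no GA- or FA-quality politician articles at all (a proportion of 0). Those countries are listed first; the 10 lowest-ranked countries among the remaining ones are shown after that.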

In [22]:
# Countries with zero FA/GA-rated politician articles.
d2.loc[d2['num_quality'] == 0].index

Index(['Andorra', 'Angola', 'Antigua and Barbuda', 'Bahamas', 'Barbados',
       'Belize', 'Cameroon', 'Cape Verde', 'Comoros', 'Costa Rica', 'Djibouti',
       'Eritrea', 'Estonia', 'Federated States of Micronesia', 'Finland',
       'Guyana', 'Kazakhstan', 'Kiribati', 'Lesotho', 'Liechtenstein',
       'Macedonia', 'Malta', 'Marshall Islands', 'Moldova', 'Monaco',
       'Mozambique', 'Namibia', 'Nauru', 'San Marino', 'Sao Tome and Principe',
       'Seychelles', 'Slovakia', 'Solomon Islands', 'Tonga', 'Tunisia',
       'Turkmenistan', 'Uganda', 'Zambia'],
      dtype='object', name='country')

In [23]:
# Ten lowest-ranked countries once the zero-proportion countries are left out.
d2.loc[d2['proportion'] != 0, ['proportion']].sort_values(by='proportion').head(10)

Unnamed: 0_level_0,proportion
country,Unnamed: 1_level_1
Belgium,0.001923
Tanzania,0.002469
Switzerland,0.002488
Nepal,0.002801
Peru,0.002857
Nigeria,0.002954
Colombia,0.003509
Lithuania,0.004098
Fiji,0.005076
Azerbaijan,0.005587


### 5. "Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population"

In [15]:
def generate_region_col(pop):
    """Return, for every row of ``pop``, the name of the region it belongs to.

    A row is treated as a region header when its ``Geography`` value is an
    all-upper-case string. Each row (including the header row itself) is
    labelled with the most recent header at or above it.

    Parameters
    ----------
    pop : pandas.DataFrame
        Must contain a ``Geography`` column.

    Returns
    -------
    list
        One entry per row of ``pop``, in row order. Rows that appear before
        the first region header get a missing value (NaN).
    """
    geography = pop['Geography']
    # Non-string entries (e.g. NaN) are never region headers.
    is_region = geography.map(lambda name: isinstance(name, str) and name.isupper()).astype(bool)
    # Blank out non-header rows, then carry each header name down to the rows below it.
    # This works on row order, so it does not depend on the frame having a 0..n-1 index.
    return geography.where(is_region).ffill().tolist()

In [25]:
# Label every population row with its region.
pop['region'] = generate_region_col(pop)

# Per-country article counts joined with the country's population and region.
d3 = target.groupby('country').agg(
    num_article = ('revision_id', 'count'),
).merge(pop, how='left', left_on='country', right_on='Geography')

# NOTE(review): the regional population here is the sum over the countries that
# appear in `target` only, not over every country listed in `pop`.
d3 = d3.groupby('region').agg(
    total_article = ('num_article', 'sum'),
    total_population_million = ('Population mid-2018 (millions)', 'sum')
)

# Population is in millions, so scale it to get articles per person. The column
# name must match the aggregate defined above (`total_population_million`).
d3['proportion'] = d3['total_article'] / (d3['total_population_million'] * 1000000)
d3.sort_values('proportion', ascending=False)[['proportion']]

Unnamed: 0_level_0,proportion
region,Unnamed: 1_level_1
OCEANIA,7.9e-05
EUROPE,2.2e-05
LATIN AMERICA AND THE CARIBBEAN,8e-06
AFRICA,6e-06
NORTHERN AMERICA,5e-06
ASIA,3e-06


### 6. "Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality"

In [26]:
# Per country: FA/GA-rated article count and total article count, joined with
# the population table. The later grouping relies on `pop` having a `region` column.
d4 = target.groupby('country').agg(
    num_quality=('article_quality', lambda quality: quality.isin(['FA', 'GA']).sum()),
    num_article=('country', 'count'),
).merge(pop, how='left', left_on='country', right_on='Geography')

# Roll the country-level counts up to region level.
d4 = d4.groupby('region').agg(
    total_quality_article=('num_quality', 'sum'),
    total_article=('num_article', 'sum'),
)
# Vectorised column division instead of a row-by-row apply.
d4['proportion'] = d4['total_quality_article'] / d4['total_article']

d4.sort_values('proportion', ascending=False)[['proportion']]

Unnamed: 0_level_0,proportion
region,Unnamed: 1_level_1
NORTHERN AMERICA,0.051536
ASIA,0.026884
OCEANIA,0.0211
EUROPE,0.020298
AFRICA,0.018246
LATIN AMERICA AND THE CARIBBEAN,0.013349
