## A2 - Bias in Data
## Jack Chen

### Step 2: Cleaning the Data

In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt

In [2]:
page_data = pd.read_csv("../data_raw/page_data.csv")
wpds = pd.read_csv("../data_raw/WPDS_2020_data.csv")

In [3]:
page_data_reduced = page_data[~page_data.page.str.contains("Template:", regex = False)]
page_data_reduced

Unnamed: 0,page,country,rev_id
1,Bir I of Kanem,Chad,355319463
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188
12,Yos Por,Cambodia,393822005
23,Julius Gregr,Czech Republic,395521877
24,Edvard Gregr,Czech Republic,395526568
...,...,...,...
47192,Yahya Jammeh,Gambia,807482007
47193,Lucius Fairchild,United States,807483006
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153
47195,Francis Fessenden,United States,807483270


In [4]:
wpds_country = wpds[~wpds.Name.str.isupper()]
wpds_region = wpds[wpds.Name.str.isupper()]
wpds_country

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000
5,LY,Libya,Country,2019,6.891,6891000
6,MA,Morocco,Country,2019,35.952,35952000
7,SD,Sudan,Country,2019,43.849,43849000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.200,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.010,10000


page_data_reduced

### Step 3: Getting Article Quality Predictions

In [5]:
# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/jchen023',
    'From': 'jchen023@uw.edu'
}
endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

def get_param(rev_id):
    param = {"project": "enwiki",
            "model": "articlequality",
            "revids": "|".join(str(x) for x in rev_id)}
    return param

In [6]:
def api_call(endpoint, param):
    call = requests.get(endpoint.format(**param), headers=headers)
    response = call.json()
    return response

In [7]:
def get_prediction(rev_id):
    param = {"project": "enwiki",
            "model": "articlequality",
            "revids": "|".join(str(x) for x in rev_id)}
    api_data = api_call(endpoint, param)
    prediction = []
    for id in rev_id:
        prediction.append(api_data.get('enwiki', {})
            .get('scores', {})
            .get(str(id), {})
            .get("articlequality", {})
            .get("score", {})
            .get("prediction", np.nan))
    return prediction

In [8]:
max_item = 50
page_data_len = len(page_data_reduced)
article_quality_prediction = []
for start_pos in range(0, page_data_len, max_item):
    end_pos = min(page_data_len, start_pos + max_item)
    predict_list = get_prediction(page_data_reduced.rev_id.iloc[start_pos: end_pos])
    article_quality_prediction += predict_list

In [9]:
page_data_reduced["article_quality_est"] = article_quality_prediction
page_data_without_prediction = page_data_reduced[page_data_reduced.article_quality_est.isnull()]
page_data_with_prediction = page_data_reduced[~page_data_reduced.article_quality_est.isnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_data_reduced["article_quality_est"] = article_quality_prediction


In [10]:
page_data_without_prediction.drop(columns="article_quality_est").to_csv("../log/page_data_without_prediction.csv")

### Step 3: Combining the Datasets

In [15]:
all_merged_data = page_data_with_prediction.merge(wpds_country, left_on="country", right_on="Name", how="left")

In [37]:
wp_wpds_countries_no_match = all_merged_data[all_merged_data['Name'].isnull()]
wp_wpds_politicians_by_country = all_merged_data[~all_merged_data['Name'].isnull()]
wp_wpds_politicians_by_country = wp_wpds_politicians_by_country.rename(columns={"page": "article_name",
                                              "rev_id": "revision_id",
                                              "article_quality_est": "article_quality_est.",
                                              "Population": 'population'})[["country",
     "article_name", 
     "revision_id",
     "article_quality_est.",
     'population']]
wp_wpds_countries_no_match.to_csv("../log/wp_wpds_countries-no_match.csv")

wp_wpds_politicians_by_country.to_csv("../log/wp_wpds_politicians_by_country.csv")

In [38]:
wp_wpds_politicians_by_country

Unnamed: 0,country,article_name,revision_id,article_quality_est.,population
0,Chad,Bir I of Kanem,355319463,Stub,16877000.0
1,Palestinian Territory,Information Minister of the Palestinian Nation...,393276188,Stub,5008000.0
2,Cambodia,Yos Por,393822005,Stub,15497000.0
5,Canada,Robert Douglas Cook,401577829,Stub,38190000.0
6,Egypt,List of Grand Viziers of Egypt,442937236,Stub,100803000.0
...,...,...,...,...,...
46420,United States,Hal Bidlack,807481636,C,329878000.0
46421,Gambia,Yahya Jammeh,807482007,GA,2417000.0
46422,United States,Lucius Fairchild,807483006,C,329878000.0
46423,Saudi Arabia,Fahd of Saudi Arabia,807483153,GA,35041000.0


### Step 4: Analysis

In [60]:
wpds_pop_good_article = wp_wpds_politicians_by_country.groupby(["country"]).aggregate({"population":np.max, 
                                                               "article_quality_est.": 
                                                               lambda x: (x == "GA").sum() + (x == "FA").sum()})
wpds_pop_good_article["proportion"] = wpds_pop_good_article["article_quality_est."]/wpds_pop_good_article["population"]
pd.options.display.float_format = '{:.8f}'.format
wpds_pop_good_article

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,38928000.00000000,13,0.00000033
Albania,2838000.00000000,3,0.00000106
Algeria,44357000.00000000,2,0.00000005
Andorra,82000.00000000,0,0.00000000
Angola,32522000.00000000,0,0.00000000
...,...,...,...
Venezuela,28645000.00000000,3,0.00000010
Vietnam,96209000.00000000,13,0.00000014
Yemen,29826000.00000000,3,0.00000010
Zambia,18384000.00000000,0,0.00000000


In [66]:
wpds_pop_article = wp_wpds_politicians_by_country.groupby(["country"]).aggregate({"population":np.max, 
                                                               "article_quality_est.": 'count'})
wpds_pop_article["proportion"] = wpds_pop_article["article_quality_est."]/wpds_pop_article["population"]
pd.options.display.float_format = '{:.8f}'.format
wpds_pop_article

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,38928000.00000000,319,0.00000819
Albania,2838000.00000000,456,0.00016068
Algeria,44357000.00000000,116,0.00000262
Andorra,82000.00000000,34,0.00041463
Angola,32522000.00000000,106,0.00000326
...,...,...,...
Venezuela,28645000.00000000,130,0.00000454
Vietnam,96209000.00000000,187,0.00000194
Yemen,29826000.00000000,116,0.00000389
Zambia,18384000.00000000,25,0.00000136


### Step 5: Results

#### Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [69]:
wpds_pop_article.sort_values("proportion", ascending=False)[:10]

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,10000.0,54,0.0054
Nauru,11000.0,52,0.00472727
San Marino,34000.0,81,0.00238235
Monaco,38000.0,40,0.00105263
Liechtenstein,39000.0,28,0.00071795
Marshall Islands,57000.0,37,0.00064912
Tonga,99000.0,63,0.00063636
Iceland,368000.0,201,0.0005462
Andorra,82000.0,34,0.00041463
Federated States of Micronesia,106000.0,36,0.00033962


#### Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [70]:
wpds_pop_article[wpds_pop_article["article_quality_est."]!=0].sort_values("proportion", ascending=False)[-10:]

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,169809000.0,317,1.87e-06
Mozambique,31166000.0,58,1.86e-06
Thailand,66534000.0,112,1.68e-06
"Korea, North",25779000.0,36,1.4e-06
Zambia,18384000.0,25,1.36e-06
Ethiopia,114916000.0,101,8.8e-07
Uzbekistan,34174000.0,28,8.2e-07
China,1402385000.0,1129,8.1e-07
Indonesia,271739000.0,209,7.7e-07
India,1400100000.0,968,6.9e-07


#### Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [56]:
wpds_pop_good_article.sort_values("proportion", ascending=False)[:10]

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,10000.0,4,0.0004
Dominica,72000.0,1,1.389e-05
Vanuatu,321000.0,3,9.35e-06
Iceland,368000.0,2,5.43e-06
Ireland,5003000.0,25,5e-06
Montenegro,622000.0,2,3.22e-06
Martinique,356000.0,1,2.81e-06
Bhutan,730000.0,2,2.74e-06
New Zealand,4987000.0,13,2.61e-06
Romania,19241000.0,42,2.18e-06


#### Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [71]:
wpds_pop_good_article[wpds_pop_good_article["article_quality_est."]!=0].sort_values("proportion", ascending=False)[-10:]

Unnamed: 0_level_0,population,article_quality_est.,proportion
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,1402385000.0,40,3e-08
Brazil,211812000.0,6,3e-08
Morocco,35952000.0,1,3e-08
Uganda,45741000.0,1,2e-08
Colombia,49444000.0,1,2e-08
Bangladesh,169809000.0,3,2e-08
Ethiopia,114916000.0,2,2e-08
Tanzania,59734000.0,1,2e-08
Nigeria,206140000.0,2,1e-08
India,1400100000.0,13,1e-08


In [72]:
wpds_region


Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
10,WESTERN AFRICA,WESTERN AFRICA,Sub-Region,2019,401.115,401115000
27,EASTERN AFRICA,EASTERN AFRICA,Sub-Region,2019,444.97,444970000
48,MIDDLE AFRICA,MIDDLE AFRICA,Sub-Region,2019,179.757,179757000
58,SOUTHERN AFRICA,SOUTHERN AFRICA,Sub-Region,2019,67.732,67732000
64,NORTHERN AMERICA,NORTHERN AMERICA,Sub-Region,2019,368.193,368193000
67,LATIN AMERICA AND THE CARIBBEAN,LATIN AMERICA AND THE CARIBBEAN,Sub-Region,2019,651.036,651036000
68,CENTRAL AMERICA,CENTRAL AMERICA,Sub-Region,2019,178.611,178611000


In [73]:
wpds

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85000000,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.91800000,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.34400000,244344000
3,DZ,Algeria,Country,2019,44.35700000,44357000
4,EG,Egypt,Country,2019,100.80300000,100803000
...,...,...,...,...,...,...
229,WS,Samoa,Country,2019,0.20000000,200000
230,SB,Solomon Islands,Country,2019,0.71500000,715000
231,TO,Tonga,Country,2019,0.09900000,99000
232,TV,Tuvalu,Country,2019,0.01000000,10000
