In [230]:
import pandas as pd
import numpy as np
import json
import requests
import plotly

# Step 1: Getting the Article and Population Data

In [231]:
# Load the page data, which lists politician articles with their country and revision ID
politicians_country = pd.read_csv("page_data.csv")
politicians_country.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [232]:
# Load the world population dataset; besides country rows it also contains
# world and region aggregate rows (see the Type column)

world_population = pd.read_csv("WPDS_2020_data.csv")
world_population.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
0,WORLD,WORLD,World,2019,7772.85,7772850000
1,AFRICA,AFRICA,Sub-Region,2019,1337.918,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,244.344,244344000
3,DZ,Algeria,Country,2019,44.357,44357000
4,EG,Egypt,Country,2019,100.803,100803000


# Step 2: Cleaning the Data


In [233]:
# Cleaning politicians_country: flag the rows whose page name contains "Template:"
# so they can be removed from the dataframe in the next step
politicians_country['Not Wiki'] = politicians_country['page'].str.contains("Template:")
politicians_country.head()

Unnamed: 0,page,country,rev_id,Not Wiki
0,Template:ZambiaProvincialMinisters,Zambia,235107991,True
1,Bir I of Kanem,Chad,355319463,False
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046,True
3,Template:Uganda-politician-stub,Uganda,391862070,True
4,Template:Namibia-politician-stub,Namibia,391862409,True


In [234]:
# Keep only the rows that were not flagged as template pages
subset_politicians_country = politicians_country.loc[politicians_country['Not Wiki'].eq(False)]
subset_politicians_country.head()

Unnamed: 0,page,country,rev_id,Not Wiki
1,Bir I of Kanem,Chad,355319463,False
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,False
12,Yos Por,Cambodia,393822005,False
23,Julius Gregr,Czech Republic,395521877,False
24,Edvard Gregr,Czech Republic,395526568,False


In [235]:
# Cleaning the world population data: rows whose Name is written entirely in
# upper case are flagged as not country level, and only the remaining rows are kept
world_population['Not Country Level'] = world_population['Name'].str.isupper()
subset_world_population = world_population.loc[world_population['Not Country Level'].eq(False)]
subset_world_population.head()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population,Not Country Level
3,DZ,Algeria,Country,2019,44.357,44357000,False
4,EG,Egypt,Country,2019,100.803,100803000,False
5,LY,Libya,Country,2019,6.891,6891000,False
6,MA,Morocco,Country,2019,35.952,35952000,False
7,SD,Sudan,Country,2019,43.849,43849000,False


# Step 3: Getting Article Quality Predictions

In [236]:
# API call to get article prediction scores

def api_call(rev_id):
    """Fetch ORES article-quality predictions for one or more revisions.

    Parameters
    ----------
    rev_id : str or int
        A revision ID, or several revision IDs joined with "|". The value is
        substituted into the ``revids`` query parameter of the request.

    Returns
    -------
    list of dict
        One ``{'rev_id': int, 'prediction': ...}`` entry per revision that was
        scored. Revisions whose result contains an "error" key are skipped.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    endpoint = "https://ores.wikimedia.org/v3/scores/enwiki/?models=articlequality&revids={rev_id}"
    headers = {
        'User-Agent': 'https://github.com/leenaelamrawy',
        'From': 'lelamraw@uw.edu'
    }
    call = requests.get(endpoint.format(rev_id=rev_id), headers=headers)
    # Fail with a clear HTTP error instead of a KeyError on a missing 'enwiki' key below
    call.raise_for_status()
    response = call.json()

    quality_predictions_lst = []
    for rev_ids, value in response['enwiki']['scores'].items():
        result_dict = value['articlequality']
        # Revisions that could not be scored carry an "error" entry instead of a score
        if "error" not in result_dict:
            quality_predictions_lst.append({
                'rev_id': int(rev_ids),
                'prediction': result_dict["score"]["prediction"]
            })
    return quality_predictions_lst
    

# Step 4: Combining the Datasets

In [237]:
# Helper that splits a sequence into consecutive chunks
def get_groups(lst, n):
    """Yield successive slices of ``lst``, each holding at most ``n`` items."""
    chunk_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in chunk_starts)

In [239]:
# Request predictions in batches of 50 revision IDs, joined with "|" for the API
grouped_ids = list(get_groups(subset_politicians_country['rev_id'], 50))
preds = [api_call("|".join(map(str, batch))) for batch in grouped_ids]


In [240]:
# Turn every batch of predictions into a dataframe and stack them into a single
# dataframe, called merged, holding the prediction scores for all the articles
df_lst = list(map(pd.DataFrame, preds))
merged = pd.concat(df_lst)
merged

Unnamed: 0,rev_id,prediction
0,355319463,Stub
1,393276188,Stub
2,393822005,Stub
3,395521877,Stub
4,395526568,Stub
...,...,...
44,807481636,C
45,807482007,GA
46,807483006,C
47,807483153,GA


In [241]:
# Join the predictions with the article table on rev_id; final_df keeps
# rev id, prediction, page and country
final_df = pd.merge(merged, subset_politicians_country, on='rev_id')[
    ['rev_id', 'prediction', 'page', 'country']
]
final_df

Unnamed: 0,rev_id,prediction,page,country
0,355319463,Stub,Bir I of Kanem,Chad
1,393276188,Stub,Information Minister of the Palestinian Nation...,Palestinian Territory
2,393822005,Stub,Yos Por,Cambodia
3,395521877,Stub,Julius Gregr,Czech Republic
4,395526568,Stub,Edvard Gregr,Czech Republic
...,...,...,...,...
46420,807481636,C,Hal Bidlack,United States
46421,807482007,GA,Yahya Jammeh,Gambia
46422,807483006,C,Lucius Fairchild,United States
46423,807483153,GA,Fahd of Saudi Arabia,Saudi Arabia


In [242]:
# Combine final_df (politician articles with country) with the world population
# dataset. The outer join keeps rows that exist on only one side, so they can be
# reported as "no match".
combined = subset_world_population.merge(final_df, left_on = 'Name', right_on = 'country', how = 'outer')

# True where the population country name and the article country name agree;
# rows that exist on one side only have NaN on the other and compare as False
matched_mask = combined['Name'] == combined['country']

# The no-match rows must be taken from the full outer join, before the matching
# rows are selected; taking them from the already filtered frame always yields
# an empty result
no_match = combined[~matched_mask]

# Keep the matching rows, select the important columns and rename them
merged = combined[matched_mask]
merged = merged[['country', 'page', 'rev_id', 'prediction', 'Population']]
merged = merged.rename(columns = {'country': 'Country', 'page': 'Article Name','rev_id': 'Revision ID', 'prediction':'Prediction'})
merged

Unnamed: 0,Country,Article Name,Revision ID,Prediction,Population
0,Algeria,Ali Fawzi Rebaine,686269631.0,Stub,44357000.0
1,Algeria,Ahmed Attaf,705910185.0,Stub,44357000.0
2,Algeria,Ahmed Djoghlaf,707427823.0,Stub,44357000.0
3,Algeria,Hammi Larouissi,708060571.0,Stub,44357000.0
4,Algeria,Salah Goudjil,708980561.0,Stub,44357000.0
...,...,...,...,...,...
44590,Vanuatu,Tallis Obed Moses,799954279.0,Stub,321000.0
44591,Vanuatu,Esmon Saimon,799954813.0,Start,321000.0
44592,Vanuatu,Baldwin Lonsdale,799955662.0,C,321000.0
44593,Vanuatu,Sela Molisa,800106636.0,C,321000.0


In [244]:
# Rows that come only from the population data have no 'country' value, so fall
# back to the population dataset's 'Name' before that column is dropped;
# otherwise those rows would carry no country label at all
no_match = no_match.assign(country = no_match['country'].fillna(no_match['Name']))
no_match = no_match[['country', 'page', 'rev_id', 'prediction', 'Population']]
no_match = no_match.rename(columns = {'country': 'Country', 'page': 'Article Name','rev_id': 'Revision ID', 'prediction':'Prediction'})


In [245]:
# Write the matched rows and the no-match rows to separate CSV files
merged.to_csv('wp_wpds_politicians_by_country.csv')
no_match.to_csv('wp_wpds_countries-no_match.csv')

# Step 5: Analysis and Step 6: Results


In [246]:
# Top 10 countries by coverage: the 10 highest-ranked countries in terms of the
# number of politician articles as a proportion of country population

# COUNTRY LEVEL: count the unique articles per country, then attach each
# country's population (one row per country once duplicates are dropped)
article_per_country = merged.groupby('Country')['Article Name'].nunique().reset_index()
article_per_country_pop = (
    pd.merge(article_per_country, merged[['Country', 'Population']], on='Country')
    .drop_duplicates()
    .rename(columns={'Article Name': 'Number of Articles'})
)
# Articles divided by population, scaled by 100
article_per_country_pop['articles-per-population'] = (
    article_per_country_pop['Number of Articles'] / article_per_country_pop['Population']
) * 100
article_per_country_pop.head()



Unnamed: 0,Country,Number of Articles,Population,articles-per-population
0,Afghanistan,319,38928000.0,0.000819
319,Albania,456,2838000.0,0.016068
775,Algeria,116,44357000.0,0.000262
891,Andorra,34,82000.0,0.041463
925,Angola,106,32522000.0,0.000326


In [247]:
# Sort by coverage from highest to lowest and keep the first ten countries
top_10_articles_per_population = (
    article_per_country_pop
    .sort_values('articles-per-population', ascending=False)
    .head(10)
)
top_10_articles_per_population


Unnamed: 0,Country,Number of Articles,Population,articles-per-population
41067,Tuvalu,54,10000.0,0.54
26792,Nauru,52,11000.0,0.472727
35132,San Marino,81,34000.0,0.238235
25926,Monaco,40,38000.0,0.105263
22754,Liechtenstein,28,39000.0,0.071795
24246,Marshall Islands,37,57000.0,0.064912
40459,Tonga,63,99000.0,0.063636
16421,Iceland,201,368000.0,0.05462
891,Andorra,34,82000.0,0.041463
11290,Federated States of Micronesia,36,106000.0,0.033962


In [248]:
# Bottom 10 countries by coverage: the 10 lowest-ranked countries in terms of the
# number of politician articles as a proportion of country population

# Sort by coverage from lowest to highest and keep the first ten countries
bottom10_articles_per_population = (
    article_per_country_pop
    .sort_values('articles-per-population')
    .head(10)
)
bottom10_articles_per_population

Unnamed: 0,Country,Number of Articles,Population,articles-per-population
16622,India,968,1400100000.0,6.9e-05
17590,Indonesia,209,271739000.0,7.7e-05
7990,China,1129,1402385000.0,8.1e-05
43861,Uzbekistan,28,34174000.0,8.2e-05
11189,Ethiopia,101,114916000.0,8.8e-05
44380,Zambia,25,18384000.0,0.000136
21643,"Korea, North",36,25779000.0,0.00014
40282,Thailand,112,66534000.0,0.000168
26336,Mozambique,58,31166000.0,0.000186
3871,Bangladesh,317,169809000.0,0.000187


Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [249]:
# Count, per country, the articles whose prediction is either FA or GA
high_quality = (
    merged[merged['Prediction'].isin(['FA', 'GA'])]
    .groupby('Country')['Article Name']
    .count()
    .rename('high_quality_articles_count')
    .reset_index()
)

# Attach the total article count per country (column 'Article Name' of
# article_per_country).
# NOTE: high_quality only contains countries with at least one FA/GA article,
# so after this inner merge countries with no such article are absent from the ranking.
high_quality_articles_proportion = high_quality.merge(article_per_country, on='Country', how='inner')

# Percentage of high-quality articles; both operands come from the merged frame,
# so the rows are guaranteed to line up
high_quality_articles_proportion['percentage_of_high_quality'] = (
    high_quality_articles_proportion['high_quality_articles_count'] * 100
    / high_quality_articles_proportion['Article Name']
)

# Sort rows in descending order of percentage
rank_of_countries_by_high_quality = high_quality_articles_proportion.sort_values(['percentage_of_high_quality'], ascending=[False])

rank_of_countries_by_high_quality.head(10)

Unnamed: 0,Country,high_quality_articles_count,Article Name,percentage_of_high_quality
63,"Korea, North",8,36,22.222222
109,Saudi Arabia,15,117,12.820513
106,Romania,42,343,12.244898
23,Central African Republic,8,66,12.121212
140,Uzbekistan,3,28,10.714286
82,Mauritania,5,48,10.416667
46,Guatemala,7,83,8.433735
33,Dominica,1,12,8.333333
125,Syria,10,128,7.8125
11,Benin,7,91,7.692308


Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality


In [250]:
# The ranking is sorted in descending order of percentage, so its last 10 rows are the lowest-ranked countries
rank_of_countries_by_high_quality.tail(10)


Unnamed: 0,Country,high_quality_articles_count,Article Name,percentage_of_high_quality
87,Morocco,1,206,0.485437
73,Lithuania,1,244,0.409836
27,Colombia,1,285,0.350877
104,Portugal,1,318,0.314465
94,Nigeria,2,676,0.295858
101,Peru,1,350,0.285714
89,Nepal,1,356,0.280899
124,Switzerland,1,402,0.248756
128,Tanzania,1,404,0.247525
10,Belgium,1,519,0.192678


In [251]:
# Derive a region label for every row of the world population dataframe
def get_region():
    """Return one region label per row of the global ``world_population`` frame.

    Rows are read in order. A row whose Type is 'Country', or whose Type is
    'Sub-Region' with a Name that is not all upper case, receives the most
    recently seen region name. Every other row sets the current region to its
    own Name and receives that name. The initial region is a single space.
    """
    current_region = ' '
    labels = []

    for row_type, row_name in zip(world_population['Type'], world_population['Name']):
        is_country = row_type == 'Country'
        is_mixed_case_subregion = row_type == 'Sub-Region' and not row_name.isupper()

        # Any other row starts a new region named after itself
        if not (is_country or is_mixed_case_subregion):
            current_region = row_name
        labels.append(current_region)

    return labels

# Add the region column to the dataframe, then keep only Region, Name and Population.
# NOTE(review): this rebinds world_population to a frame without the 'Type' column
# that get_region() reads, so re-running this cell without reloading the data
# looks like it would fail — confirm.
world_population['Region'] = get_region()
world_population = world_population[['Region', 'Name','Population']]
world_population

Unnamed: 0,Region,Name,Population
0,WORLD,WORLD,7772850000
1,AFRICA,AFRICA,1337918000
2,NORTHERN AFRICA,NORTHERN AFRICA,244344000
3,NORTHERN AFRICA,Algeria,44357000
4,NORTHERN AFRICA,Egypt,100803000
...,...,...,...
229,OCEANIA,Samoa,200000
230,OCEANIA,Solomon Islands,715000
231,OCEANIA,Tonga,99000
232,OCEANIA,Tuvalu,10000


In [252]:
# Build a lookup from country name to region out of the population table and
# use it to add a Region column to the final dataframe
mapping = dict(zip(world_population['Name'], world_population['Region']))
final_df['Region'] = final_df['country'].map(mapping)
final_df

Unnamed: 0,rev_id,prediction,page,country,Region
0,355319463,Stub,Bir I of Kanem,Chad,MIDDLE AFRICA
1,393276188,Stub,Information Minister of the Palestinian Nation...,Palestinian Territory,WESTERN ASIA
2,393822005,Stub,Yos Por,Cambodia,SOUTHEAST ASIA
3,395521877,Stub,Julius Gregr,Czech Republic,
4,395526568,Stub,Edvard Gregr,Czech Republic,
...,...,...,...,...,...
46420,807481636,C,Hal Bidlack,United States,NORTHERN AMERICA
46421,807482007,GA,Yahya Jammeh,Gambia,WESTERN AFRICA
46422,807483006,C,Lucius Fairchild,United States,NORTHERN AMERICA
46423,807483153,GA,Fahd of Saudi Arabia,Saudi Arabia,WESTERN ASIA


In [253]:
# Combine final_df (politician articles with country) with the world population
# dataset. The region is taken from world_population; any Region column already
# on final_df is dropped first, so the result does not depend on the 'Region_x'
# suffix that pandas adds only when both frames carry a Region column.
merged = world_population.merge(
    final_df.drop(columns = ['Region'], errors = 'ignore'),
    left_on = 'Name', right_on = 'country', how = 'inner'
)
# select the important columns
merged = merged[['Region', 'country', 'page', 'rev_id', 'prediction', 'Population']]
# rename columns
merged = merged.rename(columns = {'country': 'Country', 'page': 'Article Name','rev_id': 'Revision ID', 'prediction':'Prediction'})
merged

Unnamed: 0,Region,Country,Article Name,Revision ID,Prediction,Population
0,NORTHERN AFRICA,Algeria,Ali Fawzi Rebaine,686269631,Stub,44357000
1,NORTHERN AFRICA,Algeria,Ahmed Attaf,705910185,Stub,44357000
2,NORTHERN AFRICA,Algeria,Ahmed Djoghlaf,707427823,Stub,44357000
3,NORTHERN AFRICA,Algeria,Hammi Larouissi,708060571,Stub,44357000
4,NORTHERN AFRICA,Algeria,Salah Goudjil,708980561,Stub,44357000
...,...,...,...,...,...,...
44563,OCEANIA,Vanuatu,Tallis Obed Moses,799954279,Stub,321000
44564,OCEANIA,Vanuatu,Esmon Saimon,799954813,Start,321000
44565,OCEANIA,Vanuatu,Baldwin Lonsdale,799955662,C,321000
44566,OCEANIA,Vanuatu,Sela Molisa,800106636,C,321000


Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population


In [254]:

# Adding the articles-per-population metric at the REGIONAL level

# Number of unique articles per region
article_per_region = merged.groupby('Region')['Article Name'].nunique().to_frame().reset_index()

# Regional population: take one population figure per country and sum them.
# De-duplicating on (Region, Country) instead of on the population value keeps
# two countries of the same region that happen to share a population value.
region_population = (
    merged[['Region', 'Country', 'Population']]
    .drop_duplicates(subset = ['Region', 'Country'])
    .groupby('Region')['Population']
    .sum()
    .reset_index()
)

article_per_region_pop = article_per_region.merge(region_population, on = 'Region')
region_num_articles = article_per_region_pop.rename(columns = {'Article Name': 'Number of Articles'})
# Articles divided by regional population, scaled by 100
region_num_articles['articles-per-population'] = (region_num_articles['Number of Articles']*100/region_num_articles['Population'])
region_num_articles.head()


Unnamed: 0,Region,Number of Articles,Population,articles-per-population
0,CARIBBEAN,695,39056000,0.001779
1,CENTRAL AMERICA,1543,162267000,0.000951
2,CENTRAL ASIA,245,74960000,0.000327
3,EAST ASIA,2473,1632883000,0.000151
4,EASTERN AFRICA,2502,443825000,0.000564


Geographic regions by relative quality: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [255]:
# Rank the regions by articles-per-population, from highest to lowest
ranked_regional = region_num_articles.sort_values('articles-per-population', ascending=False)
ranked_regional.head(10)

Unnamed: 0,Region,Number of Articles,Population,articles-per-population
10,OCEANIA,3126,42031000,0.007437
9,NORTHERN EUROPE,3763,105680000,0.003561
15,SOUTHERN EUROPE,3710,151136000,0.002455
18,WESTERN EUROPE,4560,195479000,0.002333
0,CARIBBEAN,695,39056000,0.001779
5,EASTERN EUROPE,3732,281186000,0.001327
14,SOUTHERN AFRICA,634,66628000,0.000952
1,CENTRAL AMERICA,1543,162267000,0.000951
17,WESTERN ASIA,2563,272499000,0.000941
6,MIDDLE AFRICA,665,90189000,0.000737


In [256]:
# Count, per region, the articles whose prediction is either FA or GA
high_quality_regional = (
    merged[merged['Prediction'].isin(['FA', 'GA'])]
    .groupby('Region')['Article Name']
    .count()
    .rename('high_quality_articles_count')
    .reset_index()
)

# Attach the total article count per region (column 'Article Name' of article_per_region)
high_quality_regional_articles_proportion = high_quality_regional.merge(article_per_region, on='Region', how='inner')

# Percentage of high-quality articles; both operands come from the merged frame,
# so the rows are guaranteed to line up
high_quality_regional_articles_proportion['percentage_of_high_quality'] = (
    high_quality_regional_articles_proportion['high_quality_articles_count'] * 100
    / high_quality_regional_articles_proportion['Article Name']
)

# Sort rows in descending order of percentage
rank_of_regions_by_high_quality = high_quality_regional_articles_proportion.sort_values(['percentage_of_high_quality'], ascending=[False])

rank_of_regions_by_high_quality.head(10)

Unnamed: 0,Region,high_quality_articles_count,Article Name,percentage_of_high_quality
8,NORTHERN AMERICA,104,1901,5.470805
13,SOUTHEAST ASIA,73,2020,3.613861
17,WESTERN ASIA,89,2563,3.472493
5,EASTERN EUROPE,118,3732,3.161844
3,EAST ASIA,76,2473,3.07319
2,CENTRAL ASIA,7,245,2.857143
9,NORTHERN EUROPE,102,3763,2.710603
6,MIDDLE AFRICA,16,665,2.406015
7,NORTHERN AFRICA,19,899,2.113459
10,OCEANIA,63,3126,2.015355


# Writeup: Reflections and Implications

Reflections:

1) China and India are among the bottom 10 countries by coverage, measured as politician articles as a proportion of country population. This exposes a bias that can arise in data science: these two countries have some of the world's largest populations, so no matter how many politician articles they have, the proportion will still be low because of their large population sizes.

2) Similar to point 1, the countries with the top coverage are much smaller countries with smaller populations. Many of them appear in the top 10 not because they have a large number of articles but because of their small population size.

3) Another interesting finding is the presence of North Korea as the top country by relative quality: approximately 22.2% of its articles are high quality, despite the fact that it is a non-English-speaking country. This raises concerns about the way that high-quality articles are determined or flagged as "FA" and "GA". When I looked at the criteria/grading scheme used to assess the quality of an article, I realized that the emphasis is on the style, the length, and whether it has media or not. It doesn't seem like the actual content of the article is how they determine the quality.  
4) This project taught me the importance of questioning the source of the data and, specifically, how the data is labelled. I also learned that projects with many data cleaning and aggregation steps require a significant amount of documentation and are generally more difficult to reproduce.

5) This experiment is biased because the only data source is English Wikipedia. It makes sense for politicians from non-English-speaking countries to have much higher quality Wikipedia articles in their native language, and vice versa. The top 10 countries by articles per population are all non-English-speaking countries. This goes to show that the choice of data isn't necessarily the best.

6) One of the main limitations of this experiment is the use of population and number of articles together to create a single metric. The range of the number of articles is much smaller than the range of country populations. This inflates the articles-per-population metric, so it is not an accurate measure of how the coverage of politicians on Wikipedia and the quality of articles about politicians vary between countries. One way to address this is to come up with a better metric, which could involve normalizing the values to get a more accurate comparison.

