In [1]:
import numpy as np
import pandas as pd
import json
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import oresapi
from copy import deepcopy
import warnings
warnings.filterwarnings('ignore')

## Step 1: Getting the data

Wikipedia: https://figshare.com/articles/Untitled_Item/5513449
Population: https://canvas.uw.edu/courses/1319253/files/folder/A2%3A%20bias%20in%20data

In [2]:
country_data = pd.read_csv('C:/Users/Kamala/OneDrive/Desktop/4thquarter/Data_Ethics/country/data/page_data.csv')
politician_data = pd.read_csv('C:/Users/Kamala/OneDrive/Desktop/4thquarter/Data_Ethics/WPDS_2018_data.csv')

In [3]:
country_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [4]:
country_data = country_data[~country_data.rev_id.isna()]

In [5]:
politician_data.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


In [6]:
politician_data = politician_data.rename({'Geography':'country'},axis=1)

## Step 2: Cleaning the data
Cleaning the data
page_data.csv and WPDS_2018_data.csv contain some rows that need to be filtered out and/or ignore when combined. In the case of page_data.csv, the dataset contains some page names that start with the string "Template:". These pages are not Wikipedia articles, and should not be included in analysis.

Similarly, WPDS_2018_data contains some rows that provide cumulative regional population counts, rather than country-level counts. These rows are distinguished by having ALL CAPS values in the 'geography' field (e.g. AFRICA, OCEANIA). These rows won't match the country values in page_data, but retain them (either in the original file, or a separate file) so that it can be used to report coverage and quality by region in the analysis section.

In [7]:
allcaps_rows = politician_data.apply(lambda row: any([data.isupper() for data in row]), axis = 1)
cuml_region_count = politician_data[allcaps_rows]
cuml_region_count


Unnamed: 0,country,Population mid-2018 (millions)
0,AFRICA,1284
56,NORTHERN AMERICA,365
59,LATIN AMERICA AND THE CARIBBEAN,649
95,ASIA,4536
144,EUROPE,746
189,OCEANIA,41


In [8]:
politician_data = politician_data[~allcaps_rows]
country_data = country_data[~country_data.page.str.contains("Template")]

In [9]:
ores_session = oresapi.Session("https://ores.wikimedia.org", "Class project <kamalavj@uw.edu>")
results = ores_session.score("enwiki", ["articlequality"], list(country_data.rev_id))

In [10]:
scores = []
missing_id =[]
i = 0
for score in results:
    try:
        scores.append(score["articlequality"]["score"]["prediction"])
    except:
        missing_id.append(i)
    i += 1


In [11]:

country_data = country_data.reset_index()
country_data = country_data.drop(missing_id)
country_data['scores'] = scores

## Merge the dataframes on 'country'

In [13]:
combined_df = country_data.merge(politician_data, on = 'country', how = 'inner')


combined_df.rename(index=str, columns={"page": "article_name", "rev_id": "revision_id","ratings":"article_quality", "Population mid-2018 (millions)":"population"}, inplace=True)

combined_df['population'] = combined_df['population'].apply(lambda x:x.replace(',',''))
combined_df['population'] = combined_df['population'].astype('float')
combined_df['population'] = combined_df['population'].apply(lambda x:x*1000000)
combined_df['population'] = combined_df['population'].astype(int)

combined_df = combined_df.reset_index()
combined_df.drop(['index','level_0'],axis=1)


Unnamed: 0,article_name,country,revision_id,scores,population
0,Bir I of Kanem,Chad,355319463,Stub,15400000
1,Abdullah II of Kanem,Chad,498683267,Stub,15400000
2,Salmama II of Kanem,Chad,565745353,Stub,15400000
3,Kuri I of Kanem,Chad,565745365,Stub,15400000
4,Mohammed I of Kanem,Chad,565745375,Stub,15400000
5,Kuri II of Kanem,Chad,669719757,Stub,15400000
6,Bir II of Kanem,Chad,670893206,Stub,15400000
7,Mahamat Hissene,Chad,693055898,Stub,15400000
8,Othman I,Chad,705432607,Stub,15400000
9,Alphonse Kotiga,Chad,707593108,Stub,15400000


In [14]:
combined_df.to_csv('wp_wpds_politicians_by_country.csv')

## Step 3: Analysis

## 1. Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population


In [15]:

combined_df['revision_id'] = combined_df['revision_id'].astype(int)
combined_df['population'] = combined_df['population'].astype(float)
art_per_country = combined_df.groupby(['country','population'])['revision_id'].count().to_frame()
art_per_country = art_per_country.reset_index()
art_per_country['proportion'] = (art_per_country['revision_id']/art_per_country['population']) * 100

art_per_country = art_per_country.rename(columns = {'revision_id':'Total_num_articles'})
#10 highest ranked countries by coverage
art_per_country.sort_values(by='proportion', ascending = False).head(10)

Unnamed: 0,country,population,Total_num_articles,proportion
166,Tuvalu,10000.0,54,0.54
115,Nauru,10000.0,52,0.52
135,San Marino,30000.0,81,0.27
108,Monaco,40000.0,40,0.1
93,Liechtenstein,40000.0,28,0.07
161,Tonga,100000.0,63,0.063
103,Marshall Islands,60000.0,37,0.061667
68,Iceland,400000.0,201,0.05025
3,Andorra,80000.0,34,0.0425
61,Grenada,100000.0,36,0.036


## 2. Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population


In [16]:

#10 lowest ranked countries per coverage
art_per_country.sort_values(by='proportion', ascending=True).head(10)

Unnamed: 0,country,population,Total_num_articles,proportion
69,India,1371300000.0,980,7.1e-05
70,Indonesia,265200000.0,210,7.9e-05
34,China,1393800000.0,1130,8.1e-05
173,Uzbekistan,32900000.0,28,8.5e-05
51,Ethiopia,107500000.0,101,9.4e-05
82,"Korea, North",25600000.0,36,0.000141
178,Zambia,17700000.0,25,0.000141
159,Thailand,66200000.0,112,0.000169
112,Mozambique,30500000.0,58,0.00019
13,Bangladesh,166400000.0,319,0.000192


## 3. Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality


In [17]:
top_quality_art = deepcopy(combined_df)
top_quality_art =top_quality_art[(top_quality_art['scores'] == 'FA') | (top_quality_art['scores'] == 'GA')]
top_quality_art = top_quality_art.groupby(['country','population']).count()[['revision_id']]
top_quality_art.reset_index(inplace=True)
total_num_article = deepcopy(art_per_country)
total_num_article = total_num_article.rename(columns={'revision_id':'total_articles'})
top_quality_art = top_quality_art.rename(columns = {'revision_id':'top_quality_articles'})
high_quality = total_num_article.merge(top_quality_art, on= 'country', how = 'left')
high_quality = high_quality.fillna(0)
high_quality['high_quality_articles'] = high_quality['top_quality_articles'].astype(int)
high_quality['proportion'] = (high_quality['top_quality_articles']/high_quality['Total_num_articles'])*100
high_quality.sort_values(by='proportion', ascending = False).head(10)

Unnamed: 0,country,population_x,Total_num_articles,proportion,population_y,top_quality_articles,high_quality_articles
82,"Korea, North",25600000.0,36,19.444444,25600000.0,7.0,7
137,Saudi Arabia,33400000.0,118,12.711864,33400000.0,15.0,15
104,Mauritania,4500000.0,48,12.5,4500000.0,6.0,6
31,Central African Republic,4700000.0,66,12.121212,4700000.0,8.0,8
132,Romania,19500000.0,343,11.370262,19500000.0,39.0,39
166,Tuvalu,10000.0,54,9.259259,10000.0,5.0,5
19,Bhutan,800000.0,33,9.090909,800000.0,3.0,3
44,Dominica,70000.0,12,8.333333,70000.0,1.0,1
155,Syria,18300000.0,128,7.8125,18300000.0,10.0,10
18,Benin,11500000.0,91,7.692308,11500000.0,7.0,7


## 4. Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality


In [18]:

#Finding the countries with the lowest proportion of high ranked articles
high_quality.sort_values(by='proportion', ascending = True).head(10)

Unnamed: 0,country,population_x,Total_num_articles,proportion,population_y,top_quality_articles,high_quality_articles
143,Slovakia,5400000.0,116,0.0,0.0,0.0,0
114,Namibia,2500000.0,162,0.0,0.0,0.0,0
30,Cape Verde,600000.0,37,0.0,0.0,0.0,0
112,Mozambique,30500000.0,58,0.0,0.0,0.0,0
38,Costa Rica,5000000.0,147,0.0,0.0,0.0,0
108,Monaco,40000.0,40,0.0,0.0,0.0,0
43,Djibouti,1000000.0,37,0.0,0.0,0.0,0
107,Moldova,3500000.0,423,0.0,0.0,0.0,0
167,Uganda,44100000.0,185,0.0,0.0,0.0,0
49,Eritrea,6000000.0,16,0.0,0.0,0.0,0


## From the data, we have:

the indices of countries belonging to each region
we split the data accordingly

## Add a column region to assign the region for every country in dataframe

In [19]:
africa = politician_data.iloc[1:55]
africa['region'] = 'AFRICA'
north_america = politician_data[57:58]
north_america['region'] = 'NORTHERN AMERICA'
asia = politician_data[96:143]
asia['region'] = 'ASIA'
latin_america = politician_data[60:94]
latin_america['region'] = 'LATIN AMERICA AND THE CARIBBEAN'
europe = politician_data[145:188]
europe['region'] = 'EUROPE'
oceania = politician_data[190:]
oceania['region'] = 'OCEANIA'

df1 = pd.merge(combined_df,africa,on='country',how = 'inner')
df2 = pd.merge(combined_df,asia,on='country',how = 'inner')
df3 = pd.merge(combined_df,latin_america,on='country',how = 'inner')
df4 = pd.merge(combined_df,north_america,on='country',how = 'inner')
df5 = pd.merge(combined_df,europe,on='country',how = 'inner')
df6 = pd.merge(combined_df,oceania,on='country',how = 'inner')
df=pd.concat([df1,df2,df3,df4,df5,df6],axis=0)
df.drop(['index','level_0'],axis=1)

Unnamed: 0,article_name,country,revision_id,scores,population,Population mid-2018 (millions),region
0,Bir I of Kanem,Chad,355319463,Stub,15400000.0,15.4,AFRICA
1,Abdullah II of Kanem,Chad,498683267,Stub,15400000.0,15.4,AFRICA
2,Salmama II of Kanem,Chad,565745353,Stub,15400000.0,15.4,AFRICA
3,Kuri I of Kanem,Chad,565745365,Stub,15400000.0,15.4,AFRICA
4,Mohammed I of Kanem,Chad,565745375,Stub,15400000.0,15.4,AFRICA
5,Kuri II of Kanem,Chad,669719757,Stub,15400000.0,15.4,AFRICA
6,Bir II of Kanem,Chad,670893206,Stub,15400000.0,15.4,AFRICA
7,Mahamat Hissene,Chad,693055898,Stub,15400000.0,15.4,AFRICA
8,Othman I,Chad,705432607,Stub,15400000.0,15.4,AFRICA
9,Alphonse Kotiga,Chad,707593108,Stub,15400000.0,15.4,AFRICA


## 5. Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population


In [23]:
count_df = df.groupby('region').count()[['revision_id']]

count_df=pd.merge(count_df,cuml_region_count,left_on='region',right_on='country',how='inner')
count_df['Population mid-2018 (millions)'] = count_df['Population mid-2018 (millions)'].apply(lambda x:x.replace(',',''))
count_df['Population mid-2018 (millions)'] = count_df['Population mid-2018 (millions)'].astype('float')
count_df['Population mid-2018 (millions)'] = count_df['Population mid-2018 (millions)'].apply(lambda x:x*1000000)
count_df['Population mid-2018 (millions)'] = count_df['Population mid-2018 (millions)'].astype(float)
count_df['revision_id'] = count_df['revision_id'].astype(float)
count_df = count_df.rename(columns={'revision_id':'total_num_articles'})
count_df['proportion'] = count_df['total_num_articles']/count_df['Population mid-2018 (millions)']*100
count_df.sort_values(by='proportion', ascending = False).head(10)

Unnamed: 0,total_num_articles,country,Population mid-2018 (millions),proportion
5,1304.0,OCEANIA,41000000.0,0.00318
2,16076.0,EUROPE,746000000.0,0.002155
3,5380.0,LATIN AMERICA AND THE CARIBBEAN,649000000.0,0.000829
0,6735.0,AFRICA,1284000000.0,0.000525
1,12023.0,ASIA,4536000000.0,0.000265
4,16.0,NORTHERN AMERICA,365000000.0,4e-06


## 6. Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [25]:
df_cp = deepcopy(df)
df_cp =df_cp[(df_cp['scores'] == 'FA') | (df_cp['scores'] == 'GA')]
df_cp = df_cp.groupby(['region']).count()[['revision_id']]
df_cp.reset_index(inplace=True)
high_quality = count_df.merge(df_cp, right_on='region',left_on= 'country', how = 'inner')
high_quality = high_quality.rename(columns={'revision_id':'top_quality_articles'})
high_quality = high_quality.fillna(0)
high_quality['top_quality_articles'] = high_quality['top_quality_articles'].astype(int)
high_quality['proportion'] = (high_quality['top_quality_articles']/high_quality['total_num_articles'])*100
high_quality.sort_values(by='proportion', ascending = False).head(10)[['region','proportion']]

Unnamed: 0,region,proportion
1,ASIA,2.536804
2,EUROPE,2.090072
4,OCEANIA,1.993865
0,AFRICA,1.826281
3,LATIN AMERICA AND THE CARIBBEAN,1.394052


## Conclusion

## What biases did you expect to find in the data (before you started working with it), and why?
* I expected countries with higher population to have more politician articles  as one would expect more articles to be written, however this turned out to be untrue. I also expected democratic countries to have more number of highly rated articles because freedom of speech is a fundamental right in such a country and one would expect to see many articles.
## What (potential) sources of bias did you discover in the course of your data processing and analysis?
* The ORES model only considers english wikipedia. This analysis is therefore not representative of all the wiki-pages about politicians. Inclusion of non-english wikipedia could significantly alter the results in my opinion.
## Can you think of a realistic data science research situation where using these data (to train a model, perform a hypothesis-driven research, or make business decisions) might create biased or misleading results, due to the inherent gaps and limitations of the data?
* These results are not holistic insights due to the initial subsetting of data.
* The results are not normalized to the country population
* A scenario where this data is used to see the correlation in number of articles written about politician, country-wise and the educated population of the country, could be biased and misleading. 

## Main Takeaway
I often find myself complaining about the lack of good documentation for various research-implementations/softwares. I have realized that it is absolutely difficult to document your flow of thoughts and the logic into plain simple words to the reader. I have started to judge documenation very differently and appreciate the details present in them as well. 