# A2: Bias in Wikipedia Data

The goal of this project is to explore the concept of 'bias' through data on Wikipedia articles about politicians from various countries.

In [1]:
## importing required libraries
import requests
import json
import numpy as np
import pandas as pd

### The wikipedia dataset is found at https://figshare.com/articles/Untitled_Item/5513449

#### This dataset contains page, revision_id and the country names

In [2]:
# Load the Wikipedia page data from the local CSV file into a DataFrame.
page_data_df = pd.read_csv(filepath_or_buffer='page_data.csv')

In [3]:
# Accumulators for the article-quality predictions; each request loop below
# appends to one of these two independent lists.
prediction, predictionf = [], []

In [None]:
## Call the ORES API once per revision id and append each article's predicted
## quality class to the `prediction` list created above.
## The loop deliberately stops at the first revision that has no score, so the
## rows collected so far stay aligned with the start of `page_data_df`.

endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/{revid}/{model}'
# Caller identification sent with every request; replace the placeholders
# with real contact details before running.
headers = {'User-Agent' : 'https://github.com/your_github_username', 'From' : 'your_uw_email@uw.edu'}


for i in range(0, page_data_df.shape[0]):
    rev_id = page_data_df['rev_id'][i]
    params = {'project' : 'enwiki',
              'model' : 'wp10',
              'revid' : rev_id
              }
    # Pass the identification headers (they were previously defined but never sent).
    api_call = requests.get(endpoint.format(**params), headers=headers)
    response = api_call.json()
    # The per-revision result is keyed by the revision id as a string.
    model_result = response['enwiki']['scores'][str(rev_id)]['wp10']
    if 'score' not in model_result:
        # Same exception type as a plain failed lookup, but it says which row
        # to resume from and what the API returned instead of a score.
        raise KeyError(
            "ORES response for rev_id {} (row {}) has no 'score' entry: {!r}".format(
                rev_id, i, model_result))
    prediction.append(model_result['score']['prediction'])
    
    
    

In [20]:
# Resume the ORES scoring at row 46864 and collect the results in `predictionf`.
# Relies on `endpoint` and `headers` defined in the previous request cell.
# NOTE(review): 46864 is hard-coded; presumably the first loop stopped shortly
# before this row -- confirm it matches where that loop ended plus the rows
# that are filled with placeholders later.
for i in range(46864, page_data_df.shape[0]):
    rev_id = page_data_df['rev_id'][i]
    params = {'project' : 'enwiki',
              'model' : 'wp10',
              'revid' : rev_id
              }
    # Pass the identification headers (they were previously defined but never sent).
    api_call = requests.get(endpoint.format(**params), headers=headers)
    response = api_call.json()
    # The per-revision result is keyed by the revision id as a string.
    model_result = response['enwiki']['scores'][str(rev_id)]['wp10']
    if 'score' not in model_result:
        # Same exception type as a plain failed lookup, but it says which row
        # failed and what the API returned instead of a score.
        raise KeyError(
            "ORES response for rev_id {} (row {}) has no 'score' entry: {!r}".format(
                rev_id, i, model_result))
    predictionf.append(model_result['score']['prediction'])

In [13]:
# Persist the first batch of predictions as a single-column CSV (no index column).
df = pd.DataFrame({"prediction": prediction})
df.to_csv(path_or_buf="./prediction.csv", index=False, sep=',')

In [24]:
# Persist the second (resumed) batch of predictions as a single-column CSV.
df = pd.DataFrame({"prediction2": predictionf})
df.to_csv(path_or_buf="./prediction2.csv", index=False, sep=',')

In [3]:
# Reload both saved prediction batches from disk, so the slow API loops do not
# have to be re-run.
pred1, pred2 = (
    pd.read_csv(csv_name) for csv_name in ('prediction.csv', 'prediction2.csv')
)

In [4]:
# Stack the two prediction batches into one column, with two 'NA' placeholder
# rows in between them.
scores = np.vstack((pred1, 'NA', 'NA', pred2))

In [5]:
# Attach the score column to the right-hand side of the page data.
page_data_scores = np.hstack((page_data_df, scores))

In [6]:
# Wrap the stacked array back into a DataFrame so columns can be named.
page_data_scores = pd.DataFrame(data=page_data_scores)

In [7]:
# Name the four columns; 'Location' is the key used to merge with the
# population table later on.
page_data_scores.columns = [
    'article_name',
    'Location',
    'revision_id',
    'article_quality',
]

### Population Data can be found at http://www.prb.org/DataFinder/Topic/Rankings.aspx?ind=14

In [26]:
# Read the population table (column names are on the second line of the file)
# and strip the thousands separators from the 'Data' column. The values remain
# strings here; they are converted to numbers where they are used.
population = (
    pd.read_csv('Population Mid-2015.csv', header=1)
    .assign(Data=lambda frame: frame['Data'].str.replace(',', ''))
)

## Combining datasets

Wikipedia and population datasets are merged and saved to a final dataframe, which is saved to a csv file.

In [9]:
# Inner join on 'Location': keeps only rows whose location appears in both tables.
merged_df = page_data_scores.merge(population, how='inner', on='Location')

In [10]:
# Keep only the columns needed for the final dataset, in output order.
final_df = merged_df.loc[
    :, ['Location', 'article_name', 'revision_id', 'article_quality', 'Data']
]

In [11]:
# Rename positionally: 'Location' becomes 'country' and 'Data' becomes 'population'.
final_df.columns = [
    'country',
    'article_name',
    'revision_id',
    'article_quality',
    'population',
]

In [12]:
# Save the combined dataset to disk as a comma-separated file without the index.
final_df.to_csv(path_or_buf="./final_df.csv", index=False, sep=',')

## Analysis

To begin the analysis, the total number of politician articles per country is calculated and then divided by that country's population, giving the number of politician articles as a percentage of the country's population.

In [14]:
# Number of articles per country, returned as a flat two-column frame
# ('country', 'article_name').
country_count = (
    final_df
    .groupby("country")[["article_name"]]
    .count()
    .astype(int)
    .reset_index()
)

In [15]:
# Rebuild the frame so the per-country article tally is stored under 'count'.
country_count = pd.DataFrame(
    data={
        'country': country_count['country'],
        'count': country_count['article_name'],
    }
)

In [16]:
# Attach each country's population, then express the article count as a
# percentage of that population ('Data' is text, so it is converted first).
prop = pd.merge(
    country_count,
    population,
    how='inner',
    left_on='country',
    right_on='Location',
)
prop['percentage'] = (prop['count'] * 100).div(pd.to_numeric(prop['Data']))

In [17]:
# Keep only the columns shown in the ranking tables.
prop = prop.loc[:, ['country', 'count', 'Data', 'percentage']]

### 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [18]:
# Largest percentages first; show the top ten rows.
prop.sort_values(by='percentage', ascending=False).head(10)

Unnamed: 0,country,count,Data,percentage
120,Nauru,53,10860,0.488029
173,Tuvalu,55,11800,0.466102
141,San Marino,82,33000,0.248485
113,Monaco,40,38088,0.10502
97,Liechtenstein,29,37570,0.077189
107,Marshall Islands,37,55000,0.067273
72,Iceland,206,330828,0.062268
168,Tonga,63,103300,0.060987
3,Andorra,34,78000,0.04359
54,Federated States of Micronesia,38,103000,0.036893


### 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [19]:
# Smallest percentages first; show the bottom ten rows.
prop.sort_values(by='percentage', ascending=True).head(10)

Unnamed: 0,country,count,Data,percentage
73,India,990,1314097616,7.5e-05
34,China,1138,1371920000,8.3e-05
74,Indonesia,215,255741973,8.4e-05
180,Uzbekistan,29,31290791,9.3e-05
53,Ethiopia,105,98148000,0.000107
86,"Korea, North",39,24983000,0.000156
185,Zambia,26,15473900,0.000168
166,Thailand,112,65121250,0.000172
38,"Congo, Dem. Rep. of",142,73340200,0.000194
13,Bangladesh,324,160411000,0.000202


To get the proportion of high-quality articles in each country, only the articles whose predicted quality class is GA (good article) or FA (featured article) are counted.

In [20]:
# Select the rows predicted as 'GA' and as 'FA' separately, then stack them
# (GA rows first, FA rows after).
GA = final_df[final_df['article_quality'].eq('GA')]
FA = final_df[final_df['article_quality'].eq('FA')]
GA_FA_final = pd.concat(objs=(GA, FA))

In [21]:
# Number of GA/FA articles per country, as a flat two-column frame
# ('country', 'article_name').
GA_FA_count = (
    GA_FA_final
    .groupby("country")[["article_name"]]
    .count()
    .astype(int)
    .reset_index()
)

In [22]:
# Pair each country's GA/FA article count ('article_name') with its total
# article count ('count').
# NOTE: this is an inner join against GA_FA_count, so countries with no GA/FA
# articles at all are absent from the result and from the rankings built on it.
GA_FA_prop = GA_FA_count.merge(country_count, on='country', how='inner')
# Take the numerator from the merged frame itself, so both operands are
# guaranteed to be on the same rows rather than relying on index alignment
# with a different frame.
GA_FA_prop['percentage'] = GA_FA_prop['article_name'] * 100 / GA_FA_prop['count']

### 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [23]:
# Highest share of GA/FA articles first; show the top ten rows.
GA_FA_prop.sort_values(by='percentage', ascending=False).head(10)

Unnamed: 0,country,article_name,count,percentage
67,"Korea, North",9,39,23.076923
111,Romania,45,348,12.931034
114,Saudi Arabia,15,119,12.605042
22,Central African Republic,8,68,11.764706
110,Qatar,5,51,9.803922
52,Guinea-Bissau,2,21,9.52381
145,Vietnam,18,191,9.424084
12,Bhutan,3,33,9.090909
60,Ireland,31,381,8.136483
140,United States,86,1098,7.832423


### 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [24]:
# Lowest share of GA/FA articles first; show the bottom ten rows.
GA_FA_prop.sort_values(by='percentage', ascending=True).head(10)

Unnamed: 0,country,article_name,count,percentage
42,Finland,1,572,0.174825
130,Tanzania,1,408,0.245098
106,Peru,1,354,0.282486
33,Czech Republic,1,254,0.393701
77,Lithuania,1,248,0.403226
89,Moldova,2,426,0.469484
41,Fiji,1,199,0.502513
136,Uganda,1,188,0.531915
78,Luxembourg,1,180,0.555556
99,Nigeria,4,684,0.584795


## Challenges

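As a cross-check, I reproduced the population-proportion tables above using SQL.
The SQL query below gives the same country rankings as above; note that `prop` here is a raw ratio (articles per person) rather than a percentage, so its values are 100 times smaller.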

In [54]:
from sqlalchemy import create_engine
## Create a SQLite database engine and load final_df into the table 'final'.
disk_engine = create_engine('sqlite:///my_lite_store.db')
# 'replace' rather than 'append': re-running this cell must not add a second
# copy of every row, which would inflate the per-country counts queried below.
final_df.to_sql('final', disk_engine, if_exists='replace')

In [83]:
# Count the articles per country inside SQLite; the country's population is
# selected alongside each group.
proportion_df = pd.read_sql_query(
    sql='SELECT country, count(*) as number, population FROM final GROUP BY country ',
    con=disk_engine,
)

In [84]:
# Articles per person. The population column started out as comma-stripped
# text, so it may come back from SQLite as strings; convert both operands to
# numbers before dividing, as the pandas version of this calculation does.
proportion_df['prop'] = (
    pd.to_numeric(proportion_df['number']) / pd.to_numeric(proportion_df['population'])
)

### 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [86]:
# Largest articles-per-person ratio first; show the top ten rows.
proportion_df.sort_values(by='prop', ascending=False).head(10)

Unnamed: 0,country,number,population,prop
120,Nauru,53,10860,0.00488
173,Tuvalu,55,11800,0.004661
141,San Marino,82,33000,0.002485
113,Monaco,40,38088,0.00105
97,Liechtenstein,29,37570,0.000772
107,Marshall Islands,37,55000,0.000673
72,Iceland,206,330828,0.000623
168,Tonga,63,103300,0.00061
3,Andorra,34,78000,0.000436
54,Federated States of Micronesia,38,103000,0.000369


### 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [85]:
# Smallest articles-per-person ratio first; show the bottom ten rows.
proportion_df.sort_values(by='prop', ascending=True).head(10)

Unnamed: 0,country,number,population,prop
73,India,990,1314097616,7.533687e-07
34,China,1138,1371920000,8.294944e-07
74,Indonesia,215,255741973,8.406911e-07
180,Uzbekistan,29,31290791,9.267902e-07
53,Ethiopia,105,98148000,1.069813e-06
86,"Korea, North",39,24983000,1.561062e-06
185,Zambia,26,15473900,1.680249e-06
166,Thailand,112,65121250,1.719869e-06
38,"Congo, Dem. Rep. of",142,73340200,1.936182e-06
13,Bangladesh,324,160411000,2.019812e-06
