# A2: Bias in Data

Frank Chen

This code is made available for reuse under a [CC0 license](https://creativecommons.org/share-your-work/public-domain/cc0/).

## Cleaning the Data

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import copy as cp

In [2]:
# Cleaning page_data.csv
page_data = pd.read_csv('page_data.csv')

In [3]:
# Removing rows with 'Template'
page_data = page_data[~page_data.page.str.contains("Template")]

In [4]:
# Save file
page_data.to_csv('page_data_clean.csv', index=False)

In [5]:
# Cleaning WPDS_2018_data.csv
wpds_data = pd.read_csv('WPDS_2018_data.csv')
wpds_data.size

414

In [6]:
# Remove regions
wpds_data = wpds_data[~wpds_data.Geography.str.isupper()]

In [7]:
# Save file
wpds_data.to_csv('WPDS_2018_data_clean.csv', index=False)

In [8]:
# Extract Regional Data
wpds_data_tmp = pd.read_csv('WPDS_2018_data.csv')
wpds_data_tmp = wpds_data_tmp[wpds_data_tmp.Geography.str.isupper()]
wpds_data_tmp.to_csv('WPDS_2018_region_data.csv', index=False)

## Getting article quality predictions

In [9]:
headers = {'User-Agent' : 'https://github.com/kfrankc', 'From' : 'kfrankc@uw.edu'}

def get_ores_data(revision_ids, headers, rev_id):
    
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    
    # Specify the parameters - smushing all the revision IDs together separated by | marks.
    # Yes, 'smush' is a technical term, trust me I'm a scientist.
    # What do you mean "but people trusting scientists regularly goes horribly wrong" who taught you tha- oh.  
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    api_call = requests.get(endpoint.format(**params))
    response = api_call.json()
    if "error" in response["enwiki"]["scores"][str(rev_id)]['wp10']:
        print("Failed to process; rev_id: {}".format(rev_id))
        return "error"
    else:
        article_quality = response["enwiki"]["scores"][str(rev_id)]['wp10']['score']['prediction']
#     except Exception as e:
#         print("Error {} processing rev_id: {}".format(e, rev_id))
        return article_quality
#     print(json.dumps(response, indent=4, sort_keys=True))

In [10]:
# Add article_quality column to page_data pandas object
# Loop through each rev_id
# Get article quality
# Append to corresponding article_quality column

page_data_tmp = pd.read_csv('page_data_clean.csv')
page_data_tmp["article_quality"] = ""
page_data_tmp.shape

(46701, 4)

In [11]:
for i, row_value in page_data_tmp['rev_id'].iteritems():
    if (i % 1000 == 0 or i == 46435):
        print("Row: {}, row_value: {}".format(i, row_value))
        example_ids = [row_value]
        article_quality = get_ores_data(example_ids, headers, row_value)
        print("Article quality: {}".format(article_quality))
    try:
        page_data_tmp.iloc[i, page_data_tmp.columns.get_loc("article_quality")] = article_quality
    except:
        print("Error happened at: ".format(i))

Row: 0, row_value: 355319463
Article quality: Stub
Row: 1000, row_value: 705737107
Article quality: Start
Row: 2000, row_value: 711506001
Article quality: Stub
Row: 3000, row_value: 715574067
Article quality: Stub
Row: 4000, row_value: 719256142
Article quality: Start
Row: 5000, row_value: 722984083
Article quality: Stub
Row: 6000, row_value: 726608152
Article quality: Stub
Row: 7000, row_value: 733933703
Article quality: Stub
Row: 8000, row_value: 739516888
Article quality: Stub
Row: 9000, row_value: 746096776
Article quality: Stub
Row: 10000, row_value: 750138485
Article quality: Stub
Row: 11000, row_value: 756588999
Article quality: Start
Row: 12000, row_value: 757217094
Article quality: Stub
Row: 13000, row_value: 758716972
Article quality: Stub
Row: 14000, row_value: 763763300
Article quality: Stub
Row: 15000, row_value: 768011808
Article quality: Stub
Row: 16000, row_value: 771454540
Article quality: Start
Row: 17000, row_value: 775251439
Article quality: Stub
Row: 18000, row_val

In [12]:
page_data_tmp.loc[page_data_tmp['page'] == 'Franklin D. Roosevelt']

Unnamed: 0,page,country,rev_id,article_quality
46435,Franklin D. Roosevelt,United States,807395895,FA


In [13]:
# Reorder the dataset
page_data_tmp.columns = ['article_name', 'country', 'revision_id', 'article_quality']

In [14]:
cols = ['country', 'article_name', 'revision_id', 'article_quality']
page_data_tmp = page_data_tmp[cols]
page_data_tmp.to_csv('page_data_tmp.csv', index=False)
page_data_tmp.head()
page_data_tmp.loc[page_data_tmp['article_name'] == 'Franklin D. Roosevelt']

Unnamed: 0,country,article_name,revision_id,article_quality
46435,United States,Franklin D. Roosevelt,807395895,FA


In [15]:
# Merge with population data wp_wpds_politicians_by_country.csv
page_data_tmp = pd.read_csv('page_data_tmp.csv')
wpds_data_tmp = pd.read_csv('WPDS_2018_data_clean.csv')

In [16]:
result_left_join = pd.merge(page_data_tmp, wpds_data_tmp, how='left', left_on='country', right_on='Geography')

In [17]:
result_left_join.head()
result_left_join.loc[result_left_join['article_name'] == 'Franklin D. Roosevelt']

Unnamed: 0,country,article_name,revision_id,article_quality,Geography,Population mid-2018 (millions)
46435,United States,Franklin D. Roosevelt,807395895,FA,United States,328


In [18]:
result_left_join[pd.isnull(result_left_join['Geography'])].to_csv('unmerged_data.csv')

In [19]:
result = pd.merge(page_data_tmp, wpds_data_tmp, left_on='country', right_on='Geography')
result = result.drop(['Geography'], axis=1)
result.columns = ['country', 'article_name', 'revision_id', 'article_quality', 'population']
result['population'] = result['population'].str.replace(',', '')
result.head()

Unnamed: 0,country,article_name,revision_id,article_quality,population
0,Chad,Bir I of Kanem,355319463,Stub,15.4
1,Chad,Abdullah II of Kanem,498683267,Stub,15.4
2,Chad,Salmama II of Kanem,565745353,Stub,15.4
3,Chad,Kuri I of Kanem,565745365,Stub,15.4
4,Chad,Mohammed I of Kanem,565745375,Stub,15.4


In [20]:
result.to_csv('wp_wpds_politicians_by_country.csv', index=False)

## Analysis

* Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population
* Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population
* Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
* Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
* Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the total count of politician articles from countries in each region as a proportion of total regional population
* Geographic regions by coverage: Ranking of geographic regions (in descending order) in terms of the relative proportion of politician articles from countries in each region that are of GA and FA-quality

In [21]:
a2_data = pd.read_csv('wp_wpds_politicians_by_country.csv')
a2_data.iloc[3527]
a2_data.loc[a2_data['article_name'] == 'Franklin D. Roosevelt']

Unnamed: 0,country,article_name,revision_id,article_quality,population
3512,United States,Franklin D. Roosevelt,807395895,FA,328.0


In [22]:
# For each country, generate: % of articles-per-population

# Represent population in actual numbers
a2_data['population'] = pd.to_numeric(a2_data['population'])
a2_data['population'] = a2_data['population'].apply(lambda x: x*1000000)
a2_data.head()

Unnamed: 0,country,article_name,revision_id,article_quality,population
0,Chad,Bir I of Kanem,355319463,Stub,15400000.0
1,Chad,Abdullah II of Kanem,498683267,Stub,15400000.0
2,Chad,Salmama II of Kanem,565745353,Stub,15400000.0
3,Chad,Kuri I of Kanem,565745365,Stub,15400000.0
4,Chad,Mohammed I of Kanem,565745375,Stub,15400000.0


In [23]:
# Merge count of articles and population
a2_data_articles = a2_data.groupby(['country'])['article_name'].agg('count').to_frame()
a2_data_population = a2_data.groupby(['country'])['population'].agg(pd.Series.mode).to_frame()

In [24]:
a2_data_join = pd.merge(a2_data_articles, a2_data_population, on='country')
a2_data_join.head()

Unnamed: 0_level_0,article_name,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,322,36500000.0
Albania,457,2900000.0
Algeria,116,42700000.0
Andorra,34,80000.0
Angola,106,30400000.0


In [25]:
a2_data_join['coverage'] = a2_data_join['article_name']/a2_data_join['population']*100

In [26]:
a2_data_join.columns = ['article_count', 'population', 'coverage']
a2_data_join.head()

Unnamed: 0_level_0,article_count,population,coverage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,322,36500000.0,0.000882
Albania,457,2900000.0,0.015759
Algeria,116,42700000.0,0.000272
Andorra,34,80000.0,0.0425
Angola,106,30400000.0,0.000349


### Top 10 countries by coverage

In [27]:
a2_data_join.nlargest(10, 'coverage')

Unnamed: 0_level_0,article_count,population,coverage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,54,10000.0,0.54
Nauru,52,10000.0,0.52
San Marino,81,30000.0,0.27
Monaco,40,40000.0,0.1
Liechtenstein,28,40000.0,0.07
Tonga,63,100000.0,0.063
Marshall Islands,37,60000.0,0.061667
Iceland,202,400000.0,0.0505
Andorra,34,80000.0,0.0425
Federated States of Micronesia,36,100000.0,0.036


### Bottom 10 countries by coverage

In [28]:
a2_data_join.nsmallest(10, 'coverage')

Unnamed: 0_level_0,article_count,population,coverage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
India,985,1371300000.0,7.2e-05
Indonesia,211,265200000.0,8e-05
China,1133,1393800000.0,8.1e-05
Uzbekistan,28,32900000.0,8.5e-05
Ethiopia,101,107500000.0,9.4e-05
"Korea, North",36,25600000.0,0.000141
Zambia,25,17700000.0,0.000141
Thailand,112,66200000.0,0.000169
Mozambique,58,30500000.0,0.00019
Bangladesh,321,166400000.0,0.000193


In [29]:
# For each country, generate: % of high-quality articles
# Find # of GA or FA articles, then merge dataframe with a2_data_articles

a2_data_quality = a2_data.groupby(['country', 'article_quality']).size().to_frame()
a2_data_quality.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
country,article_quality,Unnamed: 2_level_1
Afghanistan,C,17
Afghanistan,FA,3
Afghanistan,Start,132
Afghanistan,Stub,170
Albania,C,49


In [30]:
# Upon checking, there are no GA article quality
a2_data_quality.loc[a2_data_quality.index.get_level_values('article_quality') == 'GA']

Unnamed: 0_level_0,Unnamed: 1_level_0,0
country,article_quality,Unnamed: 2_level_1


In [31]:
a2_data_quality_fa = a2_data_quality.loc[a2_data_quality.index.get_level_values('article_quality') == 'FA']

In [32]:
a2_data_quality_join = pd.merge(a2_data_quality_fa, a2_data_articles, on='country')
a2_data_quality_join.head()

Unnamed: 0_level_0,0,article_name
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,3,322
Albania,5,457
Argentina,3,491
Australia,7,1561
Austria,1,336


In [33]:
a2_data_quality_join['relative_quality'] = a2_data_quality_join[0]/a2_data_quality_join['article_name']*100
a2_data_quality_join.columns = ['fa_count', 'article_count', 'relative_quality']

### Top 10 countries by relative quality

In [34]:
a2_data_quality_join.nlargest(10, 'relative_quality')

Unnamed: 0_level_0,fa_count,article_count,relative_quality
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Korea, North",2,36,5.555556
Guinea-Bissau,1,20,5.0
Kyrgyzstan,3,70,4.285714
Qatar,2,50,4.0
Ethiopia,3,101,2.970297
Finland,16,570,2.807018
Tajikistan,1,39,2.564103
Saudi Arabia,3,118,2.542373
Chile,8,348,2.298851
Denmark,6,287,2.090592


### Bottom 10 countries by relative quality

In [35]:
a2_data_quality_join.nsmallest(10, 'relative_quality')

Unnamed: 0_level_0,fa_count,article_count,relative_quality
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Hungary,1,611,0.163666
Brazil,1,552,0.181159
Mexico,2,1077,0.185701
Belgium,1,520,0.192308
Taiwan,1,500,0.2
Sri Lanka,1,461,0.21692
Moldova,1,424,0.235849
Poland,2,805,0.248447
Ghana,1,393,0.254453
Netherlands,2,700,0.285714


## Analysis of Region

In [36]:
# For each geographic region, generate: % of articles-per-population, % of high-quality articles

# Add column called 'continent', and populate it with the continent for each country
wpds_raw = pd.read_csv('WPDS_2018_data.csv')
wpds_raw['continent'] = ""
wpds_raw.columns = ['country', 'population', 'region']
wpds_raw['population'] = wpds_raw['population'].str.replace(',', '')
wpds_raw['population'] = pd.to_numeric(wpds_raw['population'])
wpds_raw['population'] = wpds_raw['population'].apply(lambda x: x*1000000)
wpds_raw.head()

Unnamed: 0,country,population,region
0,AFRICA,1284000000.0,
1,Algeria,42700000.0,
2,Egypt,97000000.0,
3,Libya,6500000.0,
4,Morocco,35200000.0,


In [37]:
print(wpds_raw.loc[wpds_raw['country'] == 'AFRICA']) # index 0
print(wpds_raw.loc[wpds_raw['country'] == 'NORTHERN AMERICA']) # index 56
print(wpds_raw.loc[wpds_raw['country'] == 'LATIN AMERICA AND THE CARIBBEAN']) # index 59
print(wpds_raw.loc[wpds_raw['country'] == 'ASIA']) # index 95
print(wpds_raw.loc[wpds_raw['country'] == 'EUROPE']) # index 144
print(wpds_raw.loc[wpds_raw['country'] == 'OCEANIA']) # index 189

  country    population region
0  AFRICA  1.284000e+09       
             country   population region
56  NORTHERN AMERICA  365000000.0       
                            country   population region
59  LATIN AMERICA AND THE CARIBBEAN  649000000.0       
   country    population region
95    ASIA  4.536000e+09       
    country   population region
144  EUROPE  746000000.0       
     country  population region
189  OCEANIA  41000000.0       


In [38]:
wpds_raw.region.iloc[0:55] = 'Africa'
wpds_raw.region.iloc[56:58] = 'Northern America'
wpds_raw.region.iloc[59:94] = 'Latin America and the Caribbean'
wpds_raw.region.iloc[95:143] = 'Asia'
wpds_raw.region.iloc[144:188] = 'Europe'
wpds_raw.region.iloc[188:] = 'Oceania'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [39]:
wpds_raw.head()

Unnamed: 0,country,population,region
0,AFRICA,1284000000.0,Africa
1,Algeria,42700000.0,Africa
2,Egypt,97000000.0,Africa
3,Libya,6500000.0,Africa
4,Morocco,35200000.0,Africa


In [40]:
a2_data_region_join = pd.merge(a2_data_join, wpds_raw, on='country')
a2_data_region_join.head()

Unnamed: 0,country,article_count,population_x,coverage,population_y,region
0,Afghanistan,322,36500000.0,0.000882,36500000.0,Asia
1,Albania,457,2900000.0,0.015759,2900000.0,Europe
2,Algeria,116,42700000.0,0.000272,42700000.0,Africa
3,Andorra,34,80000.0,0.0425,80000.0,Europe
4,Angola,106,30400000.0,0.000349,30400000.0,Africa


In [41]:
# Merge count of articles and population
a2_data_region_groupby = a2_data_region_join.groupby(['region'])['article_count', 'population_x'].agg('sum')

In [42]:
a2_data_region_groupby['coverage'] = a2_data_region_groupby['article_count']/a2_data_region_groupby['population_x']*100

In [43]:
a2_data_region_groupby = a2_data_region_groupby.drop(a2_data_region_groupby.index[0])

### Geographic regions by coverage

In [44]:
a2_data_region_groupby.nlargest(6, 'coverage')

Unnamed: 0_level_0,article_count,population_x,coverage
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Oceania,4008,86480000.0,0.004635
Northern America,848,37200000.0,0.00228
Europe,15047,687890000.0,0.002187
Latin America and the Caribbean,5043,596470000.0,0.000845
Africa,6483,1114700000.0,0.000582
Asia,11088,4489500000.0,0.000247


In [45]:
# Merge count of quality articles and population
a2_data_region_quality_join = pd.merge(a2_data_quality_join, wpds_raw, on='country')
a2_data_region_quality_join.head()

Unnamed: 0,country,fa_count,article_count,relative_quality,population,region
0,Afghanistan,3,322,0.931677,36500000.0,Asia
1,Albania,5,457,1.094092,2900000.0,Europe
2,Argentina,3,491,0.610998,44500000.0,Latin America and the Caribbean
3,Australia,7,1561,0.44843,24100000.0,Oceania
4,Austria,1,336,0.297619,8800000.0,Europe


In [46]:
a2_data_region_quality_groupby = a2_data_region_quality_join.groupby(['region'])['fa_count', 'article_count'].agg('sum')
a2_data_region_quality_groupby = a2_data_region_quality_groupby.drop(a2_data_region_quality_groupby.index[0])
a2_data_region_quality_groupby.head()

Unnamed: 0_level_0,fa_count,article_count
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,33,3547
Asia,62,8674
Europe,88,12197
Latin America and the Caribbean,23,3749
Northern America,3,848


### Geographic regions by quality

In [47]:
a2_data_region_quality_groupby['quality'] = a2_data_region_quality_groupby['fa_count']/a2_data_region_quality_groupby['article_count']*100
a2_data_region_quality_groupby.nlargest(6, 'quality')

Unnamed: 0_level_0,fa_count,article_count,quality
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,33,3547,0.930364
Europe,88,12197,0.721489
Asia,62,8674,0.71478
Oceania,22,3579,0.614697
Latin America and the Caribbean,23,3749,0.613497
Northern America,3,848,0.353774
