In [1]:
import pandas as pd
import numpy as np
import requests

## Step 2: Cleaning the Data

In [2]:
page_data = pd.read_csv('page_data.csv')

In [3]:
page_data_2 = page_data[page_data["page"].str.contains("Template") == False]

In [4]:
wpds_data = pd.read_csv('WPDS_2020_data.csv')

In [5]:
# WPDS_2020_data_country = WPDS_2020_data[WPDS_2020_data["Type"].str.contains("Sub-Region") == False]
wpds_2020_data_ctry = wpds_data[wpds_data.Type != 'Sub-Region']
wpds_2020_data_ctry_rmw = wpds_2020_data_ctry[wpds_2020_data_ctry.Type != 'World']
wpds_2020_data_ctry_rmw.tail()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
229,WS,Samoa,Country,2019,0.2,200000
230,SB,Solomon Islands,Country,2019,0.715,715000
231,TO,Tonga,Country,2019,0.099,99000
232,TV,Tuvalu,Country,2019,0.01,10000
233,VU,Vanuatu,Country,2019,0.321,321000


In [6]:
wpds_2020_data_ctry_rmw.to_csv('wpds_2020_data_ctry_rmw.csv', index=False)

In [7]:
wpds_2020_data_sub = wpds_data[wpds_data.Type != 'Country']
wpds_2020_data_sub_rmw = wpds_2020_data_sub[wpds_2020_data_sub.Type != 'World']
wpds_2020_data_sub_rmw.tail()

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data (M),Population
168,Channel Islands,Channel Islands,Sub-Region,2019,0.172,172000
179,WESTERN EUROPE,WESTERN EUROPE,Sub-Region,2019,195.479,195479000
189,EASTERN EUROPE,EASTERN EUROPE,Sub-Region,2019,291.902,291902000
200,SOUTHERN EUROPE,SOUTHERN EUROPE,Sub-Region,2019,153.251,153251000
216,OCEANIA,OCEANIA,Sub-Region,2019,43.155,43155000


In [8]:
wpds_2020_data_sub_rmw.to_csv('wpds_2020_data_sub_rmw.csv', index=False)

## Step 3 Getting Article Quality Predictions

In [9]:
endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki?models=articlequality&revids={rev_ids}'
# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/lanfuli',
    'From': 'lanfuli@uw.edu'
}

In [10]:
# call the api function
def api_call(endpoint, rev_ids):
    call = requests.get(endpoint.format(rev_ids = rev_ids), headers=headers)
    response = call.json()
    
    return response

In [11]:
def get_score(score_map, data, endpoint):
    l = len(data)
    
#   send 50 rev_ids each time avoid crash (information from class slack)
    for i in range(0, l, 50):
        if i + 50 <= l:
            mini_batch_id = data['rev_id'].iloc[i:i+50]
        else:
            mini_batch_id = data['rev_id'].iloc[i:]
        
        temp = api_call(endpoint, '|'.join(str(s) for s in mini_batch_id))

        for j in temp['enwiki']['scores']:
            if 'score' in temp['enwiki']['scores'][j]['articlequality']:
                score_map[j] = temp['enwiki']['scores'][j]['articlequality']['score']['prediction']
            else:
                score_map[j] = 'NA'

In [12]:
# It takes time to finish
score_map = {}
get_score(score_map, page_data_2, endpoint)

In [13]:
score = page_data_2['rev_id'].astype(str).map(score_map)
page_data_2['article_score'] = score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  page_data_2['article_score'] = score


In [14]:
na_score = page_data_2[page_data_2.article_score == 'NA']

In [15]:
print(len(na_score))

276


In [16]:
page_data_score_nna = page_data_2[page_data_2.article_score != 'NA']
print(len(page_data_score_nna))

46425


In [17]:
# Save to csv log
page_data_score_nna.to_csv('page_data_score_nna.csv', index=False)
na_score.to_csv('na_score.csv', index=False)

## Step 4 Combining the Datasets

In [18]:
page_data_score_nna

Unnamed: 0,page,country,rev_id,article_score
1,Bir I of Kanem,Chad,355319463,Stub
10,Information Minister of the Palestinian Nation...,Palestinian Territory,393276188,Stub
12,Yos Por,Cambodia,393822005,Stub
23,Julius Gregr,Czech Republic,395521877,Stub
24,Edvard Gregr,Czech Republic,395526568,Stub
...,...,...,...,...
47191,Hal Bidlack,United States,807481636,C
47192,Yahya Jammeh,Gambia,807482007,GA
47193,Lucius Fairchild,United States,807483006,C
47194,Fahd of Saudi Arabia,Saudi Arabia,807483153,GA


In [19]:
data_merge = page_data_score_nna.merge(wpds_2020_data_ctry_rmw, how = 'outer', left_on = ['country'], right_on = ['Name'])

In [20]:
no_match = data_merge[data_merge.country.isna() | data_merge.Name.isna()]
no_match.to_csv('no_match.csv', index=False)

In [21]:
data_merge_2 = data_merge.dropna(subset = ['country', 'Name'])

In [22]:
data_merge_match = data_merge_2[['country', 'page', 'rev_id', 'article_score', 'Population']]
data_merge_match = data_merge_match.rename(columns={'page': 'article_name', 'rev_id' : 'revision_id', 
                                                    'article_score' : 'article_quality_est.', 'Population' : 'population'})

In [23]:
data_merge_match.to_csv('data_merge_match.csv', index=False)

## Step 5 Analysis

In [24]:
articles_per_country = data_merge_match.groupby('country').agg({'article_name':'count'})
articles_per_country

Unnamed: 0_level_0,article_name
country,Unnamed: 1_level_1
Afghanistan,319
Albania,456
Algeria,116
Andorra,34
Angola,106
...,...
Venezuela,130
Vietnam,187
Yemen,116
Zambia,25


In [25]:
pop_data = data_merge_match.groupby('country').agg({'population':'mean'})
pop_data

Unnamed: 0_level_0,population
country,Unnamed: 1_level_1
Afghanistan,38928000.0
Albania,2838000.0
Algeria,44357000.0
Andorra,82000.0
Angola,32522000.0
...,...
Venezuela,28645000.0
Vietnam,96209000.0
Yemen,29826000.0
Zambia,18384000.0


In [26]:
# merge these two tables: population and article per country
articles_per_pop = articles_per_country.merge(pop_data, left_on='country', right_on='country', how='inner')
articles_per_pop

Unnamed: 0_level_0,article_name,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,319,38928000.0
Albania,456,2838000.0
Algeria,116,44357000.0
Andorra,34,82000.0
Angola,106,32522000.0
...,...,...
Venezuela,130,28645000.0
Vietnam,187,96209000.0
Yemen,116,29826000.0
Zambia,25,18384000.0


In [27]:
# 1. Top 10 countries by coverage: 10 highest-ranked countries in terms of number of
# politician articles as a proportion of country population
articles_per_pop['percentage'] = articles_per_pop['article_name']  * 100 / articles_per_pop['population']
articles_per_pop_rank =articles_per_pop.sort_values(['percentage'], ascending=[False])
articles_per_pop_rank.head(10)

Unnamed: 0_level_0,article_name,population,percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,54,10000.0,0.54
Nauru,52,11000.0,0.472727
San Marino,81,34000.0,0.238235
Monaco,40,38000.0,0.105263
Liechtenstein,28,39000.0,0.071795
Marshall Islands,37,57000.0,0.064912
Tonga,63,99000.0,0.063636
Iceland,201,368000.0,0.05462
Andorra,34,82000.0,0.041463
Federated States of Micronesia,36,106000.0,0.033962


In [28]:
# 2. Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population 
articles_per_pop_rank.tail(10)

Unnamed: 0_level_0,article_name,population,percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bangladesh,317,169809000.0,0.000187
Mozambique,58,31166000.0,0.000186
Thailand,112,66534000.0,0.000168
"Korea, North",36,25779000.0,0.00014
Zambia,25,18384000.0,0.000136
Ethiopia,101,114916000.0,8.8e-05
Uzbekistan,28,34174000.0,8.2e-05
China,1129,1402385000.0,8.1e-05
Indonesia,209,271739000.0,7.7e-05
India,968,1400100000.0,6.9e-05


In [29]:
high_qua = pd.concat([data_merge_match[data_merge_match['article_quality_est.'] == 'FA'], 
                     data_merge_match[data_merge_match['article_quality_est.'] == 'GA']])

In [30]:
# 3. Top 10 countries by relative quality: 10 highest-ranked countries in terms of the
# relative proportion of politician articles that are of GA and FA-quality
high_articles_per_country = high_qua.groupby('country').agg({'article_name':'count'})
high_pop_data = high_qua.groupby('country').agg({'population':'mean'})

high_articles_per_pop = high_articles_per_country.merge(high_pop_data, left_on='country', right_on='country', how='inner')
high_articles_per_pop

Unnamed: 0_level_0,article_name,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,13,38928000.0
Albania,3,2838000.0
Algeria,2,44357000.0
Argentina,16,45377000.0
Armenia,5,2956000.0
...,...,...
Vanuatu,3,321000.0
Venezuela,3,28645000.0
Vietnam,13,96209000.0
Yemen,3,29826000.0


In [33]:
high_articles_per_pop_2 = high_articles_per_pop.rename(columns={'article_name': 'high_article_count', 'population': 'high_qty_population'})
combine_all_article = high_articles_per_pop_2.merge(articles_per_pop_rank, left_on='country', right_on='country', how='inner')
combine_all_article = combine_all_article.drop(columns={'percentage', 'population'})

combine_all_article['percentage'] = combine_all_article['high_article_count']  * 100 / combine_all_article['article_name']
combine_all_article = combine_all_article.sort_values('percentage', ascending=False)
combine_all_article.head(10)

Unnamed: 0_level_0,high_article_count,high_qty_population,article_name,percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Korea, North",8,25779000.0,36,22.222222
Saudi Arabia,15,35041000.0,117,12.820513
Romania,42,19241000.0,343,12.244898
Central African Republic,8,4830000.0,66,12.121212
Uzbekistan,3,34174000.0,28,10.714286
Mauritania,5,4650000.0,48,10.416667
Guatemala,7,18066000.0,83,8.433735
Dominica,1,72000.0,12,8.333333
Syria,10,19398000.0,128,7.8125
Benin,7,12209000.0,91,7.692308


In [34]:
# 4. Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality
combine_all_article.tail(10)

Unnamed: 0_level_0,high_article_count,high_qty_population,article_name,percentage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Morocco,1,35952000.0,206,0.485437
Lithuania,1,2794000.0,244,0.409836
Colombia,1,49444000.0,285,0.350877
Portugal,1,10255000.0,318,0.314465
Nigeria,2,206140000.0,676,0.295858
Peru,1,32824000.0,350,0.285714
Nepal,1,29996000.0,356,0.280899
Switzerland,1,8634000.0,402,0.248756
Tanzania,1,59734000.0,404,0.247525
Belgium,1,11515000.0,519,0.192678


In [35]:
# 5. Geographic regions by coverage: Ranking of geographic regions (in descending order) in 
# terms of the total count of politician articles from countries in each region 
# as a proportion of total regional population
df = wpds_data.drop(columns=['FIPS', 'TimeFrame', 'Data (M)', 'Population'])

In [36]:
# loop the dataframe, add country to sub-region dict, return a dict
def region_country_map(df, region_country_dict):
#     read each row, add 
    region_name = ''
#   source from: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
    for index, row in df.iterrows():
#         remove the world
        if row['Type'] == 'World':
            continue
#             check upper for the special case: channel island
        elif row['Type'] == 'Sub-Region' and row['Name'].isupper():
            region_name = row['Name']
            continue
        region_country_dict[row['Name']] = region_name
    return region_country_dict

region_country_dict = {}
region_country = region_country_map(df,region_country_dict )     

In [37]:
# map articles_per_pop with region_country_dict
articles_per_pop['Sub-Region'] = articles_per_pop.index.map(region_country)
articles_per_pop

Unnamed: 0_level_0,article_name,population,percentage,Sub-Region
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,319,38928000.0,0.000819,SOUTH ASIA
Albania,456,2838000.0,0.016068,SOUTHERN EUROPE
Algeria,116,44357000.0,0.000262,NORTHERN AFRICA
Andorra,34,82000.0,0.041463,SOUTHERN EUROPE
Angola,106,32522000.0,0.000326,MIDDLE AFRICA
...,...,...,...,...
Venezuela,130,28645000.0,0.000454,SOUTH AMERICA
Vietnam,187,96209000.0,0.000194,SOUTHEAST ASIA
Yemen,116,29826000.0,0.000389,WESTERN ASIA
Zambia,25,18384000.0,0.000136,EASTERN AFRICA


In [38]:
articles_per_region = articles_per_pop.groupby('Sub-Region').agg({'article_name':'sum'})
articles_per_region.reset_index()

Unnamed: 0,Sub-Region,article_name
0,CARIBBEAN,695
1,CENTRAL AMERICA,1543
2,CENTRAL ASIA,245
3,EAST ASIA,2473
4,EASTERN AFRICA,2502
5,EASTERN EUROPE,3732
6,MIDDLE AFRICA,665
7,NORTHERN AFRICA,899
8,NORTHERN AMERICA,1901
9,NORTHERN EUROPE,3763


In [39]:
# merge the population by region and articles_per_region
pop_per_region = wpds_2020_data_sub_rmw[['Name', 'Population']]

In [40]:
articles_per_region = articles_per_region.merge(pop_per_region, how = 'left', left_on = 'Sub-Region', right_on = ['Name'] )

In [41]:
articles_per_region['article_region_percentage'] = articles_per_region['article_name'] *100 / articles_per_region['Population']
articles_per_region.sort_values(by = ['article_region_percentage'], axis=0, ascending=False, inplace=True)
articles_per_region

Unnamed: 0,article_name,Name,Population,article_region_percentage
10,3126,OCEANIA,43155000,0.007244
9,3763,NORTHERN EUROPE,105990000,0.00355
15,3710,SOUTHERN EUROPE,153251000,0.002421
18,4560,WESTERN EUROPE,195479000,0.002333
0,695,CARIBBEAN,43233000,0.001608
5,3732,EASTERN EUROPE,291902000,0.001279
14,634,SOUTHERN AFRICA,67732000,0.000936
17,2563,WESTERN ASIA,280927000,0.000912
1,1543,CENTRAL AMERICA,178611000,0.000864
11,3032,SOUTH AMERICA,429191000,0.000706


In [42]:
# 6. Geographic regions by coverage: Ranking of geographic regions (in descending
# order) in terms of the relative proportion of politician articles from countries in each
# region that are of GA and FA-quality
high_article_pop = high_articles_per_pop.reset_index()
high_article_pop['Sub-Region'] = high_article_pop['country'].map(region_country)
high_article_region = high_article_pop.groupby('Sub-Region').agg({'article_name':'sum'})
articles_per_region_pop = high_article_region.merge(pop_per_region, how = 'left', left_on = 'Sub-Region', right_on = 'Name' )
articles_per_region_pop = articles_per_region_pop.rename(columns={'article_name': 'high_qty_count', 'Name': 'high_qty_Name', 'Population' : 'high_qty_pop'})
frames = [articles_per_region, articles_per_region_pop]
combine_all = pd.concat(frames, axis=1)
combine_all = combine_all.drop(columns=['article_region_percentage', 'Name', 'Population'])
combine_all['high_art_percentage'] = combine_all['high_qty_count'] *100 / combine_all['article_name']
combine_all.sort_values(by = ['high_art_percentage'], axis=0, ascending=False, inplace=True)
combine_all

Unnamed: 0,article_name,high_qty_count,high_qty_Name,high_qty_pop,high_art_percentage
8,1901,104,NORTHERN AMERICA,368193000,5.470805
13,2020,73,SOUTHEAST ASIA,661845000,3.613861
17,2563,89,WESTERN ASIA,280927000,3.472493
5,3732,118,EASTERN EUROPE,291902000,3.161844
3,2473,76,EAST ASIA,1641063000,3.07319
2,245,7,CENTRAL ASIA,74961000,2.857143
9,3763,102,NORTHERN EUROPE,105990000,2.710603
6,665,16,MIDDLE AFRICA,179757000,2.406015
7,899,19,NORTHERN AFRICA,244344000,2.113459
10,3126,63,OCEANIA,43155000,2.015355


## Write Up

# 1. What biases did you expect to find in the data (before you started working with it), and why?

Before I started working with this data, I thought regions with the larger population should have more high quality articles as least for English countries. The results somehow prove my assumption. Because most of Asia regions are non-English countries. NORTHERN AMERICA comes with the most population for the English region.


## 2. What (potential) sources of bias did you discover in the course of your data processing and analysis?

There are 1800 unmatch article data. If they are match the rev_id, the result may change. Also, for some small countries and developing countries, their education level can't compare with developed countries. So the article qty/quality will decrease.



## 3. Can you think of a realistic data science research situation where using these data (to train a model, perform a hypothesis-driven research, or make business decisions) might create biased or misleading results, due to the inherent gaps and limitations of the data?
Yes. The model may create biased or misleading results. As I said with Q2, the data is incomplete. And different language, education level and more factors will affect the results.