# Randomly sample Wikipedia articles

In [1]:
import pandas as pd
import numpy as np

Stratified sample using [wp1.openzim.org](https://github.com/openzim/wp1) data:

In [2]:
# Load the WP 1.0 assessment export (snapshot dated 2020-04-19, judging by the file name).
# Later cells read its 'article', 'importance' and 'quality' columns.
wp10_raw = pd.read_csv('data/wp1_openzim_20200419.csv.zip')

In [3]:
# Rating scales listed from lowest to highest. Storing the columns as ordered
# categoricals lets ratings be compared and aggregated (e.g. with max()).
IMPORTANCE_LEVELS = ['Unknown', 'Low', 'Mid', 'High', 'Top']
QUALITY_LEVELS = ['Unassessed', 'List', 'Stub', 'Start', 'C', 'B', 'GA', 'A', 'FL', 'FA']

wp10_raw['importance'] = wp10_raw['importance'].astype(pd.CategoricalDtype(categories=IMPORTANCE_LEVELS, ordered=True))
wp10_raw['quality'] = wp10_raw['quality'].astype(pd.CategoricalDtype(categories=QUALITY_LEVELS, ordered=True))

In [4]:
# One row per article, keeping the highest 'quality' value found for it
# (in case an article appears on several rows of wp10_raw).
wp10_quality = wp10_raw.groupby('article')['quality'].max().reset_index()

In [5]:
# Number of articles per quality class.
# The output columns are named explicitly because value_counts().reset_index()
# labels them ('index', 'quality') on pandas < 2.0 but ('quality', 'count') on
# pandas >= 2.0. The following cells expect the class in 'index' and the count
# in 'quality'.
wp10_quality_count = wp10_quality['quality'].value_counts().rename_axis('index').reset_index(name='quality')

In [6]:
# Put the class labels on the ordered assessment scale so that sorting by
# 'index' follows the scale rather than the alphabet.
QUALITY_SCALE = pd.CategoricalDtype(categories=['Unassessed', 'List', 'Stub', 'Start', 'C', 'B', 'GA', 'A', 'FL', 'FA'], ordered=True)
wp10_quality_count['index'] = wp10_quality_count['index'].astype(QUALITY_SCALE)

In [7]:
# Show the counts from the top of the scale (FA) down to the bottom (Unassessed).
wp10_quality_count.sort_values('index', ascending=False)

Unnamed: 0,index,quality
5,FA,5347
6,FL,2802
9,A,568
3,GA,18788
1,B,30252
0,C,31730
2,Start,26633
4,Stub,5700
8,List,804
7,Unassessed,998


For comparison, these are the per-class totals reported at https://en.wikipedia.org/wiki/Wikipedia:Version_1.0_Editorial_Team/Statistics:

Quality | Total
:--|--:
FA-Class|6,866
FL-Class|2,020
A-Class|2,109
GA-Class|33,420
B-Class|131,091
C-Class|332,133
Start-Class|1,826,843
Stub-Class|3,390,074
List-Class|256,087
Assessed-Class|5,980,643
Unassessed|493,561
Total|6,474,204


In [8]:
# Draw up to `s_size` random articles from every quality class except FL, List
# and Unassessed. A class with fewer articles than `s_size` contributes all of them.
np.random.seed(1337)  # seeds NumPy's global RNG so the draw is repeatable
s_size = 5
_excluded = wp10_quality['quality'].isin(['FL', 'List', 'Unassessed'])


def _sample_class(group):
    # Never ask for more rows than the class actually has.
    n_draw = min(len(group), s_size)
    return group.sample(n_draw)


wp10_sample = wp10_quality[~_excluded].groupby('quality', group_keys=False).apply(_sample_class)

In [9]:
# Renumber the rows 0..n-1: later cells use the row label as a position into
# the per-article lists.
wp10_sample = wp10_sample.reset_index(drop=True)
wp10_sample.head()

Unnamed: 0,article,quality
0,1846 Havana hurricane,A
1,Priscus (general),A
2,Lê Văn Duyệt,A
3,Brilliant Pebbles,A
4,Dartford Crossing,A


## Exploration of the sample articles

In [10]:
import datetime
from external.wikipedia import WikipediaDV, WikipediaAPI
from wikiwho_wrapper import WikiWho
from metrics.conflict import ConflictManager
import time
import os.path
import articlequality
import mwxml
import requests
import pickle

In [11]:
# Silence urllib3's InsecureRequestWarning. get_labels() issues its request
# with verify=False, which would otherwise emit this warning on every call.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [12]:
# Client objects used below: `wikipedia_dv` resolves page titles to page ids,
# `wikiwho` provides token-level content and revision ids. Both are configured
# for English Wikipedia.
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
wikiwho = WikiWho(lng='en')

In [13]:
def _page_id(title):
    # Resolve a page title to its page id through the Wikipedia API wrapper.
    return wikipedia_dv.get_page(title)['page_id']


# Ids of each sampled article and of its talk page ("Talk:<title>").
wp10_sample['page_id'] = wp10_sample['article'].map(_page_id)
wp10_sample['talkpage_id'] = wp10_sample['article'].map(lambda title: _page_id('Talk:%s' % title))

In [14]:
# Display the sample together with the resolved article and talk-page ids.
wp10_sample

Unnamed: 0,article,quality,page_id,talkpage_id
0,1846 Havana hurricane,A,3608389,3608560
1,Priscus (general),A,26377832,26384943
2,Lê Văn Duyệt,A,23964203,23964481
3,Brilliant Pebbles,A,1291047,55908840
4,Dartford Crossing,A,62740,5185736
5,Saul Bass,B,849349,1814266
6,History of the Maya civilization,B,46998769,46999152
7,RAF Church Lawford,B,35170176,35217196
8,Orbiter (Canada's Wonderland),B,33242610,33244369
9,WBMX (FM),B,2381612,2752307


Get the quality labels (from each talk page's history) and the time series of token actions (from each article's revision history).

In [15]:
# TODO: Limits? Max number of revisions? Filesize?
# TODO: compress XML files
def get_labels(x):
    """Return the quality labels found in an article's talk-page history.

    The full history of ``Talk:<article>`` is downloaded once through
    ``Special:Export`` and cached as ``data/talk/<talkpage_id>.xml``; later
    calls re-use the cached file.

    Parameters
    ----------
    x : mapping (e.g. a row of ``wp10_sample``)
        Must provide ``'article'`` (page title) and ``'talkpage_id'``.

    Returns
    -------
    pandas.DataFrame
        One row per labeling yielded by ``articlequality.extract_labelings``.
        If at least one labeling is found, ``timestamp`` is converted to
        datetime and ``wp10`` is renamed to ``quality`` and stored as an
        ordered categorical (stub < start < c < b < ga < a < fa). Otherwise
        the frame is empty.

    Raises
    ------
    requests.HTTPError
        If the export request returns an error status. Nothing is cached then.
    """
    from urllib.parse import quote

    talk_path = 'data/talk/%s.xml' % x['talkpage_id']
    if not os.path.isfile(talk_path):
        # Percent-encode the title so that characters such as '?', '#' or '&'
        # stay part of the page name instead of being parsed as URL syntax.
        talk_title = quote('Talk:%s' % x['article'], safe='/:')
        talk_url = 'https://en.wikipedia.org/wiki/Special:Export/%s?templates=True&history=True' % talk_title
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original setup; remove it once certificate checks
        # work in the execution environment.
        resp = requests.get(talk_url, verify=False)
        # Fetch and validate before touching the cache: a failed request must
        # not leave behind a file that later runs would accept as a valid dump.
        resp.raise_for_status()
        os.makedirs(os.path.dirname(talk_path), exist_ok=True)
        with open(talk_path, 'wb') as f:
            f.write(resp.content)
    # MediaWiki exports are UTF-8; do not rely on the platform default encoding.
    with open(talk_path, 'r', encoding='utf-8') as f:
        xmldump = mwxml.Dump.from_file(f)
        # Materialise the generator while the file is still open.
        labels = pd.DataFrame(list(articlequality.extract_labelings(dump = xmldump)))
    # Convert
    if len(labels) > 0:
        labels['timestamp'] = pd.to_datetime(labels['timestamp'])
        labels = labels.rename(columns={'wp10': 'quality'})
        labels['quality'] = pd.Categorical(labels['quality'], ordered = True, categories = ['stub', 'start', 'c', 'b', 'ga', 'a', 'fa'])
    return labels

In [16]:
def get_ts(x):
    """Return the per-token action table of an article, sorted by token and time.

    The table is the ``all_actions`` frame computed by ``ConflictManager`` from
    the WikiWho content and revision ids of the page. It is cached as
    ``data/ts/<page_id>.csv.zip`` and read from there on later calls.

    Parameters
    ----------
    x : mapping (e.g. a row of ``wp10_sample``)
        Must provide ``'article'`` (printed as progress) and ``'page_id'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``token_id`` and sorted by ``token_id`` then ``rev_time``;
        ``rev_time`` is a timezone-naive datetime column.
    """
    print(x['article'])
    ts_path = 'data/ts/%s.csv.zip' % x['page_id']
    if not os.path.isfile(ts_path):
        _content = wikiwho.dv.all_content(x['page_id'])
        _revisions = wikiwho.dv.rev_ids_of_article(x['page_id'])
        _calculator = ConflictManager(_content, _revisions, 'en')
        _calculator.calculate()
        ts = _calculator.all_actions
        os.makedirs(os.path.dirname(ts_path), exist_ok=True)
        ts.to_csv(ts_path, compression={'method': 'zip', 'archive_name': '%s.csv'%x['page_id']})
    else:
        # The cache stores the frame's index as its first column. Reading it
        # back as the index keeps cached and freshly computed frames alike;
        # without index_col a cached frame gains an extra 'Unnamed: 0' column.
        ts = pd.read_csv(ts_path, index_col=0)
    # Convert
    ts = ts.sort_values(['token_id', 'rev_time'], ascending=True).set_index('token_id')
    ts['rev_time'] = pd.to_datetime(ts['rev_time']).dt.tz_localize(None)
    return ts

In [17]:
#tsr = get_ts( wp10_sample.iloc[0] )
#tsr.head()

In [18]:
# Build the per-article inputs in sample order, so that position k of both
# lists belongs to row k of wp10_sample.
list_ts = []
list_labels = []
for row_label, page in wp10_sample.iterrows():
    list_ts.append(get_ts(page))
    list_labels.append(get_labels(page))
    # Short pause before moving on to the next article.
    time.sleep(0.5)

...................................

1846 Havana hurricane


.......................................

Priscus (general)


....................................

Lê Văn Duyệt


......

Brilliant Pebbles


......................................

Dartford Crossing


..........................................................................................................................................................................

Saul Bass


...................................

History of the Maya civilization


...

RAF Church Lawford


.......

Orbiter (Canada's Wonderland)


...................

WBMX (FM)


.....

Cassareep


.............

Sushil Kumar Modi


.............................

Franz Miklosich


...................................................

Yum cha


.....................

Notability in the English Wikipedia


...................................................................................................................................

Tasha Yar


....................................................................................................................................

Pengkhianatan G30S/PKI
George V


........................................................................................................................................................................................................................................................................................................................................

William Hely
John McCain


  if (yield from self.run_code(code, result)):
.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Half-Way Covenant


.......................................................................

Gilbert Thomas Carter


...........................

Spontaneous cerebrospinal fluid leak


...................................................................................................................................................................................

Donald Glover


...............................................................................................................

Howl's Moving Castle (film)


................................................................................................................................................................

Faaa


................

Nerang–Broadbeach Road


.......

Leo Holub


........

Blood of the Fold


.....

Breathless (CSI: Miami)


......

Sea Life Busan Aquarium


.........

Cross Lake (Shreveport, Louisiana)


......

International School of Choueifat, Sharjah


..........

Splatter: Love, Honor and Paintball


..........

Mulligan Highway


In [19]:
# Persist both per-article lists as one (list_ts, list_labels) tuple so that
# they can be restored without re-running the collection loop.
with open("data/sample_data.pickle", "wb") as f:
    pickle.dump((list_ts, list_labels), f)

In [20]:
#with open("data/sample_data.pickle", "rb") as f:
#    list_ts, list_labels = pickle.load(f) 

### Contribution

In [21]:
def return_first_labels(labels):
    """Return the first occurrence of each quality class, in chronological order.

    Parameters
    ----------
    labels : pandas.DataFrame
        Needs a ``timestamp`` column and a ``quality`` column stored as an
        ordered categorical (as produced by ``get_labels``).

    Returns
    -------
    pandas.DataFrame
        One row per observed quality class, sorted by ``timestamp``. A row is
        removed when its class is lower than the class of the row directly
        before it, unless the two classes are 'ga' and 'a'.
    """
    # groupby().first() keeps the first row in *input* order, so sort by time
    # first (stable, to keep the input order among equal timestamps). This makes
    # "first" mean "earliest" regardless of how the rows arrive.
    chronological = labels.sort_values('timestamp', kind='mergesort')
    labels_first = (
        chronological.groupby('quality', observed=True)
        .first()
        .reset_index()
        .sort_values('timestamp')
    )

    current = labels_first['quality']
    previous = current.shift(1)
    # Remove grade if the grade is lower than the preceding grade, and if the
    # two grades are not 'GA' and 'A'.
    is_downgrade = current < previous
    is_ga_a_pair = ((current == 'a') & (previous == 'ga')) | ((current == 'ga') & (previous == 'a'))
    return labels_first[~(is_downgrade & ~is_ga_a_pair)]

In [22]:
def quality_transitions(i):
    """Build the time bins and transition labels for the article at position ``i``.

    Reads the module-level ``list_labels[i]`` and ``list_ts[i]`` and calls
    ``return_first_labels``.

    Returns
    -------
    (transition_bins, tl)
        For an article without labels: ``([], empty DataFrame)``.
        Otherwise ``transition_bins`` holds the bin edges (the timestamps of
        the first labels, possibly extended at either end, see below) and
        ``tl`` has one row per bin with the columns ``time_prior`` (left
        edge), ``time`` (right edge) and ``quality_transition``
        ("<previous> → <next>", an ordered categorical in row order). A label
        of ' ' marks an edge that was added from the revision range.
    """
    transition_bins = []
    tl = pd.DataFrame()
    if len(list_labels[i]) > 0:
        labels_first = return_first_labels(list_labels[i])
        
        # Revision time range widened by one day on each side and snapped to day boundaries.
        rev_time_min = (list_ts[i]['rev_time'].min()-pd.DateOffset(days=1)).floor('D')
        rev_time_max = (list_ts[i]['rev_time'].max()+pd.DateOffset(days=1)).ceil('D')
        transition_bins = labels_first['timestamp'].values
        quality_labels = labels_first['quality'].values.astype(str)
        # Prepend the start of the revision range unless a label predates it.
        if min(transition_bins) >= rev_time_min:
            transition_bins = np.insert(transition_bins, 0, rev_time_min)
            quality_labels = np.insert(quality_labels, 0, ' ')
        # Append the end of the revision range unless a label lies beyond it.
        if max(transition_bins) <= rev_time_max:
            transition_bins = np.insert(transition_bins, len(transition_bins), rev_time_max)
            quality_labels = np.insert(quality_labels, len(quality_labels), ' ')
        
        tl['time'] = transition_bins
        tl['quality'] = quality_labels
        # Label each edge with the step from the previous edge's class to its own.
        tl['quality_transition'] = tl['quality'].shift(1) + " → " + tl['quality']
        # Row 0 has no predecessor (NaN after the shift), hence the [1:] for the categories.
        tl['quality_transition'] = pd.Categorical(tl['quality_transition'], categories = tl['quality_transition'][1:].tolist(), ordered = True)
        tl.insert(0, 'time_prior', tl['time'].shift(1))
        # Drop the first edge (it starts no bin) and the helper column.
        tl.drop(0, inplace=True)
        tl.drop('quality', axis=1, inplace=True)
    return transition_bins, tl

In [23]:
# Compute bins and transition labels once per article, then split the
# (bins, labels) pairs into two lists aligned with wp10_sample.
_transitions = [quality_transitions(row_label) for row_label in wp10_sample.index]
list_transition_bins = [bins for bins, _labels_frame in _transitions]
list_transition_labels = [_labels_frame for bins, _labels_frame in _transitions]

In [24]:
# Tag every action with the assessment phase its revision time falls into.
# Articles without labels have no bins and are left untouched.
for i in wp10_sample.index:
    if len(list_labels[i]) == 0:
        continue
    actions = list_ts[i]  # same object as list_ts[i]; the new column is added in place
    phase_names = list_transition_labels[i]['quality_transition'].values
    actions['quality_transition'] = pd.cut(actions['rev_time'], list_transition_bins[i], include_lowest=True, labels = phase_names)

In [25]:
# TODO: Ranking without ties
# For every article: number of surviving tokens per editor and assessment phase.
list_transition_contribution = []
for i, page in wp10_sample.iterrows():
    # Reset for every article, so that an article without labels contributes an
    # empty frame and never the frame computed for a preceding article.
    transition_contribution = pd.DataFrame()
    if len(list_labels[i]) > 0:
        # Last recorded action of each token within each phase.
        last_action = list_ts[i].groupby(['quality_transition', 'token_id']).last()
        # A token counts as surviving the phase when that last action is not 'out'.
        survived_tokens = last_action[last_action['action'] != 'out']
        count = survived_tokens.groupby(['quality_transition', 'editor']).size()
        transition_contribution = pd.DataFrame({'abs': count}).reset_index()
        # Convert
        # Rank editors by their total over all phases (dense ranking, ties share a rank).
        transition_contribution['ranking'] = transition_contribution.groupby('editor')['abs'].transform('sum').rank(ascending=False, method='dense').astype(int)
        # Editors outside the leading ranks are pooled under the placeholder 0.
        # NOTE(review): `< 10` keeps ranks 1-9, i.e. nine named editors plus the
        # 0 bucket - confirm this is the intended meaning of "top 10".
        transition_contribution['editor_top10'] = np.where(transition_contribution['ranking'] < 10, transition_contribution['editor'], 0)
    list_transition_contribution.append(transition_contribution)

### Plots

In [26]:
import altair as alt

In [27]:
# Use the default renderer with the embed option 'actions' switched off.
alt.renderers.enable('default', embed_options={'actions': False})
# Remove Altair's row limit for chart data; the trailing ';' suppresses the cell output.
alt.data_transformers.disable_max_rows();

In [28]:
def plot_transition_contribution(article, transition_contribution, tl):
    """Bar chart of token counts per assessment phase, coloured by editor.

    Parameters
    ----------
    article : str
        Article title, used as the first line of the chart title.
    transition_contribution : pandas.DataFrame
        Needs the columns ``quality_transition``, ``abs``, ``editor``,
        ``ranking`` and ``editor_top10``.
    tl : pandas.DataFrame
        Its ``quality_transition`` column fixes the order of the x axis.

    Returns
    -------
    altair.Chart
    """
    # Editors ordered by rank; used to order the colour legend.
    ranked_editors = transition_contribution[['editor', 'ranking']].drop_duplicates().sort_values(by=['ranking'])
    editor_order = ranked_editors['editor'].values

    x_axis = alt.Axis(labelAngle = 0, labelFontSize = 13, labelFontWeight = 'bold')
    x_enc = alt.X('quality_transition:O', title = "Transitions of the content assesment class", sort = tl['quality_transition'].tolist(), axis = x_axis)
    y_enc = alt.Y('abs', title = 'New token ownership')
    color_enc = alt.Color('editor_top10:N', title = "Top 10 editors", sort = editor_order)
    order_enc = alt.Order('ranking', sort = 'ascending')

    chart = alt.Chart(transition_contribution).mark_bar().encode(x = x_enc, y = y_enc, color = color_enc, order = order_enc)
    return chart.properties(width = 400, title = ['%s'%article, 'New contributions by editor across content assesment phases'])

In [29]:
# One chart per sampled article. Articles without labels get an empty
# placeholder chart so that the list stays aligned with wp10_sample.
list_plot_transition_contribution = []
for i in wp10_sample.index:
    if len(list_labels[i]) == 0:
        list_plot_transition_contribution.append(alt.Chart(pd.DataFrame()).mark_bar())
        continue
    article_title = wp10_sample['article'].values[i]
    list_plot_transition_contribution.append(
        plot_transition_contribution(article_title, list_transition_contribution[i], list_transition_labels[i])
    )

In [30]:
# All plots
#alt.vconcat(*list_plot_transition_contribution).resolve_scale(color = 'independent')

In [31]:
# Random plot
# NOTE: the `random` module is not seeded in this notebook (the seed set for
# sampling applies to NumPy only), so the chart shown here differs between runs.
import random
random.choice(list_plot_transition_contribution)

Save all plots

In [32]:
from altair_saver import save

In [None]:
# Write one SVG per article that has quality labels, named after its page id.
for i in wp10_sample.index:
    if len(list_labels[i]) == 0:
        continue
    chart_path = 'charts/contributions_%s.svg'%wp10_sample['page_id'].values[i]
    save(list_plot_transition_contribution[i], chart_path)

In [None]:
# If error, then save plot manually using action.
#alt.renderers.enable('default', embed_options={'actions': True})
#i = 19
#list_plot_transition_contribution[i]