# Attribution and content assessment on Wikipedia

In [1]:
import pandas as pd
import numpy as np
import datetime
import altair as alt

In [2]:
from external.wikipedia import WikipediaDV, WikipediaAPI
from wikiwho_wrapper import WikiWho

In [3]:
from metrics.conflict import ConflictManager

In [4]:
#from visualization.owned_listener import OwnedListener

In [5]:
# Use Altair's default renderer; `embed_options={'actions': False}` is forwarded to the
# embedded chart (presumably hides the export/"view source" action menu — confirm in the Altair docs).
alt.renderers.enable('default', embed_options={'actions': False})
# Lift Altair's cap on the number of data rows per chart (`ts` below holds ~7k rows).
# The trailing `;` suppresses the cell output.
alt.data_transformers.disable_max_rows();

Input / Wikipedia page

In [6]:
# Title of the article to analyse, with underscores instead of spaces; it is inserted
# verbatim into the talk-page export URL and the download file name further down.
page_title = 'Andha_Naal'

## Attribution

Python package [wikiwho_wrapper](https://github.com/gesiscss/wikiwho_wrapper).

Gets content, revisions and editors. Based on these, the package computes word tokens; more specifically, it assigns ownership of each token.

In [7]:
# Client for the English Wikipedia API, wrapped in the project's data-view helper.
wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))
# Page record for `page_title`; its `page_id` is what the WikiWho calls below are keyed on.
the_page = wikipedia_dv.get_page(page_title)

Get actions and merge with editor names

In [8]:
# WikiWho client for the English-language Wikipedia.
wikiwho = WikiWho(lng='en')
# Edit-persistence actions for this page (has an `editor_id` column, used in the next cell).
# `the_page['page_id']` is used rather than attribute access so that every WikiWho call
# in this notebook reads the page id the same way.
agg_actions = wikiwho.dv.edit_persistence(the_page['page_id'])

In [9]:
# Look up the editors that appear in `agg_actions`, renaming `userid` to `editor_id`
# so the key column carries the same name as in `agg_actions`.
editors = wikipedia_dv.get_editors(agg_actions['editor_id'].unique()).rename(columns = {'userid': 'editor_id'})

Get all content/tokens and revisions, then merge them with `ConflictManager`, which exposes a reformatted DataFrame called `all_actions`.

In [10]:
# All content (tokens) that WikiWho holds for this page.
all_content = wikiwho.dv.all_content(the_page['page_id'])

In [11]:
# Revisions of the article, as returned by WikiWho; merged with the tokens by ConflictManager.
revisions = wikiwho.dv.rev_ids_of_article(the_page['page_id'])

TODO: Understand role of stop words.

In [12]:
# Combine tokens and revisions; once `calculate()` has run, the merged result is read
# from `calculator.all_actions`. 'en' is presumably the language code — confirm in ConflictManager.
calculator = ConflictManager(all_content, revisions, 'en')
# Trailing `;` hides the return value; the progress messages are still printed.
calculator.calculate();

Preparing elegible token actions
Merge elegible actions and revisions
Get the conflicts
Calculate time differences of undos
Get elegible_actions
Calculate the token conflict


In [13]:
# Token-level action log produced by ConflictManager; every later step derives from it.
owned = calculator.all_actions

In [14]:
#owned['editor_id'] = owned.editor.map( lambda x: -1 if x.startswith('0|') else x).astype(int)
#owned = owned.merge(editors[['editor_id', 'name']], on='editor_id')
#owned.rename(columns={'name': 'editor_name'}, inplace=True)
#owned['editor_name'] = owned['editor_name'].fillna("Unregistered")

In [15]:
#owned.head()

Convert data into a time-series

In [16]:
# Put each token's actions in chronological order and index the frame by token.
tsr = (
    owned
    .sort_values(by=['token_id', 'rev_time'])
    .set_index('token_id')
)

In [17]:
# Parse `rev_time` into datetimes and strip any timezone information, leaving naive timestamps.
tsr['rev_time'] = pd.to_datetime(tsr['rev_time']).dt.tz_localize(None)

In [18]:
# Every day with observations, offset and sorted in descending order.
# Each day is moved forward by one day (to the following midnight) so that the
# `rev_time <= day` filter in the ownership loop covers everything done on that calendar day.
days = pd.Series(tsr['rev_time'].dt.to_period('D').unique()).sort_values(ascending=False).dt.to_timestamp('D') + pd.DateOffset(days=1)
# Peek at a few of the cut-off days (the slice skips the most recent one).
days[1:5]

1     2020-04-14
135   2020-04-07
197   2020-03-05
249   2020-01-09
dtype: datetime64[ns]

In [19]:
tsr.head()

Unnamed: 0_level_0,action,article_title,page_id,o_editor,o_rev_id,token,rev_id,rev_time,editor
token_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,in,Andha_Naal,19392462,1226602,239670862,infobox,239670862,2008-09-19 22:44:16,1226602
1,out,Andha_Naal,19392462,1226602,239670862,infobox,950770666,2020-04-13 19:07:25,0|2405:204:2283:7BC6:0:0:148B:28A0
1,in,Andha_Naal,19392462,1226602,239670862,infobox,950771190,2020-04-13 19:11:07,35519569
2,in,Andha_Naal,19392462,1226602,239670862,film,239670862,2008-09-19 22:44:16,1226602
2,out,Andha_Naal,19392462,1226602,239670862,film,950770666,2020-04-13 19:07:25,0|2405:204:2283:7BC6:0:0:148B:28A0


In [20]:
# Ownership snapshots: for every cut-off day, count the tokens each editor holds.
_abs = []
_tsr = tsr
for rev_time in days:
    # `days` runs from newest to oldest, so the previous slice only ever needs narrowing.
    _tsr = _tsr.loc[_tsr['rev_time'] <= rev_time]
    # Latest action per token up to the cut-off; a token whose latest action is 'out' is gone.
    latest = _tsr.groupby('token_id').last()
    alive = latest.loc[latest['action'].ne('out')]
    # One row per editor with the number of tokens still standing, stamped with the cut-off day.
    per_editor = alive.groupby(['editor']).size().to_frame('abs').reset_index()
    per_editor.insert(0, 'rev_time', rev_time)
    _abs.append(per_editor)

In [21]:
# Stack the daily snapshots into one long frame with one row per (day, editor).
ts = pd.concat(_abs)
ts = ts.groupby(['rev_time', 'editor']).agg({'abs': 'sum'})
#ts['rel'] = ts.groupby(level=0).apply( lambda x:  100*x / x.sum() )
ts = ts.reset_index()
# Rank editors by their ownership summed over all days (dense ranks, 1 = largest).
# The string 'sum' replaces np.sum, which newer pandas versions warn about inside transform.
ts['ranking'] = ts.groupby('editor')['abs'].transform('sum').rank(ascending=False, method='dense').astype(int)
# Keep the editor id for ranks 1-10 and lump everyone else into 0.
# Ranks start at 1, so the earlier `< 10` kept only nine ranks despite the "top 10" label.
ts['editor_top10'] = np.where(ts['ranking'] <= 10, ts['editor'], 0)
print(len(ts))
ts.head()

7072


Unnamed: 0,rev_time,editor,abs,ranking,editor_top10
0,2008-09-20,1226602,200,5,1226602
1,2008-10-06,0|75.169.107.18,21,38,0
2,2008-10-06,1226602,197,5,1226602
3,2008-10-16,0|75.169.107.18,21,38,0
4,2008-10-16,1226602,197,5,1226602


Plot the time-series data

In [22]:
# Editor ids ordered from best- to worst-ranked; passed to the charts as the colour sort order.
editor_order = ts[['editor', 'ranking']].drop_duplicates().sort_values(by=['ranking'])['editor'].values

In [23]:
# Stacked area chart of token ownership over time, one area per editor group.
attribution = alt.Chart(ts).mark_area().encode(
    x = alt.X("rev_time", title = 'Time of revision'),
    y = alt.Y("abs", title = 'Tokens owned'),
    #y = alt.Y("abs", stack="normalize", axis = alt.Axis(title = 'Tokens Owned (%)', format='%')),
    # Colour by the id kept in `editor_top10`; all remaining editors share the value 0.
    color = alt.Color('editor_top10:N', title = 'Top 10 editors', sort = editor_order),
    # Stack the areas in ranking order.
    order = alt.Order('ranking', sort = 'ascending')
).properties(width = 800, title = 'Ownership by time and editor')

In [24]:
attribution

## Content assessment

Python package [articlequality](https://github.com/wikimedia/articlequality)

In [25]:
import articlequality
import mwxml
import requests

Download talk page

TODO: Limits? Max number of revisions? Filesize? Cache?

In [26]:
#talk_file = 'data/Talk_Andha_Naal_20200415150915.xml'

In [None]:
# Download the talk page of the article, with its history, as a MediaWiki XML export.
# The file name carries a timestamp so earlier downloads are not overwritten.
talk_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
talk_file = 'data/Talk_%s_%s.xml' % (page_title, talk_now)
talk_url = 'https://en.wikipedia.org/wiki/Special:Export/Talk:%s?templates=True&history=True' % page_title
# TLS certificate verification stays enabled (the request used to pass verify=False),
# and a timeout keeps the cell from hanging indefinitely on a stalled connection.
resp = requests.get(talk_url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as if it were the export.
resp.raise_for_status()
# Open the file only after a successful download, so a failed request leaves no empty file.
with open(talk_file, 'wb') as f:
    f.write(resp.content)

Extract labels  
Specifically, the first observation of each project/label pair.

In [27]:
# Parse the downloaded talk-page export and collect the quality labels found in it.
# The export is UTF-8 XML, so the encoding is set explicitly instead of relying on the
# platform default (which is not UTF-8 on every system).
with open(talk_file, encoding='utf-8') as f:
    dump = mwxml.Dump.from_file(f)
    _labels = articlequality.extract_labelings(dump=dump)
    # Build the frame inside the `with` block, in case the labelings are produced
    # lazily while the file is still being read.
    labels = pd.DataFrame(list(_labels))

.............................................................................................

In [28]:
# Category order for the ordered `class` column.
class_order = ['stub', 'start', 'c', 'b', 'a', 'ga', 'fa']
# `wp10` holds the assessed class; give it a plainer name.
labels = labels.rename(columns={'wp10': 'class'})
labels['timestamp'] = pd.to_datetime(labels['timestamp'])
labels['class'] = pd.Categorical(labels['class'], categories=class_order, ordered=True)

In [29]:
labels

Unnamed: 0,page_title,project,timestamp,class
0,Andha Naal,india,2009-01-11 05:49:17,stub
1,Andha Naal,india,2011-12-29 06:17:08,start
2,Andha Naal,india,2012-06-07 09:28:04,c
3,Andha Naal,film,2014-02-14 19:41:38,c
4,Andha Naal,india,2014-05-15 17:06:28,ga
5,Andha Naal,film,2014-05-15 17:06:28,ga
6,Andha Naal,india,2017-03-31 23:09:24,fa
7,Andha Naal,film,2017-03-31 23:09:24,fa


Plot assessments

In [30]:
# One tick per label in `labels`, placed at the time the class was adopted,
# with one row per WikiProject and the class shown as colour.
assesment = alt.Chart(labels).mark_tick(thickness=3).encode(
    x = alt.X('timestamp', title = 'Time of adoption'),
    # The y channel is declared with alt.Y; alt.X was used here by mistake.
    y = alt.Y('project', title = 'WikiProject'),
    color = alt.Color('class:O', title = "Class", sort = ['stub', 'start', 'c', 'b', 'a', 'ga', 'fa'])
).properties(title = 'Content assesment')

In [31]:
assesment

**Combining plots**

In [32]:
# Stack the two charts vertically: each keeps its own colour scale, both share the x (time) scale.
attribution_assesment = alt.vconcat(attribution, assesment).resolve_scale(color = 'independent', x = 'shared')
attribution_assesment

## Contributions to content assessment *transitions*

In [33]:
# First entry for each class that actually occurs (`observed=True` skips unused categories).
# NOTE(review): this yields the earliest adoption only if `labels` is in chronological order — confirm.
labels_first = labels.groupby('class', observed=True).first().reset_index()

In [34]:
# Bin edges for the transition periods: the first timestamp of each class, with the earliest
# revision time inserted at the front (index 0) and the latest snapshot day at the end (index == size).
transition_bins = np.insert(labels_first['timestamp'].values, [0,labels_first['timestamp'].size], [tsr['rev_time'].min(),ts['rev_time'].max()])

In [35]:
# Transition labels
tl = pd.DataFrame()
tl['time'] = transition_bins
tl['class'] = np.insert(labels_first['class'].values.astype(str), [0,labels_first['class'].size], [' ',' '])
tl['class_transition'] = tl['class'].shift(1) + " → " + tl['class']
tl['class_transition'] = pd.Categorical(tl['class_transition'], categories = tl['class_transition'][1:].tolist(), ordered = True)
tl.insert(0, 'time_prior', tl['time'].shift(1))
tl.drop(0, inplace=True)
tl.drop('class', axis=1, inplace=True)
tl

Unnamed: 0,time_prior,time,class_transition
1,2008-09-19 22:44:16,2009-01-11 05:49:17,→ stub
2,2009-01-11 05:49:17,2011-12-29 06:17:08,stub → start
3,2011-12-29 06:17:08,2012-06-07 09:28:04,start → c
4,2012-06-07 09:28:04,2014-05-15 17:06:28,c → ga
5,2014-05-15 17:06:28,2017-03-31 23:09:24,ga → fa
6,2017-03-31 23:09:24,2020-04-15 00:00:00,fa →


In [36]:
# Assign every ownership snapshot to the transition period its day falls in;
# `include_lowest=True` keeps values equal to the first bin edge inside the first period.
ts['class_transition'] = pd.cut(ts['rev_time'], transition_bins, include_lowest=True, labels = tl['class_transition'].values)

In [37]:
ts.tail()

Unnamed: 0,rev_time,editor,abs,ranking,editor_top10,class_transition
7067,2020-04-15,7098284,4,47,0,fa →
7068,2020-04-15,7611264,34,11,0,fa →
7069,2020-04-15,7902442,3,21,0,fa →
7070,2020-04-15,8372814,11,55,0,fa →
7071,2020-04-15,915833,188,31,0,fa →


Plot each change in content assessment

In [38]:
def big_diff(nums):
    """Return the spread of `nums`: its largest element minus its smallest."""
    highest = max(nums)
    lowest = min(nums)
    return highest - lowest

In [39]:
# Length of each transition period as a fraction of the whole span covered by the bin edges.
transition_duration = np.diff(transition_bins) / big_diff(transition_bins) # normalised

In [40]:
# One area-chart panel per transition period, laid out side by side. Panel width and
# number of x ticks are proportional to how long the period lasted.
tick_count = np.rint(transition_duration*20)
# Only the middle panel carries the x-axis title, so it appears once under the row.
center_panel = round(len(tl['class_transition'])/2)
fig_list = []
for i, transition in enumerate(tl['class_transition'].values):
    fig = alt.Chart(ts[ts['class_transition'] == transition]).mark_area().encode(
        x = alt.X(
            "rev_time", 
            title = None if i != center_panel else 'Time of revision', 
            axis = alt.Axis(grid = False, format = '%b %Y', tickCount = tick_count[i])
        ),
        # Only the first panel shows the y-axis title, labels, domain line and ticks.
        y = alt.Y(
            "abs", 
            title = None if i > 0 else 'Token ownership', 
            axis = alt.Axis(labels = (i==0), domain = (i==0), ticks = (i==0))
        ),
        color = alt.Color('editor_top10:N', title = "Top 10 editors", sort = editor_order),
        order = alt.Order('ranking', sort = 'ascending')
    ).properties(
        title = transition, 
        width = 800*transition_duration[i]
    )
    fig_list.append(fig)
# Join the panels horizontally with a shared y scale and axis so heights are comparable.
attribution_assesment_transitions = alt.hconcat(*fig_list).resolve_scale(y='shared').resolve_axis(y='shared')
attribution_assesment_transitions

### New tokens contribution in each transition

For each editor and each improved content assessment (i.e. change in class), find the number of new tokens, that is, the tokens that the editor adds between changes in the class (Stub, Start, C-class, B-class, A-class, Good article, Featured article).


In [41]:
# Tag every token action with the transition period its revision time falls in.
# `assign` returns a new frame, so `tsr` itself is left unchanged; the earlier
# `_tsr = tsr` was only an alias, and the column assignment silently altered `tsr` too.
_tsr = tsr.assign(
    class_transition=pd.cut(tsr['rev_time'], transition_bins, include_lowest=True, labels=tl['class_transition'].values)
)
_tsr.head()

Unnamed: 0_level_0,action,article_title,page_id,o_editor,o_rev_id,token,rev_id,rev_time,editor,class_transition
token_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,in,Andha_Naal,19392462,1226602,239670862,infobox,239670862,2008-09-19 22:44:16,1226602,→ stub
1,out,Andha_Naal,19392462,1226602,239670862,infobox,950770666,2020-04-13 19:07:25,0|2405:204:2283:7BC6:0:0:148B:28A0,fa →
1,in,Andha_Naal,19392462,1226602,239670862,infobox,950771190,2020-04-13 19:11:07,35519569,fa →
2,in,Andha_Naal,19392462,1226602,239670862,film,239670862,2008-09-19 22:44:16,1226602,→ stub
2,out,Andha_Naal,19392462,1226602,239670862,film,950770666,2020-04-13 19:07:25,0|2405:204:2283:7BC6:0:0:148B:28A0,fa →


In [42]:
# Last action of every token within each transition period. `observed=True` restricts the
# result to (period, token) pairs that actually occur; otherwise every period is paired with
# every token and the unobserved pairs become all-NaN rows that slip through the
# `!= 'out'` filter below.
last_action = _tsr.groupby(['class_transition', 'token_id'], observed=True).last()
# Tokens whose last action in the period is not a removal.
surv = last_action[last_action['action'] != 'out']
# Tokens per (period, editor). `observed=False` is spelled out to keep the zero-count rows
# for periods in which an editor has no tokens, as the implicit default did before.
count = surv.groupby(['class_transition', 'editor'], observed=False).size()
count = pd.DataFrame({'abs': count}).reset_index()
transition_contr = count

In [43]:
# Rank editors by their token count summed over all transition periods (dense ranks, 1 = most).
# The string 'sum' replaces np.sum, which newer pandas versions warn about inside transform.
transition_contr['ranking'] = transition_contr.groupby('editor')['abs'].transform('sum').rank(ascending=False, method='dense').astype(int)
# Keep the editor id for ranks 1-10 and lump everyone else into 0.
# Ranks start at 1, so the earlier `< 10` kept only nine ranks despite the "top 10" label.
transition_contr['editor_top10'] = np.where(transition_contr['ranking'] <= 10, transition_contr['editor'], 0)

In [44]:
# Stacked bar chart: one bar per transition period, split by editor group.
contributions_assesment_transitions = alt.Chart(transition_contr).mark_bar().encode(
    # Bars follow the chronological order of the transitions in `tl`.
    x = alt.X('class_transition:O', title = "Transitions of the content assesment class", sort = tl['class_transition'].tolist(), axis = alt.Axis(labelAngle = 0, labelFontSize = 13, labelFontWeight = 'bold')),
    y = alt.Y('abs', title = 'New token ownership'),
    # NOTE(review): `editor_order` comes from the ranking in `ts`, while `ranking` here was
    # recomputed on `transition_contr` — confirm that mixing the two is intended.
    color = alt.Color('editor_top10:N', title = "Top 10 editors", sort = editor_order),
    order = alt.Order('ranking', sort = 'ascending')
).properties(width = 800, title = 'New contributions by editor across content assesment phases')
contributions_assesment_transitions

**Save charts**

In [45]:
from altair_saver import save

In [None]:
# Write every chart to its PNG file at twice the default scale.
exports = [
    (attribution, 'charts/attribution.png'),
    (assesment, 'charts/assesment.png'),
    (attribution_assesment, 'charts/attribution_assesment.png'),
    (attribution_assesment_transitions, 'charts/attribution_assesment_transitions.png'),
    (contributions_assesment_transitions, 'charts/contributions_assesment_transitions.png'),
]
for chart, target in exports:
    save(chart, target, scale_factor=2.0)