In [2]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
%load_ext Cython

# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

## Data processing

In [3]:
# List the per-language bot-to-bot revert dumps that are available on disk.
list(glob.iglob("../../datasets/??wiki_20161?01_*bot2bot.tsv"))

['../../datasets/enwiki_20161201_reverted_bot2bot.tsv',
 '../../datasets/zhwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/frwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/jawiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/eswiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/ptwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/dewiki_20161001_reverted_bot2bot.tsv']

In [4]:
def lang_code_from_path(path):
    """Return the first two characters of the file's base name.

    For the dataset files used here (e.g. ``enwiki_20161201_reverted_bot2bot.tsv``)
    those two characters are the wiki language code. Working from the base
    name keeps this correct regardless of how long the directory prefix is.
    """
    return os.path.basename(path)[:2]


# Map language code -> DataFrame of that language's reverts, with exact
# duplicate rows removed.
df_dict = {}
for filename in glob.glob("../../datasets/*bot2bot.tsv"):
    lang_code = lang_code_from_path(filename)
    df_dict[lang_code] = pd.read_csv(filename, sep="\t").drop_duplicates()

In [5]:
# Report the number of rows loaded for each language code.
for lang, lang_df in df_dict.items():
    print(lang, lang_df.shape[0])

fr 96533
ja 44990
pt 70973
zh 51536
es 88684
en 500553
de 68922


In [6]:
# First two English rows, transposed so each column reads as a line.
df_dict['en'].iloc[:2].T

Unnamed: 0,0,1
rev_id,273691771,136526894
rev_timestamp,20090227173507,20070607044209
rev_user,6505923,4534303
rev_user_text,Kbdankbot,PbBot
rev_page,5040439,3046554
rev_sha1,qj45ne2z4yfexmpaz5wfnbm2yrmqt4j,3xtnw7u4w9h6cg1smw97mqnr1en6a55
rev_minor_edit,False,False
rev_deleted,False,False
rev_parent_id,2.59117e+08,1.20932e+08
archived,False,False


### Combining into one dataframe

In [7]:
# Tag every per-language frame with its language code, then stack them with a
# single concat. Concatenating inside the loop re-copies the accumulated frame
# on every iteration, and seeding with an emptied copy of the 'en' frame is
# unnecessary once the frames are collected in a list first.
lang_frames = []
for lang, lang_df in df_dict.items():
    lang_df['language'] = lang
    lang_frames.append(lang_df)

df_all = pd.concat(lang_frames)

In [8]:
# Rows per language in the combined frame.
df_all.language.value_counts()

en    500553
fr     96533
es     88684
pt     70973
de     68922
zh     51536
ja     44990
Name: language, dtype: int64

### Namespace type

In [9]:
%%cython
def namespace_type(item):
    """Classify a namespace number into one of four coarse labels.

    ``item`` is converted with ``int()``. 0 maps to 'article', 14 to
    'category', any other value whose remainder modulo 2 is 1 to
    'other talk', and everything else to 'other page'.
    """
    ns = int(item)
    if ns == 0:
        return 'article'
    if ns == 14:
        return 'category'
    return 'other talk' if ns % 2 == 1 else 'other page'

In [10]:
# Label every row with the coarse namespace category of its page.
df_all['namespace_type'] = df_all['page_namespace'].map(namespace_type)

In [11]:
# Rows per namespace category.
df_all.namespace_type.value_counts()

article       561381
category      182602
other page    113512
other talk     64696
Name: namespace_type, dtype: int64

### Datetime parsing

In [12]:
%%cython
def get_year(timestamp):
    """Return the ``year`` attribute of ``timestamp``."""
    year = timestamp.year
    return year

In [13]:
# Both raw timestamp columns are parsed with the same YYYYMMDDHHMMSS format.
TIMESTAMP_FORMAT = "%Y%m%d%H%M%S"

df_all['reverting_timestamp_dt'] = pd.to_datetime(df_all['reverting_timestamp'], format=TIMESTAMP_FORMAT)
df_all['reverted_timestamp_dt'] = pd.to_datetime(df_all['rev_timestamp'], format=TIMESTAMP_FORMAT)

# Index by the reverting time, but keep the column too (drop=False) so it does
# not have to be parsed a second time for the arithmetic below.
df_all = df_all.set_index('reverting_timestamp_dt', drop=False)

df_all['time_to_revert'] = df_all['reverting_timestamp_dt'] - df_all['reverted_timestamp_dt']

# .dt.total_seconds() always yields float seconds. The previous
# .astype('timedelta64[s]') only did so on pandas < 2.0; on newer versions it
# keeps a timedelta dtype, which breaks the division/log10 steps that follow.
revert_seconds = df_all['time_to_revert'].dt.total_seconds()
df_all['time_to_revert_hrs'] = revert_seconds / (60*60)
df_all['time_to_revert_days'] = revert_seconds / (60*60*24)

# Vectorised accessors instead of per-element apply calls.
df_all['reverting_year'] = df_all['reverting_timestamp_dt'].dt.year

# log10 of a non-positive duration is NaN/-inf; such rows are inspected in the
# "Check time to revert for negatives" section.
df_all['time_to_revert_days_log10'] = np.log10(df_all['time_to_revert_days'])
df_all['time_to_revert_hrs_log10'] = np.log10(df_all['time_to_revert_hrs'])

In [14]:
# English-only reverts per reverting year, in chronological order.
df_all.loc[df_all['language'] == 'en', 'reverting_year'].value_counts().sort_index()

2004         2
2005       131
2006      3119
2007     17042
2008     33114
2009     36423
2010     30890
2011     63407
2012     48042
2013    201562
2014     20594
2015     24597
2016     21630
Name: reverting_year, dtype: int64

In [15]:
# Reverts per reverting year across all languages, in chronological order.
df_all['reverting_year'].value_counts().sort_index()

2004       302
2005      1598
2006      6356
2007     29417
2008     54978
2009     81521
2010     68500
2011    146998
2012    103113
2013    342596
2014     26722
2015     35408
2016     24576
2017       106
Name: reverting_year, dtype: int64

### Check time to revert for negatives

In [16]:
# Number of rows where the revert is timestamped before the reverted edit.
len(df_all[df_all['time_to_revert_days'] < 0])


3

In [17]:
# Number of rows with a strictly positive time to revert.
len(df_all[df_all['time_to_revert_days'] > 0])


922188

In [18]:
# Per-language count of rows with a negative time to revert.
df_all[df_all['time_to_revert_days'] < 0].groupby("language")['rev_id'].count()


language
de    3
Name: rev_id, dtype: int64

In [19]:
# Per-language count of rows with a strictly positive time to revert.
df_all[df_all['time_to_revert_days'] > 0].groupby("language")['rev_id'].count()


language
de     68919
en    500553
es     88684
fr     96533
ja     44990
pt     70973
zh     51536
Name: rev_id, dtype: int64

## Other processing and metadata

### Botpair -- [bot1] rv [bot2]

In [16]:
%%cython
def concat_botpair(row):
    """Build the directed pair label "<reverting bot> rv <reverted bot>" for a row.

    Both ``row['reverting_user_text']`` and ``row['rev_user_text']`` are
    converted with ``str()`` before being joined.
    """
    reverter = str(row['reverting_user_text'])
    reverted = str(row['rev_user_text'])
    return " rv ".join((reverter, reverted))

def sorted_botpair(row):
    """
    Return the string form of the sorted two-element list of bot names in a row,
    so that "A rv B" and "B rv A" share one direction-independent key.

    Python orders ``str`` values by code point (not by locale), so the result is
    deterministic; the particular order is arbitrary but consistent, which is
    all that is needed here.

    Both names are converted with ``str()`` first, matching ``concat_botpair``.
    Without that, a non-string value (e.g. a NaN for a missing user name) next
    to a string makes ``sorted`` raise ``TypeError``.
    """
    names = [str(row['reverting_user_text']), str(row['rev_user_text'])]
    return str(sorted(names))

In [17]:
# Directed pair label for every revert row.
df_all['botpair'] = df_all.apply(concat_botpair, axis='columns')

In [18]:
# Direction-independent pair key for every revert row.
df_all['botpair_sorted'] = df_all.apply(sorted_botpair, axis='columns')

### Namespace type

In [19]:
def namespace_type(item):
    """Classify a namespace number as article, category, other talk or other page.

    Plain-Python definition with the same logic as the ``namespace_type``
    defined earlier in the notebook under ``%%cython``; running this cell
    rebinds the name to this version.
    """
    ns = int(item)
    named_namespaces = {0: 'article', 14: 'category'}
    if ns in named_namespaces:
        return named_namespaces[ns]
    if ns % 2 == 1:
        return 'other talk'
    return 'other page'

In [20]:
# Recompute the namespace category column from page_namespace.
df_all['namespace_type'] = [namespace_type(ns) for ns in df_all['page_namespace']]

In [21]:
# Rows per namespace category after recomputing the column.
df_all.namespace_type.value_counts()

article       561197
category      182601
other page    113498
other talk     64678
Name: namespace_type, dtype: int64

### Reverts per page per botpair

In [22]:
# Group reverts by language and page, once per directed pair and once per
# direction-independent pair.
page_keys = ["language", "rev_page"]
gb_lpb = df_all.groupby(page_keys + ["botpair"])
gb_lpb_s = df_all.groupby(page_keys + ["botpair_sorted"])

In [23]:
# Count of reverts (non-null rev_id values) per language/page/directed pair,
# flattened back into ordinary columns.
df_lpb = gb_lpb['rev_id'].count().reset_index(name="reverts_per_page_botpair")
df_lpb.head()

Unnamed: 0,language,rev_page,botpair,reverts_per_page_botpair
0,de,61,RedBot rv EmausBot,1
1,de,81,DumZiBoT rv CarsracBot,1
2,de,81,MerlIwBot rv ZéroBot,1
3,de,82,Alecs.bot rv SieBot,1
4,de,101,Xqbot rv Dinamik-bot,1


In [24]:
# Same count, keyed on the direction-independent pair.
df_lpb_s = gb_lpb_s['rev_id'].count().reset_index(name="reverts_per_page_botpair_sorted")
df_lpb_s.head()

Unnamed: 0,language,rev_page,botpair_sorted,reverts_per_page_botpair_sorted
0,de,61,"['EmausBot', 'RedBot']",1
1,de,81,"['CarsracBot', 'DumZiBoT']",1
2,de,81,"['MerlIwBot', 'ZéroBot']",1
3,de,82,"['Alecs.bot', 'SieBot']",1
4,de,101,"['Dinamik-bot', 'Xqbot']",1


In [25]:
# Attach both per-page pair counts to every revert row. Left joins keep every
# row of df_all; the key columns have the same names on both sides.
df_all = df_all.merge(df_lpb, how='left', on=["language", "rev_page", "botpair"])

df_all = df_all.merge(df_lpb_s, how='left', on=["language", "rev_page", "botpair_sorted"])

### Cleaning

These are some anomalous cases from de.wikipedia.org that appear to be caused by a database error.

In [31]:
# Row count before the anomalous revisions are removed.
df_all.shape[0]

921974

In [34]:
# rev_ids of the revisions to exclude from the combined frame.
remove = [113534501, 142947562, 142917006]
is_removed = df_all['rev_id'].isin(remove)
df_all = df_all.loc[~is_removed]

In [35]:
# Row count after the listed rev_ids are removed.
len(df_all.index)

921971

### Final data format

In [36]:
# Number of rows in the final frame.
df_all.shape[0]

921971

In [37]:
# First two rows of the final frame, transposed so each column reads as a line.
df_all.head(2).T

Unnamed: 0,0,1
archived,False,False
language,pt,pt
page_namespace,0,0
rev_deleted,False,False
rev_id,3502324,3525691
rev_minor_edit,True,True
rev_page,220,220
rev_parent_id,3.44361e+06,3.50232e+06
rev_revert_offset,1,3
rev_sha1,caa06f914di10gfxmi2zja7lcwib1bj,ak6yunuoagcv06b9dz0dhvoug68lnac


## Output to file

In [38]:
# Persist the combined, cleaned frame as a pickle file.
df_all.to_pickle(path="../../datasets/pandas_df_all.pickle")