In [20]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise pandas' display limit to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 500)


## Data processing

In [21]:
# List the dump files that match the bot-to-bot pattern in the datasets directory.
glob.glob(pathname="../../datasets/*bot2bot.tsv")

['../../datasets/enwiki_20161201_reverted_bot2bot.tsv',
 '../../datasets/zhwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/frwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/jawiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/eswiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/ptwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/dewiki_20161001_reverted_bot2bot.tsv']

In [22]:
def lang_code_from_path(filename):
    """Return the language code encoded in a dump file's name.

    Dump files are named ``<lang>wiki_<date>_reverted_bot2bot.tsv``, so the
    code is whatever precedes ``"wiki"`` in the base name. Working from the
    base name (instead of fixed character offsets into the full path) keeps
    the result correct when the data directory moves or when a language
    code is not exactly two characters long.

    Parameters
    ----------
    filename : str
        Path to a dump file, relative or absolute.

    Returns
    -------
    str
        The text before ``"wiki"`` in the base name; if the base name has no
        non-empty prefix before ``"wiki"``, its first two characters.
    """
    basename = os.path.basename(filename)
    prefix, marker, _ = basename.partition("wiki")
    # Fall back to the leading two characters of the file name (what the
    # previous fixed-offset slice yielded) when no usable prefix exists.
    return prefix if marker and prefix else basename[:2]


# Load every bot-to-bot revert dump into a dict keyed by language code.
df_dict = {}
for filename in glob.glob("../../datasets/*bot2bot.tsv"):
    lang_code = lang_code_from_path(filename)
    df_dict[lang_code] = pd.read_csv(filename, sep="\t")

In [23]:
# Print the number of rows loaded for each language.
for lang in df_dict:
    lang_df = df_dict[lang]
    print(lang, lang_df.shape[0])

de 137844
pt 141738
en 1001093
zh 102846
es 177368
ja 89980
fr 193066


In [24]:
# Show the first two English rows transposed, one column name per line.
df_dict['en'].iloc[:2].T

Unnamed: 0,0,1
rev_id,273691771,136526894
rev_timestamp,20090227173507,20070607044209
rev_user,6505923,4534303
rev_user_text,Kbdankbot,PbBot
rev_page,5040439,3046554
rev_sha1,qj45ne2z4yfexmpaz5wfnbm2yrmqt4j,3xtnw7u4w9h6cg1smw97mqnr1en6a55
rev_minor_edit,False,False
rev_deleted,False,False
rev_parent_id,2.59117e+08,1.20932e+08
archived,False,False


### Look for duplicates

In [25]:
# Select every English row whose rev_page is 12.
df_dict['en'].query(expr="rev_page ==12")

Unnamed: 0,rev_id,rev_timestamp,rev_user,rev_user_text,rev_page,rev_sha1,rev_minor_edit,rev_deleted,rev_parent_id,archived,reverting_id,reverting_timestamp,reverting_user,reverting_user_text,reverting_page,reverting_sha1,reverting_minor_edit,reverting_deleted,reverting_parent_id,reverting_archived,reverting_comment,rev_revert_offset,revisions_reverted,reverted_to_rev_id,page_namespace
185,396300331,20101112114014,11340674,WikitanvirBot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396211399.0,False,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,robot Modifying: [[fa:آنارشیسم]],1,1,396211399,0
186,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,396329931,20101112151912,13177883,Mjbmrbot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396304452.0,False,robot Modifying: [[fa:اقتدارگریزی]],1,1,396300331,0
191,396300331,20101112114014,11340674,WikitanvirBot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396211399.0,False,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,robot Modifying: [[fa:آنارشیسم]],1,1,396211399,0
192,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,396329931,20101112151912,13177883,Mjbmrbot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396304452.0,False,robot Modifying: [[fa:اقتدارگریزی]],1,1,396300331,0


In [26]:
# Same rev_page == 12 selection, with fully identical rows collapsed.
(
    df_dict['en']
    .query(expr="rev_page ==12")
    .drop_duplicates()
)

Unnamed: 0,rev_id,rev_timestamp,rev_user,rev_user_text,rev_page,rev_sha1,rev_minor_edit,rev_deleted,rev_parent_id,archived,reverting_id,reverting_timestamp,reverting_user,reverting_user_text,reverting_page,reverting_sha1,reverting_minor_edit,reverting_deleted,reverting_parent_id,reverting_archived,reverting_comment,rev_revert_offset,revisions_reverted,reverted_to_rev_id,page_namespace
185,396300331,20101112114014,11340674,WikitanvirBot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396211399.0,False,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,robot Modifying: [[fa:آنارشیسم]],1,1,396211399,0
186,396304452,20101112121743,3035831,VolkovBot,12,k8fcg4msl50tfo5n3kx3e834ej59zp1,True,False,396300331.0,False,396329931,20101112151912,13177883,Mjbmrbot,12,e75plyc598xvr9trp4edsu8l8g65d81,True,False,396304452.0,False,robot Modifying: [[fa:اقتدارگریزی]],1,1,396300331,0


In [27]:
# English row count before, then after, dropping fully identical rows.
print(df_dict['en'].shape[0])
print(df_dict['en'].drop_duplicates().shape[0])

1001093
500553


In [28]:
# Japanese row count before, then after, dropping fully identical rows.
print(df_dict['ja'].shape[0])
print(df_dict['ja'].drop_duplicates().shape[0])

89980
44990


### Combining into one dataframe

In [6]:
# Tag each per-language frame with its language code. The column is written
# onto the frames stored in df_dict (in place), so those frames carry a
# 'language' column after this cell runs.
for lang, lang_df in df_dict.items():
    lang_df['language'] = lang

# Stack all languages with one concat call. Growing df_all inside the loop
# re-copied every previously accumulated row on each iteration, and seeding it
# with an emptied copy of the 'en' frame made the cell depend on that key.
# Rows keep their per-file index labels (the index is not reset), and frames
# are stacked in df_dict iteration order.
df_all = pd.concat(list(df_dict.values()))

In [7]:
# Count combined rows per language code.
df_all.loc[:, 'language'].value_counts()

en    1001093
fr     193066
es     177368
pt     141738
de     137844
zh     102846
ja      89980
Name: language, dtype: int64

In [8]:
# Re-print the per-language row counts of the source frames.
for lang in df_dict.keys():
    lang_df = df_dict[lang]
    print(lang, len(lang_df.index))


de 137844
pt 141738
en 1001093
zh 102846
es 177368
ja 89980
fr 193066


In [30]:
# Combined row count before, then after, dropping fully identical rows.
print(df_all.shape[0])
print(df_all.drop_duplicates().shape[0])

1843935
921974
