In [1]:
## Import necessary libraries
import pickle
import pandas as pd
import math as mt

In [2]:
## Load the tab-separated revision data into a DataFrame.
# The 'time' column is converted to datetimes while the file is read.
df = pd.read_csv(
    "enwiki_2002.txt",
    sep='\t',
    parse_dates=['time'],
)
df.head()

Unnamed: 0,title,time,revert,version,user
0,Industrial_revolution,2002-02-25 15:51:15,0,2,Conversion_script
1,Industrial_revolution,2001-08-17 14:14:23,0,1,Koyaanis_Qatsi
2,Spitfire_(disambiguation),2002-06-20 13:00:32,0,2,Uriyan
3,Spitfire_(disambiguation),2002-02-25 15:43:11,0,1,Conversion_script
4,Supermarine_Spitfire,2002-12-24 13:09:25,0,29,Tannin


In [3]:
## Apply a "status" value to every row.
# status = log10 of the running number of rows seen for that user (1-based),
# counting rows in order of 'time'.
# The counting is done on a time-sorted view; the assignment then aligns on the
# index labels, so df keeps the row order it was loaded with.
# NOTE(review): rows that share a timestamp are ordered by sort_values' default
# algorithm, which is not documented as stable - confirm that arbitrary
# tie-breaking is acceptable here.
df['status'] = (
    df.sort_values('time')
      .groupby('user')
      .cumcount()
      .add(1)
      .map(mt.log10)
)

In [4]:
# Spot-check the first rows, including the 'status' column.
df.head(3)

Unnamed: 0,title,time,revert,version,user,status
0,Industrial_revolution,2002-02-25 15:51:15,0,2,Conversion_script,4.415641
1,Industrial_revolution,2001-08-17 14:14:23,0,1,Koyaanis_Qatsi,2.060698
2,Spitfire_(disambiguation),2002-06-20 13:00:32,0,2,Uriyan,2.816904


In [5]:
## Row labels of the reversions in the data, i.e. the rows where revert == 1.
index_reversions = df.loc[df['revert'].eq(1)].index.tolist()
index_reversions[:9]

[12, 35, 109, 110, 137, 153, 159, 163, 222]

In [6]:
## For every reversion row, collect its [title, version] pair.
list_reverts = df.loc[index_reversions, ['title', 'version']].values.tolist()
list_reverts[:10]

[['Supermarine_Spitfire ', 21],
 ['Jesus_College,_Cambridge ', 2],
 ['Bubble_sort ', 17],
 ['Bubble_sort ', 16],
 ['Jews ', 25],
 ['Jews ', 11],
 ['Jews ', 6],
 ['Jews ', 3],
 ["Coup_d'état ", 17],
 ["Coup_d'état ", 12]]

In [7]:
# Worked example: a run of consecutive reverts on a single article.
df.iloc[1748:1758]

Unnamed: 0,title,time,revert,version,user,status
1748,MSC_Malaysia,2002-10-18 12:56:20,0,2,Willsmith,1.255273
1749,MSC_Malaysia,2002-10-18 11:36:55,0,1,Willsmith,0.30103
1750,Aluminum,2002-12-27 03:21:09,1,5,Camembert,3.450095
1751,Aluminum,2002-12-27 02:54:25,1,4,Isis,3.596157
1752,Aluminum,2002-12-27 02:35:20,1,5,64.229.101.208,0.0
1753,Aluminum,2002-12-27 01:15:03,1,4,Isis,3.595827
1754,Aluminum,2002-12-27 00:39:50,0,5,Mav,4.104043
1755,Aluminum,2002-12-27 00:34:38,0,4,Isis,3.595717
1756,Aluminum,2002-12-27 00:32:02,0,3,Isis,3.595496
1757,Aluminum,2002-02-25 15:43:11,0,2,Conversion_script,3.678609


**In cases like the one above, we assume that at 2002-12-27 02:54:25 Isis reverted Mav (not 64.229.101.208),**  
**so Isis reverted Mav twice: at 2002-12-27 01:15:03 and at 2002-12-27 02:54:25.**

In [8]:
## MAIN CODE: search for the reverted editor. Based on a search model.
# For every reversion [title, version] in list_reverts:
#   1) look at the rows of that Wikipedia article, e.g. 'Supermarine_Spitfire';
#   2) among them, find the row whose version is the one that was reverted to;
#   3) only rows with revert == 0 are considered, so that reversions of
#      reversions are not counted twice (the case discussed above);
#   4) the row label just before the match (label - 1) is taken as the reverted edit.
#      NOTE(review): this assumes the rows of an article are stored newest-first
#      with consecutive labels, as in the excerpt shown above - confirm that this
#      holds for the whole file.
#
# The (title, version) -> labels lookup is built in one pass over the non-revert
# rows instead of re-scanning the whole DataFrame once per reversion; the
# resulting lists are the same.
# Rows with a missing title or version are left out of the lookup: a missing
# value never compares equal to anything, so such rows can never be a match.
non_revert = df[df['revert'] == 0].dropna(subset=['title', 'version'])
reverted_by_key = {}
for label, title, version in zip(non_revert.index.tolist(),
                                 non_revert['title'].tolist(),
                                 non_revert['version'].tolist()):
    reverted_by_key.setdefault((title, version), []).append(label - 1)

# One list per reversion, in the same order as list_reverts. A list is empty when
# no non-revert row matches that title and version.
# NOTE(review): later cells pair the flattened result with index_reversions
# position by position, which is only meaningful if every reversion has exactly
# one match - verify before trusting the pairing.
index_reverted = [list(reverted_by_key.get((title, version), []))
                  for title, version in list_reverts]

In [9]:
# Flatten the per-reversion lists into one flat list of row labels.
index_reverted_flat = [label for matches in index_reverted for label in matches]

In [10]:
# Build one row per reversion. Values looked up through index_reversions describe
# the reverting row (time, user, status); values looked up through
# index_reverted_flat describe the row identified as reverted (user, status).
df_revert = pd.DataFrame({
    'Time': df.loc[index_reversions, 'time'].tolist(),
    'Reverter': df.loc[index_reversions, 'user'].tolist(),
    'Reverted': df.loc[index_reverted_flat, 'user'].tolist(),
    'Status_reverter': df.loc[index_reversions, 'status'].tolist(),
    'Status_reverted': df.loc[index_reverted_flat, 'status'].tolist(),
})

In [11]:
# Preview the reverter / reverted pairs.
df_revert.head()

Unnamed: 0,Reverted,Reverter,Status_reverted,Status_reverter,Time
0,Uriyan,Uriyan,2.817565,2.817565,2002-06-20 13:01:19
1,Conversion_script,Willsmith,4.426218,2.33646,2002-10-23 13:34:06
2,FvdP,FvdP,2.281033,2.283301,2002-11-13 21:58:59
3,Jmallios,FvdP,0.0,2.281033,2002-11-13 21:53:17
4,Ezra_Wax,Danny,2.130334,3.279895,2002-11-03 04:28:10


In [14]:
# Keep only the rows where reverter and reverted are different users, then
# renumber the rows from 0.
edges_df = (
    df_revert
    .loc[df_revert['Reverter'].ne(df_revert['Reverted'])]
    .reset_index(drop=True)
)

In [15]:
# Preview the first rows that remain after self-reverts were removed.
edges_df.head(3)

Unnamed: 0,Reverted,Reverter,Status_reverted,Status_reverter,Time
0,Conversion_script,Willsmith,4.426218,2.33646,2002-10-23 13:34:06
1,Jmallios,FvdP,0.0,2.281033,2002-11-13 21:53:17
2,Ezra_Wax,Danny,2.130334,3.279895,2002-11-03 04:28:10


In [16]:
# Report the number of rows in edges_df.
print("There are", edges_df.shape[0], "reversions in the data")

There are 4992 reversions in the data


In [None]:
## uncomment if you want to overwrite.
#with open ('edges_df', 'wb') as fw:
#    pickle.dump(edges_df, fw)