In [1]:
from datetime import datetime, date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy

In [2]:
# Load the raw, tab-separated edit log and preview its first two rows.
edits = pd.read_csv('../../data/external/edits.tsv', delimiter='\t')
edits.head(n=2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758


In [3]:
# (rows, columns) of the raw edit log loaded above.
edits.shape

(22126031, 11)

In [4]:
# One row per distinct user id that appears in the edit log.
users = pd.DataFrame({'user_id': edits['user_id'].unique()})
print('Num of users:', users.shape[0])

Num of users: 44514


In [5]:
# Parse the raw timestamp strings once, then derive the calendar day with the
# vectorised ``.dt`` accessor instead of calling a Python lambda for every row
# of the edit log. ``.dt.date`` yields ``datetime.date`` objects, exactly like
# the previous ``x.date()`` call did.
edits['time'] = pd.to_datetime(edits['timestamp'])
edits['date'] = edits['time'].dt.date

# Get each revert to be analyzed

In [6]:
# Draw the reverted edits to analyse. The sample is seeded so that
# Restart Kernel -> Run All selects the same 500,000 rows every time.
df = (
    edits.loc[
        edits['reverted'] == 1,
        ['user_id', 'article_id', 'revision_id', 'reverted', 'time', 'date'],
    ]
    .sample(n=500 * 1000, random_state=42)
)

In [7]:
# Distinct authors of the sampled reverted edits.
unique_users = df['user_id'].unique()

In [8]:
# (rows, columns) of the sampled reverted edits.
df.shape

(500000, 6)

In [9]:
# Keep only the edits written by users who own at least one sampled revert.
filtered_edit = edits.loc[edits['user_id'].isin(unique_users)]
filtered_edit.shape

(21355405, 13)

## Add first contribution

In [10]:
# Earliest edit time per user, as a one-column frame indexed by user_id.
first_contribution = (
    filtered_edit
    .groupby(['user_id'])['time']
    .min()
    .rename('time_first_contribution')
    .to_frame()
)

# Attach each author's first-contribution time to every sampled revert.
df = df.merge(first_contribution, how='left', on='user_id')
df.sample(5)

Unnamed: 0,user_id,article_id,revision_id,reverted,time,date,time_first_contribution
486445,381132,413992,349064655,1,2010-03-10 20:09:52,2010-03-10,2006-08-14 05:50:02
350747,392179,498166,87540251,1,2006-11-13 14:30:23,2006-11-13,2004-12-05 16:20:58
15106,958749,1945959,271590791,1,2009-02-18 16:11:11,2009-02-18,2004-06-08 22:15:05
402517,579592,31165,350065699,1,2010-03-15 20:08:07,2010-03-15,2006-05-08 18:34:33
470484,44283,22072404,278701096,1,2009-03-21 08:29:36,2009-03-21,2004-10-16 02:35:35


In [11]:
# Wide table: one row per calendar day, one column per user, each cell the
# number of edits that user made that day (NaN when there were none).
edits_per_day = (
    edits
    .groupby(['date', 'user_id'])['time']
    .count()
    .unstack()
    .reset_index()
)
edits_per_day.head()

user_id,date,30,44,54,60,64,120,146,162,187,...,999912,999924,999931,999944,999954,999965,999969,999970,999994,999998
0,2001-05-10,,,,,,,,,,...,,,,,,,,,,
1,2001-05-14,,,,,,,,,,...,,,,,,,,,,
2,2001-05-15,,,,,,,,,,...,,,,,,,,,,
3,2001-05-16,,,,,,,,,,...,,,,,,,,,,
4,2001-05-17,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Same day-by-user layout as the edits table, restricted to reverted edits.
reverts_per_day = (
    edits.loc[edits['reverted'] == 1]
    .groupby(['date', 'user_id'])['time']
    .count()
    .unstack()
    .reset_index()
)
reverts_per_day.head()

user_id,date,60,64,162,291,342,366,411,496,554,...,999459,999495,999645,999699,999730,999739,999907,999944,999954,999998
0,2001-08-13,,,,,,,,,,...,,,,,,,,,,
1,2001-09-16,,,,,,,,,,...,,,,,,,,,,
2,2001-10-18,,,,,,,,,,...,,,,,,,,,,
3,2001-10-26,,,,,,,,,,...,,,,,,,,,,
4,2001-10-29,,,,,,,,,,...,,,,,,,,,,


In [13]:
# For every sampled revert, summarise the author's activity on the days
# strictly before and strictly after the day of that revert.
after = []           # edits made on later days
before = []          # edits made on earlier days
reverts_before = []  # reverted edits made on earlier days
last_edit = []       # latest earlier day with an edit (NaN when there is none)
last_revert = []     # latest earlier day with a reverted edit (NaN when none)

# The day columns do not change inside the loop, so look them up once.
edit_days = edits_per_day['date']
revert_days = reverts_per_day['date']

# itertuples avoids constructing a Series for each row, which iterrows does.
for row in df.itertuples(index=False):
    user_id = row.user_id
    revert_day = row.date

    user_edits = edits_per_day[user_id]
    user_reverts = reverts_per_day[user_id]

    # Each comparison scans every day in the table; compute it once and reuse it.
    edits_earlier = edit_days < revert_day
    edits_later = edit_days > revert_day
    reverts_earlier = revert_days < revert_day

    # NaN cells (days without activity) are skipped by sum().
    after.append(int(user_edits[edits_later].sum()))
    before.append(int(user_edits[edits_earlier].sum()))
    reverts_before.append(int(user_reverts[reverts_earlier].sum()))

    # Only days where this user was active count; an empty selection gives NaN.
    last_edit.append(edit_days[edits_earlier & user_edits.notna()].max())
    last_revert.append(revert_days[reverts_earlier & user_reverts.notna()].max())

In [14]:
# Attach the per-revert activity summaries to the sampled frame, in this order.
# NOTE(review): `after` is filled from edits_per_day, i.e. it counts *edits*
# made after the revert, yet it is stored as 'num_reverts_after_revert'.
# The name is kept because later cells and the exported CSV use it — confirm
# whether it should be renamed.
derived_columns = {
    'num_articles_before_revert': before,
    'num_reverts_before_revert': reverts_before,
    'num_reverts_after_revert': after,
    'time_last_edit_before_revert': last_edit,
    'time_last_revert_before_revert': last_revert,
}
for column_name, column_values in derived_columns.items():
    df[column_name] = column_values

In [15]:
# Preview the sampled reverts with the newly attached activity columns.
df.head()

Unnamed: 0,user_id,article_id,revision_id,reverted,time,date,time_first_contribution,num_articles_before_revert,num_reverts_before_revert,num_reverts_after_revert,time_last_edit_before_revert,time_last_revert_before_revert
0,90148,245173,241813421,1,2008-09-29 17:35:54,2008-09-29,2008-02-03 20:46:29,79,19,115,2008-09-27,2008-09-23
1,970031,19168,181186825,1,2007-12-31 14:00:30,2007-12-31,2006-11-17 14:37:53,4151,162,4606,2007-12-30,2007-12-29
2,490515,1073455,112809819,1,2007-03-05 14:19:09,2007-03-05,2006-07-20 17:57:36,5110,2030,29795,2007-03-04,2007-03-03
3,607561,2773076,218967723,1,2008-06-12 23:55:24,2008-06-12,2006-07-28 03:08:32,26634,13750,32414,2008-06-11,2008-06-11
4,881194,29328,254122453,1,2008-11-26 00:04:40,2008-11-26,2006-04-23 22:18:15,11316,533,1043,2008-11-25,2008-11-25


In [16]:
# Whole days between the author's first contribution and the revert, read
# through the vectorised ``.dt`` accessor rather than a per-row ``apply``.
days_before_revert = df['time'] - df['time_first_contribution']
df['days_before_revert'] = days_before_revert.dt.days

df.head(2)

Unnamed: 0,user_id,article_id,revision_id,reverted,time,date,time_first_contribution,num_articles_before_revert,num_reverts_before_revert,num_reverts_after_revert,time_last_edit_before_revert,time_last_revert_before_revert,days_before_revert
0,90148,245173,241813421,1,2008-09-29 17:35:54,2008-09-29,2008-02-03 20:46:29,79,19,115,2008-09-27,2008-09-23,238
1,970031,19168,181186825,1,2007-12-31 14:00:30,2007-12-31,2006-11-17 14:37:53,4151,162,4606,2007-12-30,2007-12-29,408


In [17]:
# When a user has no edit on an earlier day, fall back to the time of their
# first contribution. The column is converted to datetime64 *before* filling:
# filling the object-dtype column of dates directly is what turned the
# fallback timestamps into integers, and the repair loop that followed relied
# on Series.iteritems(), which pandas 2.0 removed.
# The column is now datetime64 rather than a mix of date and Timestamp objects.
filled_last_edit = pd.to_datetime(df['time_last_edit_before_revert']).fillna(
    df['time_first_contribution']
)

df['time_last_edit_before_revert'] = filled_last_edit

In [18]:
# Whole days between the revert and the author's previous edit (or their first
# contribution, where that was used as the fallback), via ``.dt.days``.
days_after_last_edit = df['time'] - pd.to_datetime(df['time_last_edit_before_revert'])
df['days_after_last_edit'] = days_after_last_edit.dt.days

In [19]:
# Whole days between the day of this revert and the day of the author's
# previous revert. Both columns hold plain date objects, and the second one is
# NaN when there is no earlier revert, so convert to datetime64 first: the
# subtraction then yields NaT for those rows and ``.dt.days`` reports NaN,
# instead of depending on object-dtype arithmetic with missing values.
days_after_last_revert = (
    pd.to_datetime(df['date']) - pd.to_datetime(df['time_last_revert_before_revert'])
)
df['days_after_last_revert'] = days_after_last_revert.dt.days

In [20]:
# Binary label: 1 when the stored after-revert count is positive, otherwise 0.
df['target'] = df['num_reverts_after_revert'].gt(0).astype('int64')

In [21]:
# Class balance of the binary target.
df.target.value_counts()

1    498477
0      1523
Name: target, dtype: int64

In [22]:
# Persist the enriched sample, keeping the row index as the first CSV column.
df.to_csv(
    path_or_buf='../../data/processed/reverts_sampled_500k.csv',
    index=True,
)