In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
edits = pd.read_csv('../../data/external/edits.tsv', sep='\t')
edits.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758


In [3]:
edits.shape

(22126031, 11)

In [4]:
users = pd.DataFrame({'user_id': edits.user_id.unique()})
print('Num of users:', len(users))

Num of users: 44514


In [5]:
edits['time'] = pd.to_datetime(edits.timestamp)
edits['date'] = edits.time.apply(lambda x: x.date())

# Getting time of first revert

My guess is that the longer it takes to revert an artible, the more motivated the user will be to contribute even, with the revert

In [6]:
first_revert = edits[edits.reverted==1].groupby(['user_id'])['time'].min()
df = pd.DataFrame(first_revert)
df.rename(columns={'time': 'time_first_revert'}, inplace=True)
df = df.reset_index()
print('Num of users with reverted articles:', len(df.user_id.unique()))

Num of users with reverted articles: 14829


# Getting time of first contribution

Will be necessary to see the distance between first update and first revert

In [7]:
first_contribution = edits.groupby(['user_id'])['time'].min()
first_contribution = pd.DataFrame(first_contribution)
first_contribution.head(2)

df = pd.merge(df, first_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_first_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution
3743,254756,2009-08-11 13:30:21,2005-12-14 23:37:08
9545,647008,2008-02-20 20:49:03,2006-12-16 16:55:50
74,4746,2004-08-25 06:59:56,2004-08-02 02:45:54
79,5077,2009-12-10 04:07:53,2009-12-10 04:07:53
1180,78383,2010-05-11 06:58:17,2010-05-11 06:58:17


In [8]:
last_contribution = edits.groupby(['user_id'])['time'].max()
last_contribution = pd.DataFrame(last_contribution)
last_contribution.head(2)

df = pd.merge(df, last_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_last_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution
10110,683097,2009-03-22 10:39:43,2009-03-21 19:39:21,2010-06-29 09:43:48
13145,886842,2010-07-18 14:38:34,2010-05-21 13:21:13,2010-07-18 14:38:34
2426,160800,2010-02-23 00:40:34,2010-02-22 23:54:32,2010-02-23 01:19:58
6929,470116,2009-11-03 21:58:46,2009-10-16 21:08:54,2010-04-03 23:01:12
4805,323900,2008-06-19 18:29:29,2006-03-10 04:08:25,2010-05-19 19:26:20


In [9]:
time_until_revert = df.time_first_revert - df.time_first_contribution
df['days_until_revert'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert
4390,297462,2007-09-05 23:17:09,2006-10-15 18:15:21,2010-08-30 23:57:37,325
3461,234145,2010-01-23 02:03:18,2009-10-29 22:16:17,2010-01-28 23:49:55,85
2190,146480,2006-10-06 19:15:57,2006-09-09 12:44:08,2010-04-16 13:30:07,27
4976,334287,2006-07-18 04:03:50,2006-05-04 16:46:05,2010-06-07 08:20:54,74
7879,533012,2005-12-17 09:30:20,2005-12-16 17:32:05,2010-08-30 20:28:12,0


In [10]:
time_of_contribution = df.time_last_contribution - df.time_first_contribution
df['days_of_contribution'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution
1719,113195,2005-03-14 11:27:43,2004-08-25 11:58:06,2010-04-26 16:52:41,200,200
12877,869978,2010-07-07 17:05:23,2007-12-27 01:32:01,2010-08-31 19:36:51,923,923
2553,168315,2006-05-09 13:15:24,2005-04-17 12:21:05,2010-08-28 09:03:48,387,387
14651,989194,2008-03-20 12:47:31,2007-03-30 17:05:45,2010-08-03 12:49:44,355,355
3946,267633,2010-01-31 22:22:57,2005-09-02 18:21:35,2010-08-23 11:40:06,1612,1612


# Getting information of the number of reverts each user has


In [11]:
total_updates = edits.groupby(['user_id'])['time'].count()
total_updates = pd.DataFrame(total_updates)
total_updates = total_updates.rename(columns={'time': 'total_updates'})
total_updates.head()

df = pd.merge(df, total_updates, how='left', on='user_id')
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates
11652,788724,2008-02-11 05:54:51,2008-02-10 08:09:22,2010-08-28 06:30:44,0,0,210
9561,648042,2006-09-09 16:09:47,2006-08-03 15:53:07,2009-09-30 00:38:59,37,37,144
8225,553999,2008-09-09 18:07:23,2008-06-01 16:26:32,2010-05-28 20:55:38,100,100,465
12980,876436,2002-08-27 16:23:25,2002-07-02 23:10:57,2010-08-21 20:08:20,55,55,1287
566,38309,2009-05-10 13:10:39,2005-08-20 00:44:33,2010-03-01 06:06:59,1359,1359,62


# Creating target variable

In [12]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'}) 

df1 = pd.merge(edits, first_revert_df, on='user_id', how='left')
df1.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT


In [13]:
df1['made_after_revert'] = df1.time > df1.time_first_revert
df2 = df1.groupby('user_id')['made_after_revert'].sum().reset_index(name='num_updates_after_revert')

df2['updated_after_revert'] = df2['num_updates_after_revert'].apply(lambda x: 1 if x > 0 else 0)
df2.drop(['num_updates_after_revert'], axis=1, inplace=True)
df2.head(2)

Unnamed: 0,user_id,updated_after_revert
0,30,0
1,44,0


In [14]:
df = pd.merge(df, df2, how='left', on='user_id')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1


# Getting information on editing habits

In [15]:
df['updates_per_day'] = df.total_updates / df.days_of_contribution
df.updates_per_day = df.updates_per_day.replace(np.inf, value=0)
df.head(10)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451
5,366,2006-10-25 21:23:22,2006-10-18 21:36:28,2010-08-23 19:26:02,6,6,64,1,10.666667
6,411,2007-10-23 16:38:38,2007-09-23 20:06:55,2010-08-31 23:58:15,29,29,53974,1,1861.172414
7,496,2007-03-20 09:27:54,2006-10-20 23:40:31,2010-06-09 03:46:08,150,150,230,1,1.533333
8,554,2009-03-03 17:29:38,2009-03-03 17:29:08,2010-01-14 13:35:37,0,0,8,1,0.0
9,577,2009-07-27 13:46:20,2009-07-24 10:59:03,2009-11-08 06:35:46,3,3,3599,1,1199.666667


# Getting information on behavior before revert

In [16]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'})
df1 = pd.merge(edits, first_revert_df, how='left', on='user_id')
df1.head()

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT
2,389427,9445560,304562912,0,2009-07-27 22:23:46,28b4e603c11f65343c39fb2946615925,0,-1,-1,18,1776,2009-07-27 22:23:46,2009-07-27,NaT
3,389427,9445560,304563043,0,2009-07-27 22:24:41,cac360b626a79d08f7369d59b7d6c475,0,-1,-1,4,1780,2009-07-27 22:24:41,2009-07-27,NaT
4,389427,9445560,346862274,0,2010-02-28 11:23:52,1524d65e5ffb54be4cf2fee045b8a414,0,-1,-1,-11,2227,2010-02-28 11:23:52,2010-02-28,NaT


In [17]:
df1['edited_before_revert'] = df1.time < df1.time_first_revert
edits_before_revert = df1[df1.edited_before_revert == True]

In [18]:
edits_before_revert = edits_before_revert.groupby('user_id')['time'].count().reset_index(name='edits_before_revert')

In [19]:
df = pd.merge(df, edits_before_revert, on='user_id', how='left')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day,edits_before_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0,3.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0,10.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561,7.0
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926,100.0
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451,149.0


In [20]:
df.shape

(14829, 10)

In [21]:
df.to_csv('../../data/processed/user_first_revert.csv', index=False)