In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
edits = pd.read_csv('../../data/external/edits.tsv', sep='\t')
edits.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758


In [3]:
edits.shape

(22126031, 11)

In [4]:
users = pd.DataFrame({'user_id': edits.user_id.unique()})
print('Num of users:', len(users))

Num of users: 44514


In [5]:
edits['time'] = pd.to_datetime(edits.timestamp)
edits['date'] = edits.time.apply(lambda x: x.date())

# Getting time of first revert

My guess is that the longer it takes to revert an artible, the more motivated the user will be to contribute even, with the revert

In [6]:
first_revert = edits[edits.reverted==1].groupby(['user_id'])['time'].min()
df = pd.DataFrame(first_revert)
df.rename(columns={'time': 'time_first_revert'}, inplace=True)
df = df.reset_index()
print('Num of users with reverted articles:', len(df.user_id.unique()))

Num of users with reverted articles: 14829


# Getting time of first contribution

Will be necessary to see the distance between first update and first revert

In [7]:
first_contribution = edits.groupby(['user_id'])['time'].min()
first_contribution = pd.DataFrame(first_contribution)
first_contribution.head(2)

df = pd.merge(df, first_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_first_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution
12118,819869,2010-03-07 18:59:49,2010-03-07 18:58:15
12169,822957,2007-03-06 00:49:04,2007-01-17 02:50:48
64,4240,2004-03-14 01:14:08,2004-03-10 16:12:35
13916,937789,2007-05-11 19:13:09,2007-05-11 19:09:46
5306,358285,2010-05-27 16:32:54,2007-03-01 23:15:48


In [8]:
last_contribution = edits.groupby(['user_id'])['time'].max()
last_contribution = pd.DataFrame(last_contribution)
last_contribution.head(2)

df = pd.merge(df, last_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_last_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution
5747,386846,2010-08-25 02:44:00,2009-09-12 20:58:12,2010-08-31 22:04:49
3299,221599,2009-01-09 16:08:03,2006-06-03 19:14:31,2010-03-28 13:55:13
9842,666785,2010-05-22 21:29:46,2010-05-22 21:24:59,2010-08-30 03:33:53
4607,311307,2007-08-14 10:22:10,2006-12-21 05:31:25,2009-12-23 05:11:20
6947,471387,2010-02-13 18:51:05,2010-01-23 23:53:25,2010-02-13 18:51:05


In [9]:
time_until_revert = df.time_first_revert - df.time_first_contribution
df['days_until_revert'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert
8180,550961,2009-08-06 11:49:13,2006-04-24 18:45:50,2009-10-06 12:12:00,1199
1485,98674,2010-02-11 03:30:25,2010-02-10 21:08:45,2010-08-12 15:30:41,0
11019,744723,2006-12-10 06:21:15,2006-12-10 05:45:52,2010-03-31 02:48:51,0
3475,234894,2006-12-04 13:17:22,2006-12-04 11:22:05,2010-04-18 17:36:14,0
13113,884425,2007-01-29 18:31:29,2005-04-22 23:36:19,2010-08-21 20:32:18,646


In [10]:
time_of_contribution = df.time_last_contribution - df.time_first_contribution
df['days_of_contribution'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution
2225,148942,2008-05-14 19:09:07,2008-01-15 23:13:02,2010-08-03 18:59:43,119,119
1109,73225,2009-08-17 01:19:44,2009-08-07 01:39:59,2010-08-31 23:56:21,9,9
3440,232811,2010-08-09 20:58:20,2010-08-08 17:46:32,2010-08-12 14:00:42,1,1
14587,983984,2009-01-06 16:52:51,2006-11-21 11:13:22,2010-08-08 21:19:39,777,777
9593,650312,2009-10-21 01:00:26,2009-10-21 00:59:45,2009-10-21 01:00:44,0,0


# Getting information of the number of reverts each user has


In [11]:
total_updates = edits.groupby(['user_id'])['time'].count()
total_updates = pd.DataFrame(total_updates)
total_updates = total_updates.rename(columns={'time': 'total_updates'})
total_updates.head()

df = pd.merge(df, total_updates, how='left', on='user_id')
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates
9280,628006,2007-03-21 20:18:49,2006-12-30 22:01:38,2010-07-16 09:08:27,80,80,298
9811,665123,2007-03-29 14:21:30,2006-08-20 05:03:04,2009-12-05 21:37:52,221,221,101
13901,936847,2010-07-31 11:09:03,2010-03-09 06:42:03,2010-07-31 11:09:03,144,144,11
5225,353059,2010-02-20 17:34:26,2010-01-17 19:59:06,2010-08-22 09:09:24,33,33,262
11599,785628,2010-02-13 22:10:10,2010-01-30 01:22:55,2010-08-20 12:14:21,14,14,6


# Creating target variable

In [12]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'}) 

df1 = pd.merge(edits, first_revert_df, on='user_id', how='left')
df1.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT


In [13]:
df1['made_after_revert'] = df1.time > df1.time_first_revert
df2 = df1.groupby('user_id')['made_after_revert'].sum().reset_index(name='num_updates_after_revert')

df2['updated_after_revert'] = df2['num_updates_after_revert'].apply(lambda x: 1 if x > 0 else 0)
df2.drop(['num_updates_after_revert'], axis=1, inplace=True)
df2.head(2)

Unnamed: 0,user_id,updated_after_revert
0,30,0
1,44,0


In [14]:
df = pd.merge(df, df2, how='left', on='user_id')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1


# Getting information on editing habits

In [15]:
df['updates_per_day'] = df.total_updates / df.days_of_contribution
df.updates_per_day = df.updates_per_day.replace(np.inf, value=0)
df.head(10)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451
5,366,2006-10-25 21:23:22,2006-10-18 21:36:28,2010-08-23 19:26:02,6,6,64,1,10.666667
6,411,2007-10-23 16:38:38,2007-09-23 20:06:55,2010-08-31 23:58:15,29,29,53974,1,1861.172414
7,496,2007-03-20 09:27:54,2006-10-20 23:40:31,2010-06-09 03:46:08,150,150,230,1,1.533333
8,554,2009-03-03 17:29:38,2009-03-03 17:29:08,2010-01-14 13:35:37,0,0,8,1,0.0
9,577,2009-07-27 13:46:20,2009-07-24 10:59:03,2009-11-08 06:35:46,3,3,3599,1,1199.666667


# Getting information on behavior before revert

In [16]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'})
df1 = pd.merge(edits, first_revert_df, how='left', on='user_id')
df1.head()

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT
2,389427,9445560,304562912,0,2009-07-27 22:23:46,28b4e603c11f65343c39fb2946615925,0,-1,-1,18,1776,2009-07-27 22:23:46,2009-07-27,NaT
3,389427,9445560,304563043,0,2009-07-27 22:24:41,cac360b626a79d08f7369d59b7d6c475,0,-1,-1,4,1780,2009-07-27 22:24:41,2009-07-27,NaT
4,389427,9445560,346862274,0,2010-02-28 11:23:52,1524d65e5ffb54be4cf2fee045b8a414,0,-1,-1,-11,2227,2010-02-28 11:23:52,2010-02-28,NaT


In [17]:
df1['edited_before_revert'] = df1.time < df1.time_first_revert
edits_before_revert = df1[df1.edited_before_revert == True]

In [18]:
edits_before_revert = edits_before_revert.groupby('user_id')['time'].count().reset_index(name='edits_before_revert')

In [19]:
df = pd.merge(df, edits_before_revert, on='user_id', how='left')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day,edits_before_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0,3.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0,10.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561,7.0
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926,100.0
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451,149.0


In [20]:
df.shape

(14829, 10)

In [21]:
df.to_csv('../../data/processed/users_behavior_complete.csv', index=False)