In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
edits = pd.read_csv('../../data/wikipedia/edits.tsv', sep='\t')
edits.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758


In [3]:
users = pd.DataFrame({'user_id': edits.user_id.unique()})
print('Num of users:', len(users))

Num of users: 44514


In [4]:
edits['time'] = pd.to_datetime(edits.timestamp)
edits['date'] = edits.time.apply(lambda x: x.date())

# Getting time of first revert

My guess is that the longer it takes to revert an artible, the more motivated the user will be to contribute even, with the revert

In [5]:
first_revert = edits[edits.reverted==1].groupby(['user_id'])['time'].min()
df = pd.DataFrame(first_revert)
df.rename(columns={'time': 'time_first_revert'}, inplace=True)
df = df.reset_index()
print('Num of users with reverted articles:', len(df.user_id.unique()))

Num of users with reverted articles: 14829


# Getting time of first contribution

Will be necessary to see the distance between first update and first revert

In [6]:
first_contribution = edits.groupby(['user_id'])['time'].min()
first_contribution = pd.DataFrame(first_contribution)
first_contribution.head(2)

df = pd.merge(df, first_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_first_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution
2557,168478,2010-06-12 15:40:13,2010-06-12 15:40:04
1455,97088,2007-03-14 11:29:38,2004-03-20 17:47:10
6828,464533,2009-11-20 01:53:39,2009-11-20 00:26:47
9397,636346,2010-07-22 22:56:28,2010-06-16 21:48:58
3895,263741,2006-12-21 01:16:21,2006-12-21 01:15:55


In [7]:
last_contribution = edits.groupby(['user_id'])['time'].max()
last_contribution = pd.DataFrame(last_contribution)
last_contribution.head(2)

df = pd.merge(df, last_contribution, how='left', on='user_id')
df = df.rename(columns={'time': 'time_last_contribution'})
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution
3532,238984,2009-05-19 04:20:14,2009-05-19 04:20:14,2010-06-29 10:14:13
5482,369425,2009-02-13 17:56:10,2009-02-08 05:19:33,2010-06-02 14:43:20
496,33947,2007-12-19 16:18:38,2006-06-25 11:56:30,2010-07-01 07:11:53
3563,242547,2008-12-31 08:28:59,2008-09-26 10:34:20,2010-08-15 08:32:05
4833,325642,2009-12-26 21:56:31,2009-12-26 21:55:47,2009-12-27 20:34:51


In [8]:
time_until_revert = df.time_first_revert - df.time_first_contribution
df['days_until_revert'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert
2791,184448,2006-01-23 18:36:59,2005-10-31 10:26:17,2010-08-26 11:55:58,84
13106,883780,2008-06-27 02:46:50,2007-09-11 23:28:48,2010-08-31 00:59:20,289
11047,746794,2007-10-17 14:29:12,2007-06-14 20:00:17,2010-08-30 03:14:36,124
3581,243669,2009-12-23 03:56:17,2009-09-19 20:30:00,2010-08-31 03:52:11,94
8607,580910,2007-12-07 21:45:30,2006-02-25 20:46:03,2010-07-30 03:40:12,650


In [9]:
time_of_contribution = df.time_last_contribution - df.time_first_contribution
df['days_of_contribution'] = time_until_revert.apply(lambda x: x.days)

df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution
5728,385580,2010-03-24 23:32:41,2009-09-11 17:48:29,2010-08-18 00:31:45,194,194
3704,251623,2009-07-16 14:04:34,2009-06-18 21:57:37,2010-05-26 14:14:47,27,27
11211,760694,2010-08-06 18:45:31,2006-12-07 00:20:06,2010-08-06 20:13:07,1338,1338
11865,803016,2007-01-10 22:25:25,2007-01-02 14:15:36,2010-08-27 13:44:00,8,8
7097,480904,2008-07-22 23:07:57,2006-03-23 18:55:46,2010-04-07 21:21:02,852,852


# Getting information of the number of reverts each user has


In [10]:
total_updates = edits.groupby(['user_id'])['time'].count()
total_updates = pd.DataFrame(total_updates)
total_updates = total_updates.rename(columns={'time': 'total_updates'})
total_updates.head()

df = pd.merge(df, total_updates, how='left', on='user_id')
df.sample(5)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates
1966,130600,2008-05-14 22:59:11,2007-11-03 15:47:51,2010-08-13 14:20:21,193,193,193
8576,578765,2010-05-24 22:11:50,2010-05-24 13:35:43,2010-05-30 20:04:11,0,0,8
12790,863896,2008-07-08 00:05:36,2008-06-27 19:45:40,2009-09-23 16:53:09,10,10,23
7114,482233,2009-11-24 19:44:05,2009-11-24 19:22:21,2010-07-29 14:34:55,0,0,24
6881,467544,2009-01-16 20:01:39,2009-01-16 19:57:57,2009-09-19 20:11:00,0,0,68


# Creating target variable

In [11]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'}) 

df1 = pd.merge(edits, first_revert_df, on='user_id', how='left')
df1.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT


In [12]:
df1['made_after_revert'] = df1.time > df1.time_first_revert
df2 = df1.groupby('user_id')['made_after_revert'].sum().reset_index(name='num_updates_after_revert')

df2['updated_after_revert'] = df2['num_updates_after_revert'].apply(lambda x: 1 if x > 0 else 0)
df2.drop(['num_updates_after_revert'], axis=1, inplace=True)
df2.head(2)

Unnamed: 0,user_id,updated_after_revert
0,30,0
1,44,0


In [13]:
df = pd.merge(df, df2, how='left', on='user_id')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1


# Getting information on editing habits

In [14]:
df['updates_per_day'] = df.total_updates / df.days_of_contribution
df.updates_per_day = df.updates_per_day.replace(np.inf, value=0)
df.head(10)

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451
5,366,2006-10-25 21:23:22,2006-10-18 21:36:28,2010-08-23 19:26:02,6,6,64,1,10.666667
6,411,2007-10-23 16:38:38,2007-09-23 20:06:55,2010-08-31 23:58:15,29,29,53974,1,1861.172414
7,496,2007-03-20 09:27:54,2006-10-20 23:40:31,2010-06-09 03:46:08,150,150,230,1,1.533333
8,554,2009-03-03 17:29:38,2009-03-03 17:29:08,2010-01-14 13:35:37,0,0,8,1,0.0
9,577,2009-07-27 13:46:20,2009-07-24 10:59:03,2009-11-08 06:35:46,3,3,3599,1,1199.666667


# Getting information on behavior before revert

In [15]:
first_revert_df = pd.DataFrame(first_revert).reset_index()
first_revert_df = first_revert_df.rename(columns={'time': 'time_first_revert'})
df1 = pd.merge(edits, first_revert_df, how='left', on='user_id')
df1.head()

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size,time,date,time_first_revert
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661,2009-03-25 10:31:33,2009-03-25,NaT
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758,2009-03-25 10:41:09,2009-03-25,NaT
2,389427,9445560,304562912,0,2009-07-27 22:23:46,28b4e603c11f65343c39fb2946615925,0,-1,-1,18,1776,2009-07-27 22:23:46,2009-07-27,NaT
3,389427,9445560,304563043,0,2009-07-27 22:24:41,cac360b626a79d08f7369d59b7d6c475,0,-1,-1,4,1780,2009-07-27 22:24:41,2009-07-27,NaT
4,389427,9445560,346862274,0,2010-02-28 11:23:52,1524d65e5ffb54be4cf2fee045b8a414,0,-1,-1,-11,2227,2010-02-28 11:23:52,2010-02-28,NaT


In [16]:
df1['edited_before_revert'] = df1.time < df1.time_first_revert
edits_before_revert = df1[df1.edited_before_revert == True]

In [17]:
edits_before_revert = edits_before_revert.groupby('user_id')['time'].count().reset_index(name='edits_before_revert')

In [18]:
df = pd.merge(df, edits_before_revert, on='user_id', how='left')
df.head()

Unnamed: 0,user_id,time_first_revert,time_first_contribution,time_last_contribution,days_until_revert,days_of_contribution,total_updates,updated_after_revert,updates_per_day,edits_before_revert
0,60,2009-11-10 19:21:18,2009-11-10 19:12:56,2009-11-10 20:15:43,0,0,14,1,0.0,3.0
1,64,2010-07-28 20:46:23,2010-07-28 20:25:16,2010-08-08 23:30:29,0,0,33,1,0.0,10.0
2,162,2007-12-19 03:32:13,2007-07-07 21:47:30,2010-07-17 22:59:00,164,164,119,1,0.72561,7.0
3,291,2006-11-27 18:30:30,2006-04-24 22:03:29,2010-05-10 05:55:15,216,216,875,1,4.050926,100.0
4,342,2005-12-27 02:27:17,2004-06-13 16:44:39,2010-08-18 03:07:32,561,561,4279,1,7.627451,149.0


In [19]:
df.to_csv('../../data/users_behavior_complete.csv', index=False)