In [19]:
from datetime import datetime, date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy

In [20]:
edits = pd.read_csv('../../data/external/edits.tsv', sep='\t')
edits.head(2)

Unnamed: 0,user_id,article_id,revision_id,namespace,timestamp,md5,reverted,reverted_user_id,reverted_revision_id,delta,cur_size
0,389427,9445560,279555872,0,2009-03-25 10:31:33,175d8a73b80b3c95719c458b698e5179,0,-1,-1,276,1661
1,389427,9445560,279556938,0,2009-03-25 10:41:09,48fa53402b2819283472a899a30117a1,0,-1,-1,97,1758


In [21]:
edits.shape

(22126031, 11)

In [22]:
users = pd.DataFrame({'user_id': edits.user_id.unique()})
print('Num of users:', len(users))

Num of users: 44514


In [23]:
edits['time'] = pd.to_datetime(edits.timestamp)
edits['date'] = edits.time.apply(lambda x: x.date())

# Get each revert to be analyzed

In [24]:
df = edits[edits.reverted==1][['user_id', 'article_id', 'revision_id', 'reverted', 'time', 'date']].sample(500*1000)

In [26]:
unique_users = df.user_id.unique()

In [27]:
df.shape

(500000, 6)

In [28]:
filtered_edit = edits[edits.user_id.isin(unique_users)]
filtered_edit.shape

(21323484, 13)

## Add first contribution

In [29]:
first_contribution = filtered_edit.groupby(['user_id'])['time'].min()
first_contribution = pd.DataFrame(first_contribution)
first_contribution = first_contribution.rename(columns={'time': 'time_first_contribution'})
first_contribution.head(2)

df = pd.merge(df, first_contribution, how='left', on='user_id')
df.sample(5)

Unnamed: 0,user_id,article_id,revision_id,reverted,time,date,time_first_contribution
103848,178417,542198,11934182,1,2005-03-31 03:17:13,2005-03-31,2004-12-16 22:53:58
390749,386018,7885,251617057,1,2008-11-13 19:59:33,2008-11-13,2006-01-25 20:27:03
18470,955857,68453,356974779,1,2010-04-19 12:38:51,2010-04-19,2006-09-19 02:20:42
135249,117893,407149,205521398,1,2008-04-14 09:50:19,2008-04-14,2007-02-11 22:21:35
62426,764788,11826,323603600,1,2009-11-03 02:07:23,2009-11-03,2006-11-15 03:37:04


In [30]:
edits_per_day = edits.groupby(['date', 'user_id'])['time'].count().unstack()
edits_per_day = edits_per_day.reset_index()
edits_per_day.head()

user_id,date,30,44,54,60,64,120,146,162,187,...,999912,999924,999931,999944,999954,999965,999969,999970,999994,999998
0,2001-05-10,,,,,,,,,,...,,,,,,,,,,
1,2001-05-14,,,,,,,,,,...,,,,,,,,,,
2,2001-05-15,,,,,,,,,,...,,,,,,,,,,
3,2001-05-16,,,,,,,,,,...,,,,,,,,,,
4,2001-05-17,,,,,,,,,,...,,,,,,,,,,


In [31]:
after = []
before = []
for i, row in df.iterrows():
    user_id = row['user_id']
    time = row['date']
    num_articles_after_revert = edits_per_day[user_id][edits_per_day.date > time].sum()
    num_articles_before_revert = edits_per_day[user_id][edits_per_day.date < time].sum()
    after.append(int(num_articles_after_revert))
    before.append(int(num_articles_before_revert))

KeyboardInterrupt: 

In [None]:
df['num_articles_after_revert'] = after
df['num_articles_before_revert'] = before

In [None]:
df.head()

In [None]:
days_before_revert = df.time - df.time_first_contribution
df['days_before_revert'] = days_before_revert.apply(lambda x: x.days)
df.head(2)

In [None]:
df['target'] = df.num_articles_after_revert.apply(lambda x: 1 if x > 0 else 0)

In [None]:
df.target.value_counts()

# Comments

In [None]:
cnt = Counter()

def get_bag(comment):
    comment = nlp(comment)
    words = []
    for word in comment:
        wanted_word = [
            not word.is_stop,
            not word.is_punct,
            not word.is_space,
        ]
        if all(wanted_word):
            cnt[word.lemma_] += 1
            words.append(word.lemma_)
    return words

In [18]:
df.to_csv('../../data/processed/sampled_reverts_500k_with_comments.csv', index=True)