# Sample summary table stats

In [1]:
# passive reverts = times when bot was reverted by someone else
# active reverts = times when bot reverts someone else

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

In [4]:
df = pd.read_csv('revs_scored_jan.tsv', sep='\t', header=0)

In [5]:
days = df.rev_timestamp.map(lambda ts: datetime.utcfromtimestamp(ts).day)
df['day'] = days

In [28]:
df_sample = df[df.day<22]
df_bots = df_sample[df_sample.user_is_bot]

In [29]:
df_reverters = df[df.is_revert]

df_reverted = df[df.is_reverted]
df_reverted.seconds_to_revert = df_reverted.seconds_to_revert.astype('str').astype('int')
df_reverted = df_reverted[df_reverted.seconds_to_revert < 86400]
df_reverted.revert_id = df_reverted.revert_id.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [30]:
# merging df_reverted and df_reverters to get dataset of all reverted edits

all_reverted_edits = pd.merge(df_reverted, df_reverters, 
             how='inner', 
             left_on='revert_id', 
             right_on='rev_id', 
             suffixes=('', '_reverter')
)

In [31]:
df_bot_active_reverts = all_reverted_edits[all_reverted_edits.user_is_bot_reverter == True]
df_bot_passive_reverts = all_reverted_edits[all_reverted_edits.user_is_bot == True]

In [32]:
df_human_active_reverts = all_reverted_edits[all_reverted_edits.user_is_bot_reverter == False]
df_human_passive_reverts = all_reverted_edits[all_reverted_edits.user_is_bot == False]

In [33]:
# current variable summary:
#   df_sample = all edits in our time frame
#   df_bots = all edits made by bots in time frame
#   df_reverters = all reverting edits in dataset (not limited to time frame)
#   df_reverted = all revisions that were reverted w/in 24 hours in time frame
#   all_reverted_edits = every edit reverted w/in 24 hours with reverted and revert edit info, within time frame
#     df_bot_active_reverts = all active reverts by bots of humans and bots in time frame
#     df_bot_passive_reverts = all revisions by bots which were reverted (by humans or bots), with reverting user info merged, in time frame
#     df_human_active_reverts
#     df_human_passive_reverts

## total edits


In [34]:
# total edits in sample
len(df_sample)

3196890

In [35]:
# total bot edits
len(df_bots)

321025

In [36]:
# percent bot edits
len(df_bots) / len(df_sample)

0.10041790615254201

In [37]:
# total human edits
len(df_sample[df_sample.user_is_bot == False])

2875865

## bot edits / bot passive reverts

In [38]:
# total reverts
len(df_bot_passive_reverts)

12543

In [39]:
# reverts by bots
len(df_bot_passive_reverts[df_bot_passive_reverts.user_is_bot_reverter == True])

7884

In [40]:
# reverts by humans
len(df_bot_passive_reverts[df_bot_passive_reverts.user_is_bot_reverter == False])

4659

In [41]:
# self-reverts
len(df_bot_passive_reverts[df_bot_passive_reverts.user_text == df_bot_passive_reverts.user_text_reverter])

7020

## human edits / human passive reverts

In [65]:
# total reverts
len(df_human_passive_reverts)

366888

In [42]:
# reverts by bots
len(df_human_passive_reverts[df_human_passive_reverts.user_is_bot_reverter == True])

22243

In [43]:
# reverts by humans
len(df_human_passive_reverts[df_human_passive_reverts.user_is_bot_reverter == False])

344645

In [54]:
# self-reverts
len(df_human_passive_reverts[df_human_passive_reverts.user_text == df_human_passive_reverts.user_text_reverter])

47397

## bot reverts / bot active reverts

In [61]:
# total bot reverts
len(df_bot_active_reverts.groupby("revert_id").count())

22791

In [62]:
# bot reverts of other bots
a = df_bot_active_reverts[df_bot_active_reverts.user_is_bot == True]
len(a.groupby("revert_id").count())

5592

In [63]:
# bot reverts of humans
a = df_bot_active_reverts[df_bot_active_reverts.user_is_bot == False]
len(a.groupby("revert_id").count())

18220

In [64]:
# bot self-reverts
a = df_bot_active_reverts[df_bot_active_reverts.user_text == df_bot_active_reverts.user_text_reverter]
len(a.groupby("revert_id").count())

5040

## human reverts / human active reverts

In [66]:
# total reverts
len(df_human_active_reverts.groupby("revert_id").count())

255402

In [67]:
# human reverts of bots
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == True]
len(a.groupby("revert_id").count())

4136

In [68]:
# human reverts of other humans
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == False]
len(a.groupby("revert_id").count())

253128

In [69]:
# human reverts of other humans
a = df_human_active_reverts[df_human_active_reverts.user_text == df_human_active_reverts.user_text_reverter]
len(a.groupby("revert_id").count())

36844

## Average revert set size

In [73]:
# bot passive reverts
df_bot_passive_reverts.revert_set_size_reverter.astype('int').mean()

3.025273060671291

In [72]:
# human passive reverts
df_human_passive_reverts.revert_set_size_reverter.astype('int').mean()

2.199466867272846

In [87]:
# bot active reverts
a = df_bot_active_reverts.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.5124391207055417

In [88]:
# human active reverts
a = df_human_active_reverts.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.500238839163358

In [89]:
# human active bot reverts
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == True]
a = a.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

2.3331721470019344

In [90]:
# human active human reverts
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == False]
a = a.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.4997906197654942