# Sample summary table stats

In [1]:
# passive reverts = times when bot was reverted by someone else
# active reverts = times when bot reverts someone else

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

In [4]:
df = pd.read_csv('revs_scored_jan.tsv', sep='\t', header=0)

In [5]:
days = df.rev_timestamp.map(lambda ts: datetime.utcfromtimestamp(ts).day)
df['day'] = days

In [6]:
df_sample = df[df.day<22]
df_bots = df_sample[df_sample.user_is_bot]

In [7]:
df_reverters = df[df.is_revert]

df_reverted = df[df.is_reverted]
df_reverted.seconds_to_revert = df_reverted.seconds_to_revert.astype('str').astype('int')
df_reverted = df_reverted[df_reverted.seconds_to_revert < 86400]
df_reverted.revert_id = df_reverted.revert_id.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [8]:
# merging df_reverted and df_reverters to get dataset of all reverted edits

all_reverted_edits = pd.merge(df_reverted, df_reverters, 
             how='inner', 
             left_on='revert_id', 
             right_on='rev_id', 
             suffixes=('', '_reverter')
)

In [9]:
df_bot_active_reverts = all_reverted_edits[all_reverted_edits.user_is_bot_reverter == True]
df_bot_passive_reverts = all_reverted_edits[all_reverted_edits.user_is_bot == True]

In [10]:
df_human_active_reverts = all_reverted_edits[all_reverted_edits.user_is_bot_reverter == False]
df_human_passive_reverts = all_reverted_edits[all_reverted_edits.user_is_bot == False]

In [11]:
# current variable summary:
#   df_sample = all edits in our time frame
#   df_bots = all edits made by bots in time frame
#   df_reverters = all reverting edits in dataset (not limited to time frame)
#   df_reverted = all revisions that were reverted w/in 24 hours in time frame
#   all_reverted_edits = every edit reverted w/in 24 hours with reverted and revert edit info, within time frame
#     df_bot_active_reverts = all active reverts by bots of humans and bots in time frame
#     df_bot_passive_reverts = all revisions by bots which were reverted (by humans or bots), with reverting user info merged, in time frame
#     df_human_active_reverts
#     df_human_passive_reverts

## total edits


In [12]:
# total edits in sample
len(df_sample)

3196890

In [13]:
# total bot edits
len(df_bots)

321025

In [14]:
# percent bot edits
len(df_bots) / len(df_sample)

0.10041790615254201

In [15]:
# total human edits
len(df_sample[df_sample.user_is_bot == False])

2875865

## bot edits / bot passive reverts

In [16]:
# total reverts
len(df_bot_passive_reverts)

12543

In [17]:
# reverts by bots
len(df_bot_passive_reverts[df_bot_passive_reverts.user_is_bot_reverter == True])

7884

In [18]:
# reverts by humans
len(df_bot_passive_reverts[df_bot_passive_reverts.user_is_bot_reverter == False])

4659

In [19]:
# self-reverts
len(df_bot_passive_reverts[df_bot_passive_reverts.user_text == df_bot_passive_reverts.user_text_reverter])

7020

## human edits / human passive reverts

In [20]:
# total reverts
len(df_human_passive_reverts)

366888

In [21]:
# reverts by bots
len(df_human_passive_reverts[df_human_passive_reverts.user_is_bot_reverter == True])

22243

In [22]:
# reverts by humans
len(df_human_passive_reverts[df_human_passive_reverts.user_is_bot_reverter == False])

344645

In [23]:
# self-reverts
len(df_human_passive_reverts[df_human_passive_reverts.user_text == df_human_passive_reverts.user_text_reverter])

47397

## bot reverts / bot active reverts

In [24]:
# total bot reverts
len(df_bot_active_reverts.groupby("revert_id").count())

22791

In [25]:
# bot reverts of other bots
a = df_bot_active_reverts[df_bot_active_reverts.user_is_bot == True]
len(a.groupby("revert_id").count())

5592

In [26]:
# bot reverts of humans
a = df_bot_active_reverts[df_bot_active_reverts.user_is_bot == False]
len(a.groupby("revert_id").count())

18220

In [27]:
# bot self-reverts
a = df_bot_active_reverts[df_bot_active_reverts.user_text == df_bot_active_reverts.user_text_reverter]
len(a.groupby("revert_id").count())

5040

## human reverts / human active reverts

In [28]:
# total reverts
len(df_human_active_reverts.groupby("revert_id").count())

255402

In [29]:
# human reverts of bots
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == True]
len(a.groupby("revert_id").count())

4136

In [30]:
# human reverts of other humans
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == False]
len(a.groupby("revert_id").count())

253128

In [31]:
# human reverts of other humans
a = df_human_active_reverts[df_human_active_reverts.user_text == df_human_active_reverts.user_text_reverter]
len(a.groupby("revert_id").count())

36844

## Average revert set size

In [32]:
# bot passive reverts
df_bot_passive_reverts.revert_set_size_reverter.astype('int').mean()

3.025273060671291

In [33]:
# human passive reverts
df_human_passive_reverts.revert_set_size_reverter.astype('int').mean()

2.199466867272846

In [34]:
# bot active reverts
a = df_bot_active_reverts.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.5124391207055417

In [35]:
# human active reverts
a = df_human_active_reverts.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.500238839163358

In [36]:
# human active bot reverts
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == True]
a = a.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

2.3331721470019344

In [37]:
# human active human reverts
a = df_human_active_reverts[df_human_active_reverts.user_is_bot == False]
a = a.groupby(["revert_id", "revert_set_size_reverter"], as_index = False).count()
a.revert_set_size_reverter.astype('int').mean()

1.4997906197654942

## percentage of reverts reverting multiple edits

In [38]:
len(df[(df.is_revert) & (df.revert_set_size == "1")]) / len(df[df.is_revert])

0.7619643552027395

## percentage of bot edits reverted by human which were reverted with a human edit

In [40]:
reverts_h = df_bot_passive_reverts.groupby('revert_id', as_index=False).count()[["revert_id"]]

In [41]:
sdf1 = df.loc[df.is_reverted] 
sdf1.revert_id = sdf1.revert_id.astype('str').astype('int')

sdf2 = reverts_h

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [42]:
reverts_h = pd.merge(sdf1, sdf2, 
             how='inner', 
             left_on='revert_id', 
             right_on='revert_id', 
             suffixes=('', '')
)

In [43]:
a = reverts_h.groupby(["revert_id", "user_is_bot"], as_index=False).count()

In [44]:
# percentage of anomie's human passive reverts reverted with a human edit
#    (found by dividing the amount of reverts that reverted a human by the ones reverting anomie)

len(a[a.user_is_bot==False]) / len(a[a.user_is_bot==True])

0.3189761513157895