In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import glob
%matplotlib inline

In [2]:
import datetime

Note that this does not require Cython, the functions are just sped up using the `%%cython` magic

In [3]:
%load_ext Cython

In [4]:
start = datetime.datetime.now()
start

datetime.datetime(2017, 4, 10, 11, 30, 38, 725293)

## Data processing

In [5]:
glob.glob("../../datasets/*bot2bot.tsv")

['../../datasets/enwiki_20161201_reverted_bot2bot.tsv',
 '../../datasets/zhwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/ptwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/frwiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/jawiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/dewiki_20161001_reverted_bot2bot.tsv',
 '../../datasets/eswiki_20161001_reverted_bot2bot.tsv']

In [6]:
df_dict = {}
for filename in glob.glob("../../datasets/*bot2bot.tsv"):
    lang_code = filename[15:17]
    df_dict[lang_code] = pd.read_csv(filename, sep="\t")

In [7]:
for lang, lang_df in df_dict.items():
    print(lang, len(lang_df))

en 1001093
zh 102846
pt 141738
fr 193066
ja 89980
de 137844
es 177368


In [8]:
df_dict['en'][0:2].transpose()

Unnamed: 0,0,1
rev_id,273691771,136526894
rev_timestamp,20090227173507,20070607044209
rev_user,6505923,4534303
rev_user_text,Kbdankbot,PbBot
rev_page,5040439,3046554
rev_sha1,qj45ne2z4yfexmpaz5wfnbm2yrmqt4j,3xtnw7u4w9h6cg1smw97mqnr1en6a55
rev_minor_edit,False,False
rev_deleted,False,False
rev_parent_id,2.59117e+08,1.20932e+08
archived,False,False


### Combining into one dataframe

In [9]:
df_all = df_dict['en'].copy()
df_all = df_all.drop(df_all.index, axis=0)

for lang, lang_df in df_dict.items():
    lang_df['language'] = lang
    df_all = pd.concat([df_all, lang_df])

In [10]:
df_all['language'].value_counts()

en    1001093
fr     193066
es     177368
pt     141738
de     137844
zh     102846
ja      89980
Name: language, dtype: int64

In [11]:
for lang, lang_df in df_dict.items():
    print(lang, len(lang_df))
    


en 1001093
zh 102846
pt 141738
fr 193066
ja 89980
de 137844
es 177368


### Namespace type

In [12]:
def namespace_type(item):
    if int(item) == 0:
        return 'article'
    elif int(item) == 14:
        return 'category'
    elif int(item) % 2 == 1:
        return 'other talk'
    else:
        return 'other page'

In [13]:
df_all['namespace_type'] = df_all['page_namespace'].apply(namespace_type)

In [14]:
df_all['namespace_type'].value_counts()

article       1122392
category       365193
other page     226994
other talk     129356
Name: namespace_type, dtype: int64

### Datetime parsing

In [15]:
def get_year(timestamp):
    return timestamp.year

In [16]:
df_all['reverting_timestamp_dt'] = pd.to_datetime(df_all['reverting_timestamp'], format="%Y%m%d%H%M%S")
df_all['reverted_timestamp_dt'] = pd.to_datetime(df_all['rev_timestamp'], format="%Y%m%d%H%M%S")

df_all = df_all.set_index('reverting_timestamp_dt')

df_all['reverting_timestamp_dt'] = pd.to_datetime(df_all['reverting_timestamp'], format="%Y%m%d%H%M%S")

df_all['time_to_revert'] = df_all['reverting_timestamp_dt']-df_all['reverted_timestamp_dt']

df_all['time_to_revert_hrs'] = df_all['time_to_revert'].astype('timedelta64[s]')/(60*60)

df_all['time_to_revert_days'] = df_all['time_to_revert'].astype('timedelta64[s]')/(60*60*24)

df_all['reverting_year'] = df_all['reverting_timestamp_dt'].apply(get_year)

### enwiki yearly bot-bot revert counts

In [17]:
df_all[df_all['language']=='en'].reverting_year.value_counts().sort_index()

2004         4
2005       262
2006      6238
2007     34084
2008     66228
2009     72846
2010     61780
2011    126814
2012     96084
2013    403111
2014     41188
2015     49194
2016     43260
Name: reverting_year, dtype: int64

### all langauges (in dataset) yearly bot-bot revert counts

In [18]:
df_all.reverting_year.value_counts().sort_index()

2004       604
2005      3196
2006     12714
2007     58850
2008    109978
2009    163096
2010    137000
2011    294004
2012    206238
2013    685235
2014     53466
2015     70826
2016     48728
Name: reverting_year, dtype: int64

### Final data format

In [19]:
df_all[0:2].transpose()

reverting_timestamp_dt,2009-02-28 02:19:25,2009-02-10 23:03:37
archived,False,False
language,en,en
page_namespace,0,1
rev_deleted,False,False
rev_id,273691771,136526894
rev_minor_edit,False,False
rev_page,5040439,3046554
rev_parent_id,2.59117e+08,1.20932e+08
rev_revert_offset,1,1
rev_sha1,qj45ne2z4yfexmpaz5wfnbm2yrmqt4j,3xtnw7u4w9h6cg1smw97mqnr1en6a55


# Comments analysis

Function for removing text within square brackets or parentheses, which is useful for aggregating comment messages.

In [20]:
%%cython
# by http://stackoverflow.com/questions/14596884/remove-text-between-and-in-python

def remove_brackets(str test_str):
    """
    Takes a string and returns that string with text in brackets and parentheses removed
    """
    
    test_str = str(test_str)
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in test_str:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
            
    return " ".join(ret.split())

In [21]:
df_all['reverting_comment_nobracket'] = df_all['reverting_comment'].apply(remove_brackets)

### Comment parsing functions

There are two functions that are used to parse comments. `comment_categorization()` runs first and applies a series of pattern matching to comments. If a match is not found, then `interwiki_confirm()` is called, which checks for languages codes in certain patterns that indicate interwiki links.

In [22]:
%%cython
def interwiki_confirm(str comment):
    """
    Takes a comment string, searches for language codes bordered by 
    two punctuation marks from [](){},: or one punctuation mark and
    one space. Beginning and end of a comment string counts as a
    space, not a punctuation mark.
    """
    import string, re
    
    with open("lang_codes.tsv", "r") as f:
        lang_codes = f.read().split("\n")
        
    lang_codes.pop() # a blank '' is in the list that gets returned
    
    try:
        comment = str(comment)
        comment = comment.lower()
        comment = comment.replace(": ", ":")
        comment = " " + comment + " "  # pad start and end of string with non-punctuation
        #print(comment)
        
    except Exception as e:
        return 'other'
    
    for lang_code in lang_codes:
        
        lang_code_pos = comment.find(lang_code)
        lang_code_len = len(lang_code)
        
        char_before = " "
        char_after = " "
        
        if lang_code_pos >= 0:
            char_before = comment[lang_code_pos-1]
        
            #print("Char before: '", char_before, "'", sep='')
             
            char_after = comment[lang_code_pos+lang_code_len]

            #print("Char after: '", char_after, "'", sep='')
            
            if char_before in string.punctuation and char_after in "[]{}(),:":
                #print(lang_code)
                return 'interwiki link cleanup -- suspected'
            
            elif char_after in string.punctuation and char_before in "[]{}(),:":
                #print(lang_code)
                return 'interwiki link cleanup -- suspected'
            
            elif char_before == " " and char_after in "[]{}(),:":
                #print(lang_code)
                return 'interwiki link cleanup -- suspected'
            
            elif char_after == " " and char_before in "[]{}(),:":
                #print(lang_code)
                return 'interwiki link cleanup -- suspected'               
    return 'other'
    

def comment_categorization(row):
    """
    Takes a row from a pandas dataframe or dict and returns a string with a
    kind of activity based on metadata. Used with df.apply(). Mostly parses
    comments, but makes some use of usernames too.
    """
    
    reverting_user = str(row['reverting_user_text'])
    
    
    if reverting_user.find("HBC AIV") >= 0:
        return 'AIV helperbot'
    
    try:
        comment = str(row['reverting_comment'])
    except Exception as e:
        return 'other'
    
    comment_lower = comment.lower().strip()
    comment_lower = " ".join(comment_lower.split())
 
    if comment == 'nan':
        return "deleted revision"
    
    elif comment.find("Undoing massive unnecessary addition of infoboxneeded by a (now blocked) bot") >= 0:
        return "botfight: infoboxneeded"
    
    elif comment_lower.find("commonsdelinker") >=0 and reverting_user.find("CommonsDelinker") == -1:
        return "botfight: reverting CommonsDelinker"
        
    elif comment.find("Reverted edits by [[Special:Contributions/ImageRemovalBot") >= 0:
        return "botfight: 718bot vs ImageRemovalBot"
    
    elif comment_lower.find("double redirect") >= 0:
        return "fixing double redirect"
    
    elif comment_lower.find("double-redirect") >= 0:
        return "fixing double redirect"

    elif comment_lower.find("has been moved; it now redirects to") >= 0:
        return "fixing double redirect"
    
    elif comment_lower.find("correction du redirect") >= 0:
        return "fixing double redirect"   
        
    elif comment_lower.find("redirect tagging") >= 0:
        return "redirect tagging/sorting"
    
    elif comment_lower.find("sorting redirect") >= 0:
        return "redirect tagging/sorting"
    
    elif comment_lower.find("redirecciones") >= 0 and comment_lower.find("categoría") >= 0:
        return "category redirect cleanup"    
    
    elif comment_lower.find("change redirected category") >= 0:
        return "category redirect cleanup"
    
    elif comment_lower.find("redirected category") >=0:
        return "category redirect cleanup"
    
    elif comment.find("[[User:Addbot|Bot:]] Adding ") >= 0:
        return "template tagging"
    
    elif comment_lower.find("interwiki") >= 0:
        return "interwiki link cleanup"
    
    elif comment_lower.find("langlinks") >= 0:
        return "interwiki link cleanup"
    
    elif comment_lower.find("iw-link") >= 0:
        return "interwiki link cleanup"
    
    elif comment_lower.find("changing category") >= 0:
        return "moving category"
    
    elif comment_lower.find("recat per") >= 0:
        return "moving category"
    
    elif comment_lower.find("moving category") >= 0:
        return "moving category"
    
    elif comment_lower.find("re-categorisation") >= 0:
        return "moving category"
    
    elif comment_lower.find("recatégorisation") >= 0:
        return "moving category"    
        
    elif comment_lower.find("removing a protection template") >= 0:
        return "protection template cleanup"
    
    elif comment_lower.find("removing categorization template") >= 0:
        return "template cleanup"    
    
    elif comment_lower.find("rm ibid template per") >= 0:
        return "template cleanup"      
    
    elif comment_lower.find("page is not protected") >= 0:
        return "template cleanup"          
    
    elif comment_lower.find("removing protection template") >= 0:
        return "template cleanup"      
    
    elif comment_lower.find("removing orphan t") >= 0:
        return "orphan template cleanup"
    
    elif comment_lower.find("non-applicable orphan") >= 0:
        return "orphan template cleanup"
    
    elif comment_lower.find("plantilla") >= 0 and comment_lower.find("huérfano") >= 0:
        return "orphan template cleanup"
    
    elif comment_lower.find("removed orphan t") >= 0:
        return "orphan template cleanup"    
    
    elif comment_lower.find("sandbox") >= 0:
        return "clearing sandbox"
    
    elif comment_lower.find("archiving") >= 0:
        return "archiving"
    
    elif comment_lower.find("duplicate on commons") >= 0:
        return "commons image migration"
    
    elif comment_lower.find("user:mathbot/changes to mathlists") >= 0:
        return "mathbot mathlist updates"
        
    elif comment_lower.find("link syntax") >= 0:
        return "link syntax fixing"
    
    elif comment_lower.find("links syntax") >= 0:
        return "link syntax fixing" 
    
    elif comment_lower.find(" per ") >= 0:
        return "has per justification"  
    
    elif comment_lower.find(" según") >= 0:
        return "has per justification"      
 
    elif comment_lower.find("suite à discussion") >= 0:
        return "has per justification"  
    
    elif comment_lower.find("suite à conservation") >= 0:
        return "has per justification"     
    
    elif comment_lower.find("revert") >= 0:
        return "other w/ revert in comment"  
    
    elif comment_lower.find("rv ") >= 0 or comment_lower.find("rv") == 0:
        return "other w/ revert in comment"  
    
    elif comment_lower.find("wikidata") >= 0:
        return "interwiki link cleanup"
    
    elif comment.find("言語間") >=0:
        return "interwiki link cleanup"
        
    elif comment_lower.find("interproyecto") >=0:
        return "interwiki link cleanup"    
        
    elif comment.find("语言链接") >=0:
        return "interwiki link cleanup"  
    
    elif comment.find("双重重定向") >=0 or comment.find("雙重重定向") >= 0:
        return "fixing double redirect"   

    elif comment.find("二重リダイレクト") >=0:
        return "fixing double redirect"  
    
    elif comment_lower.find("doppelten redirect") >=0:
        return "fixing double redirect"  
    
    elif comment_lower.find("doppelte weiterleitung") >=0:
        return "fixing double redirect"      
    
    elif comment_lower.find("redirectauflösung") >=0:
        return "fixing double redirect"      
    
    elif comment_lower.find("doble redirección") >=0 or comment_lower.find("redirección doble") >= 0:
        return "fixing double redirect"  
    
    elif comment_lower.find("redireccionamento duplo") >=0:
        return "fixing double redirect"  

    elif comment_lower.find("duplo redirecionamento") >=0:
        return "fixing double redirect"      
    
    elif comment_lower.find("suppression bandeau") >= 0:
        return "template cleanup"
    
    elif comment_lower.find("archiviert") >= 0:
        return "archiving"
    
    else:
        return interwiki_confirm(comment)

Testing interwiki confirm

In [23]:
tests_yes = ["Robot adding [[es:Test]]",
             "adding es:Test",
             "linking es, it, en",
             "modifying en:",
             "modifying:en",
             "modifying: en"]

tests_no = ["test", 
            "discuss policies on enwiki vs eswiki", 
            "it is done", 
            "it's not its", 
            "its not it's",
            "modifying it all",
            "modifying italy"]

print("Should return interwiki link cleanup -- suspected")
for test in tests_yes:
    print("\t", interwiki_confirm(test))

print("Should return other")
for test in tests_no:
    print("\t", interwiki_confirm(test))

Should return interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
	 interwiki link cleanup -- suspected
Should return other
	 other
	 other
	 other
	 other
	 other
	 other
	 other


Apply categorization

In [24]:
%%time
df_all['bottype'] = df_all.apply(comment_categorization, axis=1)

CPU times: user 4min 58s, sys: 5.3 s, total: 5min 4s
Wall time: 5min 4s


## Analysis

Much of what we're interested in are articles, which are in namespace 0. 

In [25]:
df_all_ns0 = df_all[df_all['page_namespace']==0]

### Bottype counts and percentages across all languages in the dataset, articles only

In [26]:
type_counts = df_all_ns0['bottype'].value_counts().rename("count")
type_percent = df_all_ns0['bottype'].value_counts(normalize=True).rename("percent") * 100
type_percent = type_percent.round(2).astype(str) + "%"

pd.concat([type_counts, type_percent], axis=1)

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,489752,43.63%
interwiki link cleanup,327372,29.17%
fixing double redirect,259022,23.08%
other,20064,1.79%
protection template cleanup,5686,0.51%
moving category,5230,0.47%
other w/ revert in comment,4852,0.43%
category redirect cleanup,3046,0.27%
orphan template cleanup,2330,0.21%
has per justification,1874,0.17%


### Bottype counts and percentages for each language, articles only

In [27]:
counts_dict = {}
for lang in df_all_ns0['language'].unique():

    df_lang_ns0 = df_all_ns0[df_all_ns0['language']==lang]
    
    type_counts = df_lang_ns0['bottype'].value_counts().rename("count")
    type_percent = df_lang_ns0['bottype'].value_counts(normalize=True).rename("percent") * 100
    type_percent = type_percent.round(2).astype(str) + "%"

    counts_dict[lang]=pd.concat([type_counts, type_percent], axis=1)

In [28]:
df_all_ns0['language'].unique()

array(['en', 'zh', 'pt', 'fr', 'ja', 'de', 'es'], dtype=object)

In [29]:
counts_dict['en']

Unnamed: 0,count,percent
fixing double redirect,220172,45.04%
interwiki link cleanup,168236,34.42%
interwiki link cleanup -- suspected,74592,15.26%
protection template cleanup,5674,1.16%
other,4744,0.97%
moving category,4672,0.96%
category redirect cleanup,2824,0.58%
orphan template cleanup,2054,0.42%
other w/ revert in comment,2014,0.41%
mathbot mathlist updates,1028,0.21%


In [30]:
counts_dict['ja']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,55268,79.86%
interwiki link cleanup,11488,16.6%
other,1826,2.64%
fixing double redirect,588,0.85%
other w/ revert in comment,24,0.03%
has per justification,14,0.02%


In [31]:
counts_dict['zh']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,47328,55.01%
interwiki link cleanup,30298,35.22%
fixing double redirect,7268,8.45%
other,604,0.7%
other w/ revert in comment,514,0.6%
has per justification,12,0.01%
botfight: reverting CommonsDelinker,6,0.01%


In [32]:
counts_dict['de']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,71228,64.86%
interwiki link cleanup,33898,30.87%
other,2592,2.36%
fixing double redirect,1972,1.8%
other w/ revert in comment,44,0.04%
moving category,20,0.02%
has per justification,20,0.02%
protection template cleanup,12,0.01%
category redirect cleanup,10,0.01%
botfight: reverting CommonsDelinker,10,0.01%


In [33]:
counts_dict['fr']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,82394,73.23%
interwiki link cleanup,20188,17.94%
fixing double redirect,6592,5.86%
other,1988,1.77%
has per justification,796,0.71%
moving category,538,0.48%
other w/ revert in comment,6,0.01%
clearing sandbox,4,0.0%
botfight: reverting CommonsDelinker,4,0.0%


In [34]:
counts_dict['pt']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,83274,69.35%
interwiki link cleanup,29964,24.96%
fixing double redirect,3816,3.18%
other,2948,2.46%
other w/ revert in comment,52,0.04%
has per justification,12,0.01%
clearing sandbox,2,0.0%
botfight: reverting CommonsDelinker,2,0.0%


In [35]:
counts_dict['es']

Unnamed: 0,count,percent
interwiki link cleanup -- suspected,75668,55.65%
interwiki link cleanup,33300,24.49%
fixing double redirect,18614,13.69%
other,5362,3.94%
other w/ revert in comment,2198,1.62%
has per justification,304,0.22%
orphan template cleanup,268,0.2%
category redirect cleanup,212,0.16%
botfight: reverting CommonsDelinker,32,0.02%
clearing sandbox,2,0.0%


### What was not categorized?

In [36]:
other_count = 0
for comment, count in df_all[df_all['bottype']=='other']['reverting_comment_nobracket'].value_counts().iteritems():
    if count > 25:
        print(count, "\t", comment)
    else:
        other_count = other_count + count
print(other_count, "\tOther cases")

7268 	 Removing redlinks.
3364 	 No broken #section links left
3036 	 Robot: Automatically cleaned
1968 	 Bot: Entferne:
1752 	 Robot : Remplacement modèle {{Wikiprojet droit/rattachement précis non vérifié}} par {{Wikiprojet droit}}
1668 	 Bot: Automatically cleaned
1668 	 r2.7.1)
1520 	 WildBot was summoned
1500 	 取消（）的编辑；更改回的最后一个版本
1456 	 Automated archival of 1 sections to
1082 	 mise en forme
1040 	 Removing completed dates from holding cell. Errors?
984 	 BOT: removing {{}} either because the file no longer meets the criteria for or because the file is already on Commons
910 	 -
902 	 No ambiguous links left
890 	 Robô: A corrigir o redirecionamento duplo para
860 	 r2.7.3)
806 	 机器人: 加速才女机器人更新
714 	 r2.7.2)
670 	 bot: 「削除告知」
632 	 Tagging for a wikiproject using tag {{WikiProject Genetics|class=|importance=|imageneeded=|imagedetails=|unref=}}
616 	 No ambiguous links left; No broken #section links left
592 	 BOT: Removing {{Orphan Image}} from a non-orphaned file
568 	 机器人: 本页被自

In [37]:
other_count = 0
for comment, count in df_all[df_all['bottype']=='other']['reverting_comment'].value_counts().iteritems():
    if count > 25:
        print(count, "\t", comment)
    else:
        other_count = other_count + count
print(other_count, "\tOther cases")

7268 	 Removing redlinks.
3364 	 No broken #section links left
3036 	 Robot: Automatically cleaned
1752 	 Robot : Remplacement modèle {{Wikiprojet droit/rattachement précis non vérifié}} par {{Wikiprojet droit}}
1668 	 Bot: Automatically cleaned
1520 	 WildBot was summoned
1082 	 mise en forme
1040 	 (BOT) Removing completed dates from holding cell. Errors? [[User:AnomieBOT/shutoff/PUICloser]]
984 	 BOT: removing {{[[Template:Copy to Wikimedia Commons|Copy to Wikimedia Commons]]}} either because the file no longer meets the criteria for [[WP:TRANSFER|transfer]] or because the file is already on Commons
910 	 -
902 	 No ambiguous links left
670 	 bot: [[Wikipedia:リダイレクトの削除依頼]]「削除告知」
632 	 Tagging for a wikiproject using tag {{WikiProject Genetics|class=|importance=|imageneeded=|imagedetails=|unref=}}
616 	 No ambiguous links left; No broken #section links left
614 	 机器人: 加速才女机器人更新 (候选积压)
592 	 BOT: Removing {{Orphan Image}} from a non-orphaned file
546 	 BOT: Removing {{Orphan image}} b

In [38]:
len(df_lang_ns0)

135960

In [39]:
lang_l = 'es'
other_count = 0
df_lang_ns0 = df_all_ns0[df_all_ns0['language']==lang_l]

for comment, count in df_lang_ns0[df_lang_ns0['bottype']=='other']['reverting_comment'].value_counts().iteritems():
    if count > 5:
        print(count, "\t", comment)
    else:
        other_count = other_count + count
print(other_count, "\tOther cases")

114 	 -
78 	 Bot: trasladando categoría - [[Categoría:Personajes de El señor de los anillos]] (v1.34)
74 	 Fusión de plantillas
60 	 Bot: Cambiando #REDIRECT por #REDIRECCIÓN.
60 	 Bot: [[Usuario:FrescoBot/Enlaces|sintaxis de los enlaces]]
46 	 Bot: idioma = Inglés.
36 	 Robot: modificadas Categoría:Juzgados por el Tribunal Penal Internacional para la ex Yugoslavia a Categoría:Juzgados por el Tribunal Penal Internacional para la ex-Yugoslavia
34 	 Bot: Cambiada Categoría:Ciclistas de la Comunidad Foral de Navarra
32 	 Robot: Retirando {{[[Template:Semiprotegida|Semiprotegida]]}} por no estar protegido
30 	 Robot: modificadas Categoría:Juzgados por el Tribunal Penal Internacional para la ex-Yugoslavia a Categoría:Juzgados por el Tribunal Penal Internacional para la ex Yugoslavia
30 	 Bot: Cambiada Categoría:Monumentos de Zamora
28 	 [[Usuario:Bigsus-bot/Corrector ortográfico|Bot]]: Arreglando referencias y otros
24 	 Bot: Moviendo categoría
24 	 Pequeñas correcciones [[WP:CEM]].
22 	 Bo

## How long did this take to run?

In [40]:
end = datetime.datetime.now()
end

datetime.datetime(2017, 4, 10, 11, 36, 43, 354501)

In [41]:
time_to_run = end - start
minutes = int(time_to_run.seconds/60)
seconds = time_to_run.seconds % 60
print("Total runtime: ", minutes, "minutes, ", seconds, "seconds")

Total runtime:  6 minutes,  4 seconds


## Export possible botfights

In [60]:
def is_possible_botfight(bottype_str):
    if bottype_str == 'other':
        return True
    elif bottype_str == 'other w/ revert in comment':
        return True
    elif bottype_str.find('botfight') >= 0:
        return True
    else:
        return False

In [61]:
df_possible_botfights_mask = df_all['bottype'].apply(is_possible_botfight)

In [62]:
df_possible_botfights = df_all[df_possible_botfights_mask]
df_possible_botfights_ns0 = df_possible_botfights[df_possible_botfights['page_namespace']==0]

In [63]:
df_possible_botfights['language'].value_counts()

en    47769
es     9858
fr     6042
de     5996
zh     5306
pt     3662
ja     2110
Name: language, dtype: int64

In [64]:
df_possible_botfights_ns0['language'].value_counts()

en    7754
es    7592
pt    3002
de    2648
fr    1998
ja    1850
zh    1124
Name: language, dtype: int64

In [65]:
df_possible_botfights[0:2].transpose()

reverting_timestamp_dt,2012-02-08 00:14:46,2012-02-08 00:14:51
archived,False,False
language,en,en
page_namespace,6,6
rev_deleted,False,False
rev_id,474726560,475121614
rev_minor_edit,False,False
rev_page,13694272,13694643
rev_parent_id,2.01207e+08,1.98982e+08
rev_revert_offset,1,1
rev_sha1,qe1a8oa0wgbwcw6lm0fk5alwy0vgqt0,9mneyipkuofyjw75uku033ne3jzvnkw


In [66]:
df_possible_botfights.to_pickle("data/possible_botfights.pickle")
df_possible_botfights.to_csv("data/possible_botfights.tsv", sep="\t")