# Introduction

This notebook explains how I calculate the metrics based on abusefilter log queries. 

## Set up the environment and import constants

In [44]:
# append system path to import utils
import sys
sys.path.append('../')

In [45]:
import json
import pandas as pd

In [46]:
# Load the bot names stored as JSON. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open('../data/processed_data/bot_names.json', encoding='utf-8') as file:
    bot_names = json.load(file)
# Series form of the loaded bot names
bot_names_series = pd.Series(bot_names)

In [47]:
from utils.data_processing import file_iterator
from utils.data_processing import check_ip_address

# Code for calculating abusefilter metrics

The code below shows how I calculate the monthly metrics relating to abuse filter logs (all logs, filter creations, filter modifications, and filter hits).

In [48]:
# Set up parameters for data extraction
# Root directory of the stored abusefilter log files
dir_path = '../data/abusefilter_logs'
# Columns requested from each log file. Every column name is listed exactly once
# ('action' and 'user' must not be repeated).
extracted_columns = ['logid', 'title', 'action', 'user', 'type', 'pageid']

In [49]:
# Define functions for cleaning dataframe
def clean_df(df):
    """Return a cleaned copy of an abusefilter log dataframe.

    Steps, in order:
      1. drop fully duplicated rows;
      2. drop rows whose 'title' value is the literal string 'title'
         (column-name rows that ended up inside the data);
      3. drop rows with a missing 'title', 'user' or 'action'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'title', 'user' and 'action'.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    required_fields = ['title', 'user', 'action']

    unique_rows = df.drop_duplicates()
    # rows that merely repeat the column names carry 'title' in the title column
    data_rows = unique_rows.loc[unique_rows['title'] != 'title']
    # a log entry is only usable when title, user and action are all present
    complete_rows = data_rows.dropna(subset=required_fields)

    # Page-specific vs. global flag is currently disabled:
    # complete_rows['pageid'] = complete_rows['pageid'].astype(str)
    # complete_rows['is_global'] = (complete_rows['pageid'] == '0') | (complete_rows['pageid'] == '0.0')
    return complete_rows

In [50]:
# Collects one metrics dictionary per month for which data was loaded
abusefilter_metrics = []

# Walk through the stored monthly log files
for year_month, df in file_iterator(dir_path, start_year=2009, end_year=2023, extract_cols=extracted_columns):
    # Nothing to measure when the iterator yields no dataframe for this month
    if df is None:
        continue

    cleaned = clean_df(df)

    # Number of distinct log ids for each action type of interest
    per_action = {
        action: cleaned.loc[cleaned['action'] == action, 'logid'].nunique()
        for action in ('create', 'modify', 'hit')
    }

    abusefilter_metrics.append({
        'month_year': year_month,
        # all abuse filter logs
        'n_af_all': cleaned['logid'].nunique(),
        # filter creation logs
        'n_af_create': per_action['create'],
        # filter modification logs
        'n_af_modify': per_action['modify'],
        # configuration logs, i.e. create + modify
        'n_af_configuration': per_action['create'] + per_action['modify'],
        # filter hit logs
        'n_af_hit': per_action['hit'],
    })


File not found: ../data/abusefilter_logs/2009/2009-01.csv
File not found: ../data/abusefilter_logs/2009/2009-01.csv.gz
File not found: ../data/abusefilter_logs/2009/2009-02.csv
File not found: ../data/abusefilter_logs/2009/2009-02.csv.gz
Extracting: ../data/abusefilter_logs/2009/2009-03.csv
Extracting: ../data/abusefilter_logs/2009/2009-04.csv
Extracting: ../data/abusefilter_logs/2009/2009-05.csv
Extracting: ../data/abusefilter_logs/2009/2009-06.csv
Extracting: ../data/abusefilter_logs/2009/2009-07.csv
Extracting: ../data/abusefilter_logs/2009/2009-08.csv
Extracting: ../data/abusefilter_logs/2009/2009-09.csv
Extracting: ../data/abusefilter_logs/2009/2009-10.csv
Extracting: ../data/abusefilter_logs/2009/2009-11.csv
Extracting: ../data/abusefilter_logs/2009/2009-12.csv
Extracting: ../data/abusefilter_logs/2010/2010-01.csv
Extracting: ../data/abusefilter_logs/2010/2010-02.csv
Extracting: ../data/abusefilter_logs/2010/2010-03.csv
Extracting: ../data/abusefilter_logs/2010/2010-04.csv
Extrac

Extracting: ../data/abusefilter_logs/2012/2012-01.csv
Extracting: ../data/abusefilter_logs/2012/2012-02.csv
Extracting: ../data/abusefilter_logs/2012/2012-03.csv
Extracting: ../data/abusefilter_logs/2012/2012-04.csv
Extracting: ../data/abusefilter_logs/2012/2012-05.csv
Extracting: ../data/abusefilter_logs/2012/2012-06.csv
Extracting: ../data/abusefilter_logs/2012/2012-07.csv
Extracting: ../data/abusefilter_logs/2012/2012-08.csv
Extracting: ../data/abusefilter_logs/2012/2012-09.csv
Extracting: ../data/abusefilter_logs/2012/2012-10.csv
Extracting: ../data/abusefilter_logs/2012/2012-11.csv
Extracting: ../data/abusefilter_logs/2012/2012-12.csv
Extracting: ../data/abusefilter_logs/2013/2013-01.csv
Extracting: ../data/abusefilter_logs/2013/2013-02.csv
Extracting: ../data/abusefilter_logs/2013/2013-03.csv
Extracting: ../data/abusefilter_logs/2013/2013-04.csv
Extracting: ../data/abusefilter_logs/2013/2013-05.csv
Extracting: ../data/abusefilter_logs/2013/2013-06.csv
Extracting: ../data/abusefil

Convert extracted metrics to dataframe

In [51]:
# Build a dataframe with one row per collected metrics dictionary
abusefilter_metrics_df = pd.DataFrame(abusefilter_metrics)
# Preview the first rows
abusefilter_metrics_df.head()

Unnamed: 0,month_year,n_af_all,n_af_create,n_af_modify,n_af_configuration,n_af_hit
0,2009-03,1281,0,1281,1281,0
1,2009-04,492,0,492,492,0
2,2009-05,210,0,210,210,0
3,2009-06,300,0,300,300,0
4,2009-07,273,0,273,273,0


In [52]:
# Inspect the last 50 rows of the metrics table
abusefilter_metrics_df.tail(50)

Unnamed: 0,month_year,n_af_all,n_af_create,n_af_modify,n_af_configuration,n_af_hit
128,2019-11,247,7,240,247,0
129,2019-12,123,5,118,123,0
130,2020-01,103,8,95,103,0
131,2020-02,169,9,160,169,0
132,2020-03,141,10,131,141,0
133,2020-04,117,6,111,117,0
134,2020-05,324,11,313,324,0
135,2020-06,142,8,134,142,0
136,2020-07,76,5,71,76,0
137,2020-08,62,5,57,62,0


## Save metrics to a local csv

In [53]:
# Write the monthly metrics to disk. index=False is not passed, so the dataframe
# index is written as the first (unnamed) column of the CSV.
abusefilter_metrics_df.to_csv('../data/processed_data/abusefilter_monthly_metrics.csv')