# Introduction

This notebook explains how I calculate the metrics based on abusefilter log queries. 

## Set up the environment and import constants

In [1]:
# append system path to import utils
import sys
sys.path.append('../')

In [2]:
import json
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
with open('../data/processed_data/bot_names.json') as file:
    bot_names = json.load(file)
bot_names_series = pd.Series(bot_names)

In [4]:
from utils.data_processing import file_iterator
from utils.data_processing import check_ip_address

# Code for calcualting abusefiltered edits metrics

In [5]:
df = pd.read_csv('../data/abuselogs/2023/2023-12.csv.gz', compression='gzip')

In [6]:
# Set up parameters for data extraction 
dir_path = '../data/abuselogs'
extracted_columns = ['id', 'filter_id', 'user', 'title', 'user', 'action', 'result', 'timestamp']

In [7]:
# Define functions for cleaning dataframe
regex_pattern = r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$'

def clean_df(df):
    df = df.drop_duplicates()
    # remove rows containing the column names
    df = df[df['title']!='title']
    # remove false positive reports
    df['title'] = df['title'].astype('str')
    df = df[~df['title'].str.lower().str.contains('false positive')]
    
    # fill nas for timestamp
    # if 'revid' in df.columns:
    #     row_idx = (df['revid'].str.match(regex_pattern)) & (~df['revid'].isna())
    #     df.loc[row_idx, ['revid', 'timestamp']] = df.loc[row_idx, ['timestamp', 'revid']].values
    #     df.loc[df['timestamp'].isna(), 'timestamp'] = df.loc[df['timestamp'].isna(), 'revid']
    return df

In [8]:
# Initialize an empty list to store metrics
afedit_metrics = []

# iterate over stored data
for year_month, df in file_iterator(dir_path, start_year=2009, end_year=2023, extract_cols=extracted_columns):
    if df is not None:
        # clean data
        df = clean_df(df)
        # only keep edits that are filtered
        df = df[df['action'] == 'edit']

         # Classify results
        df['result'] = df['result'].fillna('unknown')
        df['result_disallow'] = df['result'].str.lower().str.contains('disallow')
        df['result_warn'] = df['result'].str.lower().str.contains('warn')
        df['result_tag'] = df['result'].str.lower().str.contains('tag').sum()

        # Classify users
        df['user_is_ip'] = check_ip_address(df, column='user')
        df['user_is_bot'] = df['user'].isin(bot_names_series)
       
        # Initialise a temporary dictory to store metrics
        metric={'month_year': year_month}

        # Count number of triggered logs
        metric['n_afedit_result_all'] = df['id'].nunique()
        metric['n_afedit_result_disallow'] = df[df['result_disallow']]['id'].nunique()
        metric['n_afedit_result_warn'] = df[df['result_warn'] & ~df['result_disallow']]['id'].nunique()
        metric['n_afedit_result_tag'] = df[df['result_tag'] & ~df['result_disallow'] & ~df['result_warn']]['id'].nunique()
        metric['n_afedit_result_other'] = metric['n_afedit_result_all'] - metric['n_afedit_result_tag'] - metric['n_afedit_result_warn'] - metric['n_afedit_result_disallow']

        # 
        metric['n_afedit_editor_all'] = df['user'].nunique()
        metric['n_afedit_editor_ip'] = df[df['user_is_ip']]['user'].nunique()
        metric['n_afedit_editor_bot'] = df[df['user_is_bot']]['user'].nunique()
        metric['n_afedit_editor_user'] = metric['n_afedit_editor_all'] - metric['n_afedit_editor_ip'] - metric['n_afedit_editor_bot']

        # store the metric
        afedit_metrics.append(metric)


File not found: ../data/abuselogs/2009/2009-01.csv
File not found: ../data/abuselogs/2009/2009-01.csv.gz
File not found: ../data/abuselogs/2009/2009-02.csv
File not found: ../data/abuselogs/2009/2009-02.csv.gz
Extracting: ../data/abuselogs/2009/2009-03.csv.gz
Extracting: ../data/abuselogs/2009/2009-04.csv.gz
Extracting: ../data/abuselogs/2009/2009-05.csv.gz
Extracting: ../data/abuselogs/2009/2009-06.csv.gz
Extracting: ../data/abuselogs/2009/2009-07.csv.gz
Extracting: ../data/abuselogs/2009/2009-08.csv.gz
Extracting: ../data/abuselogs/2009/2009-09.csv.gz
Extracting: ../data/abuselogs/2009/2009-10.csv.gz
Extracting: ../data/abuselogs/2009/2009-11.csv.gz
Extracting: ../data/abuselogs/2009/2009-12.csv.gz
Extracting: ../data/abuselogs/2010/2010-01.csv.gz
Extracting: ../data/abuselogs/2010/2010-02.csv.gz
Extracting: ../data/abuselogs/2010/2010-03.csv.gz
Extracting: ../data/abuselogs/2010/2010-04.csv.gz
Extracting: ../data/abuselogs/2010/2010-05.csv.gz
Extracting: ../data/abuselogs/2010/2010-

Convert extracted metrics to dataframe

In [9]:
afedit_metrics_df = pd.DataFrame(afedit_metrics)
afedit_metrics_df.head()

Unnamed: 0,month_year,n_afedit_result_all,n_afedit_result_disallow,n_afedit_result_warn,n_afedit_result_tag,n_afedit_result_other,n_afedit_editor_all,n_afedit_editor_ip,n_afedit_editor_bot,n_afedit_editor_user
0,2009-03,98336,2355,53259,42722,0,43186,32943,14,10229
1,2009-04,178851,4551,88776,85524,0,82194,58133,8,24053
2,2009-05,211495,4667,87449,119379,0,93990,66766,8,27216
3,2009-06,178840,4032,65055,109753,0,80322,54781,12,25529
4,2009-07,160733,4013,52030,104690,0,73174,46732,11,26431


In [10]:
afedit_metrics_df.tail()

Unnamed: 0,month_year,n_afedit_result_all,n_afedit_result_disallow,n_afedit_result_warn,n_afedit_result_tag,n_afedit_result_other,n_afedit_editor_all,n_afedit_editor_ip,n_afedit_editor_bot,n_afedit_editor_user
173,2023-08,190441,32392,19826,0,138223,73876,50279,12,23585
174,2023-09,200534,41285,22460,0,136789,75740,52960,15,22765
175,2023-10,225614,45468,23654,156492,0,84108,56972,15,27121
176,2023-11,210312,44754,23821,141737,0,79583,56073,18,23492
177,2023-12,191852,37747,20329,0,133776,73911,51479,14,22418


## Save metrics to a local csv

In [11]:
afedit_metrics_df.to_csv('../data/processed_data/abusefiltered_edits_monthly_metrics.csv')