# Introduction

This notebook explains how I calculate the metrics based on abusefilter log queries. 

## Set up the environment and import constants

In [1]:
# Put the parent directory on the module search path so the project-local
# `utils` package (imported further down) can be found from this notebook.
import sys
sys.path.append('../')

In [2]:
import json
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load the scraped bot names JSON and wrap the result in a pandas Series.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII names.
with open('../data/scraped_data_metrics/bot_names.json', encoding='utf-8') as file:
    bot_names = json.load(file)
bot_names_series = pd.Series(bot_names)

In [15]:
from utils.data_processing import file_iterator
from utils.data_processing import check_ip_address

# Code for calculating abusefilter metrics

In [16]:
# Set up parameters for data extraction
# Directory holding the abusefilter log files
dir_path = '../data/abusefilter_logs'
# Columns to pull from each monthly file; every name is listed exactly once
# ('action' and 'user' were previously repeated)
extracted_columns = ['logid', 'title', 'action', 'user', 'type', 'pageid']

In [17]:
# Define functions for cleaning dataframe
def clean_df(df):
    """Return the abuse-filter rows of ``df`` with unusable records removed.

    Exact duplicate rows are dropped first. A row is then kept only if all of
    the following hold:

    * its ``title`` is not the literal string ``'title'`` (i.e. it is not a
      column-header line that ended up among the data rows),
    * ``title``, ``user`` and ``action`` are all present (not NaN),
    * its ``type`` equals ``'abusefilter'``.

    The input frame is left untouched and surviving rows keep their original
    index labels.
    """
    deduped = df.drop_duplicates()

    not_header_row = deduped['title'].ne('title')
    has_required_fields = deduped[['title', 'user', 'action']].notna().all(axis=1)
    is_abusefilter = deduped['type'].eq('abusefilter')

    return deduped[not_header_row & has_required_fields & is_abusefilter]

In [18]:
# Collect one metrics record per month for which data was yielded
abusefilter_metrics = []

# Output column -> value of the `action` column that it counts
action_metric_names = {
    'n_af_create': 'create',
    'n_af_modify': 'modify',
    'n_af_hit': 'hit',
}

monthly_logs = file_iterator(dir_path, start_year=2009, end_year=2023, extract_cols=extracted_columns)
for year_month, df in monthly_logs:
    # Skip months for which no dataframe was yielded
    if df is None:
        continue

    df = clean_df(df)

    # Distinct log ids overall, then distinct log ids per action type
    metric = {'month_year': year_month, 'n_af_all': df['logid'].nunique()}
    for metric_name, action in action_metric_names.items():
        metric[metric_name] = df.loc[df['action'] == action, 'logid'].nunique()

    abusefilter_metrics.append(metric)


File not found: ../data/abusefilter_logs/2009/2009-01.csv
File not found: ../data/abusefilter_logs/2009/2009-01.csv.gz
File not found: ../data/abusefilter_logs/2009/2009-02.csv
File not found: ../data/abusefilter_logs/2009/2009-02.csv.gz
Extracting: ../data/abusefilter_logs/2009/2009-03.csv
Extracting: ../data/abusefilter_logs/2009/2009-04.csv
Extracting: ../data/abusefilter_logs/2009/2009-05.csv
Extracting: ../data/abusefilter_logs/2009/2009-06.csv
Extracting: ../data/abusefilter_logs/2009/2009-07.csv
Extracting: ../data/abusefilter_logs/2009/2009-08.csv
Extracting: ../data/abusefilter_logs/2009/2009-09.csv
Extracting: ../data/abusefilter_logs/2009/2009-10.csv
Extracting: ../data/abusefilter_logs/2009/2009-11.csv
Extracting: ../data/abusefilter_logs/2009/2009-12.csv
Extracting: ../data/abusefilter_logs/2010/2010-01.csv
Extracting: ../data/abusefilter_logs/2010/2010-02.csv
Extracting: ../data/abusefilter_logs/2010/2010-03.csv
Extracting: ../data/abusefilter_logs/2010/2010-04.csv
Extrac

Extracting: ../data/abusefilter_logs/2010/2010-12.csv
Extracting: ../data/abusefilter_logs/2011/2011-01.csv
Extracting: ../data/abusefilter_logs/2011/2011-02.csv
Extracting: ../data/abusefilter_logs/2011/2011-03.csv
Extracting: ../data/abusefilter_logs/2011/2011-04.csv
Extracting: ../data/abusefilter_logs/2011/2011-05.csv
Extracting: ../data/abusefilter_logs/2011/2011-06.csv
Extracting: ../data/abusefilter_logs/2011/2011-07.csv
Extracting: ../data/abusefilter_logs/2011/2011-08.csv
Extracting: ../data/abusefilter_logs/2011/2011-09.csv
Extracting: ../data/abusefilter_logs/2011/2011-10.csv
Extracting: ../data/abusefilter_logs/2011/2011-11.csv
Extracting: ../data/abusefilter_logs/2011/2011-12.csv
Extracting: ../data/abusefilter_logs/2012/2012-01.csv
Extracting: ../data/abusefilter_logs/2012/2012-02.csv
Extracting: ../data/abusefilter_logs/2012/2012-03.csv
Extracting: ../data/abusefilter_logs/2012/2012-04.csv
Extracting: ../data/abusefilter_logs/2012/2012-05.csv
Extracting: ../data/abusefil

Convert extracted metrics to dataframe

In [19]:
# Build one row per month from the collected metric dicts and preview the first rows
abusefilter_metrics_df = pd.DataFrame(abusefilter_metrics)
abusefilter_metrics_df.head()

Unnamed: 0,month_year,n_af_all,n_af_create,n_af_modify,n_af_hit
0,2009-03,1281,0,1281,0
1,2009-04,492,0,492,0
2,2009-05,210,0,210,0
3,2009-06,300,0,300,0
4,2009-07,273,0,273,0


In [22]:
# Preview the last rows (the latest months processed)
abusefilter_metrics_df.tail()

Unnamed: 0,month_year,n_af_all,n_af_create,n_af_modify,n_af_hit
173,2023-08,178,2,176,0
174,2023-09,116,3,113,0
175,2023-10,219,8,211,0
176,2023-11,214,4,210,0
177,2023-12,363,6,357,0


In [23]:
# Sanity check: how often each monthly 'hit' count occurs across all months.
# NOTE(review): in the saved output every month is 0 -- confirm whether 'hit'
# is an action value these logs can actually contain.
abusefilter_metrics_df['n_af_hit'].value_counts()

n_af_hit
0    178
Name: count, dtype: int64

## Save metrics to a local csv

In [21]:
# Persist the monthly metrics next to the other scraped-data metrics.
# NOTE(review): with the default index=True the row index is also written as
# an unnamed first column; pass index=False if that column is not wanted --
# confirm against whatever reads this file.
abusefilter_metrics_df.to_csv('../data/scraped_data_metrics/abusefilter_monthly_metrics.csv')