In [1]:
# Show the kernel's current working directory: the relative data path and the
# `src` package imports used further down are resolved against it
import os
os.getcwd()

'/Users/izapreev/Projects/ML-PT'

In [2]:
# Make sure the source code auto-reloads into the kernel: load the autoreload
# extension and switch it to mode 2, so that edits to the imported project
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

from src.features.extractor import FeatureExtractor
from src.utils.logger import logger

In [4]:
# Read the provided (already wrangled) test data from its CSV file; the path
# is relative to the current working directory
data_df = pd.read_csv(filepath_or_buffer='./data/part_10_wrangled.csv')

In [5]:
# Missing values are stated to occur in the string columns only, so blank them
# out with empty strings and then let pandas infer the best dtype per column
data_df = data_df.fillna('').convert_dtypes()
# Display the resulting column types as a sanity check
data_df.dtypes

EVENT_ID                     string
CLIENT_IP                    string
CLIENT_USERAGENT             string
IS_USERAGENT_VALID          boolean
REQUEST_SIZE                  Int64
RESPONSE_CODE                 Int64
MATCHED_VARIABLE_SRC         string
MATCHED_VARIABLE_SRC_SEC     string
MATCHED_VARIABLE_NAME        string
MATCHED_VARIABLE_VALUE       string
dtype: object

In [6]:
# Baseline run: build the extractor with its default settings and fit it on
# the whole frame. According to the log output below, the defaults exclude
# only EVENT_ID from the features and enable both scaling and PCA.
# (FeatureExtractor is imported in the notebook's top import cell.)
extractor = FeatureExtractor()
X = extractor.fit_transform(data_df)

# Query the names of the features remaining after the PCA. The trailing
# semicolon hides the returned value without assigning to `_`, which would
# otherwise shadow IPython's last-output variable for the rest of the session.
extractor.get_feature_names_out();

11:44:47 INFO (extractor:64): Actual Non-Feature Columns: ['EVENT_ID']
11:44:47 INFO (extractor:68): Actual Scaling flag: True
11:44:47 INFO (extractor:76): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
11:44:47 INFO (extractor:113): Start fitting the Feature Extraction model
11:44:47 INFO (extractor:108): Considering feature columns: ['CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_SRC_SEC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
11:44:47 INFO (extractor:118): Fitting the vectorizer for: "CLIENT_USERAGENT"
11:44:47 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
11:44:47 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC_SEC"
11:44:47 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
11:44:47 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
11:44:49 INFO (extractor:129): Sta

In [7]:
# Extract the features (with scaling and PCA, see the log output below) while
# excluding CLIENT_IP in addition to the EVENT_ID column
extractor = FeatureExtractor(ignore_columns=['EVENT_ID', 'CLIENT_IP'])
X = extractor.fit_transform(data_df)

# Query the names of the features remaining after the PCA. The trailing
# semicolon hides the returned value without assigning to `_`, which would
# otherwise shadow IPython's last-output variable for the rest of the session.
extractor.get_feature_names_out();

11:44:50 INFO (extractor:64): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP']
11:44:50 INFO (extractor:68): Actual Scaling flag: True
11:44:50 INFO (extractor:76): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
11:44:50 INFO (extractor:113): Start fitting the Feature Extraction model
11:44:50 INFO (extractor:108): Considering feature columns: ['CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_SRC_SEC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
11:44:50 INFO (extractor:118): Fitting the vectorizer for: "CLIENT_USERAGENT"
11:44:51 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
11:44:51 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC_SEC"
11:44:51 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
11:44:51 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
11:44:52 INFO (extractor:129): Sta

In [8]:
# Extract the features (with scaling and PCA, see the log output below) while
# excluding CLIENT_IP and CLIENT_USERAGENT in addition to the EVENT_ID column
extractor = FeatureExtractor(
    ignore_columns=['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT'],
)
X = extractor.fit_transform(data_df)

# Query the names of the features remaining after the PCA. The trailing
# semicolon hides the returned value without assigning to `_`, which would
# otherwise shadow IPython's last-output variable for the rest of the session.
extractor.get_feature_names_out();

11:44:54 INFO (extractor:64): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT']
11:44:54 INFO (extractor:68): Actual Scaling flag: True
11:44:54 INFO (extractor:76): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
11:44:54 INFO (extractor:113): Start fitting the Feature Extraction model
11:44:54 INFO (extractor:108): Considering feature columns: ['IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_SRC_SEC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
11:44:54 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
11:44:54 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC_SEC"
11:44:54 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
11:44:54 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
11:44:55 INFO (extractor:129): Start fitting the scaler
11:44:55 INFO (extractor:134): Start fitting the PCA
11:

In [9]:
# Extract the features (with scaling and PCA, see the log output below) while
# excluding CLIENT_IP, CLIENT_USERAGENT, and IS_USERAGENT_VALID in addition to
# the EVENT_ID column
extractor = FeatureExtractor(
    ignore_columns=['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID'],
)
X = extractor.fit_transform(data_df)

# Query the names of the features remaining after the PCA. The trailing
# semicolon hides the returned value without assigning to `_`, which would
# otherwise shadow IPython's last-output variable for the rest of the session.
extractor.get_feature_names_out();

11:44:56 INFO (extractor:64): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID']
11:44:56 INFO (extractor:68): Actual Scaling flag: True
11:44:56 INFO (extractor:76): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
11:44:56 INFO (extractor:113): Start fitting the Feature Extraction model
11:44:56 INFO (extractor:108): Considering feature columns: ['REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_SRC_SEC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
11:44:56 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
11:44:56 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC_SEC"
11:44:56 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
11:44:57 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
11:44:58 INFO (extractor:129): Start fitting the scaler
11:44:58 INFO (extractor:134): Start fitting the PCA
11:

In [10]:
# Extract the features (with scaling and PCA, see the log output below) while
# excluding CLIENT_IP, CLIENT_USERAGENT, IS_USERAGENT_VALID, and REQUEST_SIZE
# in addition to the EVENT_ID column
extractor = FeatureExtractor(
    ignore_columns=[
        'EVENT_ID',
        'CLIENT_IP',
        'CLIENT_USERAGENT',
        'IS_USERAGENT_VALID',
        'REQUEST_SIZE',
    ],
)
X = extractor.fit_transform(data_df)

# Query the names of the features remaining after the PCA. The trailing
# semicolon hides the returned value without assigning to `_`, which would
# otherwise shadow IPython's last-output variable for the rest of the session.
extractor.get_feature_names_out();

11:44:59 INFO (extractor:64): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE']
11:44:59 INFO (extractor:68): Actual Scaling flag: True
11:44:59 INFO (extractor:76): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
11:44:59 INFO (extractor:113): Start fitting the Feature Extraction model
11:44:59 INFO (extractor:108): Considering feature columns: ['RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_SRC_SEC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
11:44:59 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
11:44:59 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC_SEC"
11:44:59 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
11:44:59 INFO (extractor:118): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
11:45:00 INFO (extractor:129): Start fitting the scaler
11:45:00 INFO (extractor:134): Start fitting the PCA
11: