In [1]:
# Show the kernel's current working directory; the relative data path used
# further down resolves against it.
import os
os.getcwd() 

'/Users/izapreev/Projects/ML-PT'

In [2]:
# Make sure edits to the project's source modules are automatically reloaded into the kernel
%load_ext autoreload
%autoreload 2

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk

from src.features.extractor import FeatureExtractor
from src.utils.logger import logger

In [4]:
# Read the provided, already wrangled test data into a frame.
# The path is relative to the kernel's working directory.
data_file_path = './data/part_10_wrangled.csv'
data_df = pd.read_csv(data_file_path)

In [5]:
# Missing values occur only in the string columns, so blank them out with
# empty strings, then let pandas infer the best dtype for every column.
data_df = data_df.fillna('').convert_dtypes()

# Display the resulting column types as the cell output
data_df.dtypes

EVENT_ID                   string
CLIENT_IP                  string
CLIENT_USERAGENT           string
IS_USERAGENT_VALID        boolean
REQUEST_SIZE                Int64
RESPONSE_CODE               Int64
MATCHED_VARIABLE_SRC       string
MATCHED_VARIABLE_NAME      string
MATCHED_VARIABLE_VALUE     string
dtype: object

In [6]:
# Baseline run: extract the features from the data using the extractor's
# default settings (default non-feature columns, scaling and PCA).
# FeatureExtractor comes from the notebook's imports cell at the top, so this
# cell no longer carries its own import.
extractor = FeatureExtractor()
X = extractor.fit_transform(data_df)

# Get the names of the features remaining after the PCA; binding the result
# to `_` keeps the cell from echoing the return value.
_ = extractor.get_feature_names_out()

12:31:44 INFO (extractor:61): Actual Non-Feature Columns: ['EVENT_ID']
12:31:44 INFO (extractor:65): Actual Scaling flag: True
12:31:44 INFO (extractor:73): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
12:31:44 INFO (extractor:110): Start fitting the Feature Extraction model
12:31:44 INFO (extractor:105): Considering feature columns: ['CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
12:31:44 INFO (extractor:115): Fitting the vectorizer for: "CLIENT_USERAGENT"
12:31:45 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
12:31:45 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
12:31:45 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
12:31:46 INFO (extractor:126): Start fitting the scaler
12:31:46 INFO (extractor:131): Start fitting the PCA
12:31:47 INFO (extractor:134): Fitting 

In [7]:
# Same extraction as before, but CLIENT_IP is excluded from the feature
# columns in addition to EVENT_ID; scaling and PCA are left at their defaults.
extractor = FeatureExtractor(
    ignore_columns=[
        'EVENT_ID',
        'CLIENT_IP',
    ],
)
X = extractor.fit_transform(data_df)

# Get the names of the features remaining after the PCA; binding the result
# to `_` keeps the cell from echoing the return value.
_ = extractor.get_feature_names_out()

12:31:48 INFO (extractor:61): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP']
12:31:48 INFO (extractor:65): Actual Scaling flag: True
12:31:48 INFO (extractor:73): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
12:31:48 INFO (extractor:110): Start fitting the Feature Extraction model
12:31:48 INFO (extractor:105): Considering feature columns: ['CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
12:31:48 INFO (extractor:115): Fitting the vectorizer for: "CLIENT_USERAGENT"
12:31:48 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
12:31:48 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
12:31:48 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
12:31:50 INFO (extractor:126): Start fitting the scaler
12:31:50 INFO (extractor:131): Start fitting the PCA
12:31:50 INFO (extractor:134): Fitting 

In [8]:
# Same extraction, now excluding CLIENT_IP and CLIENT_USERAGENT (plus
# EVENT_ID) from the feature columns; scaling and PCA are left at their defaults.
extractor = FeatureExtractor(
    ignore_columns=[
        'EVENT_ID',
        'CLIENT_IP',
        'CLIENT_USERAGENT',
    ],
)
X = extractor.fit_transform(data_df)

# Get the names of the features remaining after the PCA; binding the result
# to `_` keeps the cell from echoing the return value.
_ = extractor.get_feature_names_out()

12:31:51 INFO (extractor:61): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT']
12:31:51 INFO (extractor:65): Actual Scaling flag: True
12:31:51 INFO (extractor:73): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
12:31:51 INFO (extractor:110): Start fitting the Feature Extraction model
12:31:51 INFO (extractor:105): Considering feature columns: ['IS_USERAGENT_VALID', 'REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
12:31:51 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
12:31:51 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
12:31:51 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
12:31:52 INFO (extractor:126): Start fitting the scaler
12:31:52 INFO (extractor:131): Start fitting the PCA
12:31:52 INFO (extractor:134): Fitting the Feature Extractor model is done!
12:31:53 INFO (extractor:162): Start tran

In [9]:
# Same extraction, now excluding CLIENT_IP, CLIENT_USERAGENT and
# IS_USERAGENT_VALID (plus EVENT_ID) from the feature columns; scaling and
# PCA are left at their defaults.
extractor = FeatureExtractor(
    ignore_columns=[
        'EVENT_ID',
        'CLIENT_IP',
        'CLIENT_USERAGENT',
        'IS_USERAGENT_VALID',
    ],
)
X = extractor.fit_transform(data_df)

# Get the names of the features remaining after the PCA; binding the result
# to `_` keeps the cell from echoing the return value.
_ = extractor.get_feature_names_out()

12:31:53 INFO (extractor:61): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID']
12:31:53 INFO (extractor:65): Actual Scaling flag: True
12:31:53 INFO (extractor:73): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
12:31:53 INFO (extractor:110): Start fitting the Feature Extraction model
12:31:53 INFO (extractor:105): Considering feature columns: ['REQUEST_SIZE', 'RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
12:31:53 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
12:31:53 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
12:31:53 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
12:31:54 INFO (extractor:126): Start fitting the scaler
12:31:54 INFO (extractor:131): Start fitting the PCA
12:31:55 INFO (extractor:134): Fitting the Feature Extractor model is done!
12:31:55 INFO (extractor:162): Start tran

In [10]:
# Same extraction, now excluding CLIENT_IP, CLIENT_USERAGENT,
# IS_USERAGENT_VALID and REQUEST_SIZE (plus EVENT_ID) from the feature
# columns; scaling and PCA are left at their defaults.
extractor = FeatureExtractor(
    ignore_columns=[
        'EVENT_ID',
        'CLIENT_IP',
        'CLIENT_USERAGENT',
        'IS_USERAGENT_VALID',
        'REQUEST_SIZE',
    ],
)
X = extractor.fit_transform(data_df)

# Get the names of the features remaining after the PCA; binding the result
# to `_` keeps the cell from echoing the return value.
_ = extractor.get_feature_names_out()

12:31:55 INFO (extractor:61): Actual Non-Feature Columns: ['EVENT_ID', 'CLIENT_IP', 'CLIENT_USERAGENT', 'IS_USERAGENT_VALID', 'REQUEST_SIZE']
12:31:55 INFO (extractor:65): Actual Scaling flag: True
12:31:55 INFO (extractor:73): Actual PCA arguments: {'n_components': 0.999999}, the PCA is: Enabled
12:31:55 INFO (extractor:110): Start fitting the Feature Extraction model
12:31:55 INFO (extractor:105): Considering feature columns: ['RESPONSE_CODE', 'MATCHED_VARIABLE_SRC', 'MATCHED_VARIABLE_NAME', 'MATCHED_VARIABLE_VALUE']
12:31:55 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_SRC"
12:31:55 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_NAME"
12:31:56 INFO (extractor:115): Fitting the vectorizer for: "MATCHED_VARIABLE_VALUE"
12:31:57 INFO (extractor:126): Start fitting the scaler
12:31:57 INFO (extractor:131): Start fitting the PCA
12:31:57 INFO (extractor:134): Fitting the Feature Extractor model is done!
12:31:57 INFO (extractor:162): Start tran