In [1]:
import os
os.getcwd() 

'/Users/izapreev/Projects/ML-PT'

In [2]:
# Make sure the source code auto reloads into the kernel\n
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

from src.utils.logger import logger

In [4]:
# Load the provided test data
data_df = pd.read_csv('./data/part_10_wrangled.csv')

In [5]:
# The only na values are for strings - replace with empty
data_df = data_df.fillna('')
# Infer the best data types
data_df = data_df.convert_dtypes()
# Check on the column types
data_df.dtypes

EVENT_ID                     string
CLIENT_IP                    string
CLIENT_USERAGENT             string
IS_USERAGENT_VALID          boolean
REQUEST_SIZE                  Int64
RESPONSE_CODE                 Int64
MATCHED_VARIABLE_SRC         string
MATCHED_VARIABLE_SRC_SEC     string
MATCHED_VARIABLE_NAME        string
MATCHED_VARIABLE_VALUE       string
dtype: object

In [6]:
# Initialize the two dimensional numpy array to be used
X = np.empty(shape=(len(data_df),0))

In [7]:
# Register the feature
input_features = []
def register_features(name, data):
    global input_features
    number = data.shape[1]
    for idx in range(number):
        input_features.append(f'{name}_{idx}' if number > 1 else name)

In [8]:
# Define the feature extraction common wrapper method
def extract_column_features(data_df, data_extractor, col_name):
    global X
    col_data = data_extractor(data_df[col_name])
    X = np.append(X, col_data, axis=1)
    logger.info(f'The resulting X shape is: {X.shape}')

    # Register feature names
    register_features(col_name, col_data)

In [9]:
# Convert CLIENT_IP into 8 distinct IP int value feature columns
ip_col_data_extractor = lambda col_data: np.stack(col_data.apply(lambda val: np.array([int(entry, 16) for entry in val.split(':')])).values)
extract_column_features(data_df, ip_col_data_extractor, 'CLIENT_IP')

08:09:29 INFO (4182382029:6): The resulting X shape is: (57729, 8)


In [10]:
# Add the REQUEST_SIZE
rs_col_data_extractor = lambda col_data: np.reshape(col_data.values, (-1, 1))
extract_column_features(data_df, rs_col_data_extractor, 'REQUEST_SIZE')

08:09:29 INFO (4182382029:6): The resulting X shape is: (57729, 9)


In [11]:
# Add the RESPONSE_CODE
rc_col_data_extractor = lambda col_data: np.reshape(col_data.values, (-1, 1))
extract_column_features(data_df, rc_col_data_extractor, 'RESPONSE_CODE')

08:09:29 INFO (4182382029:6): The resulting X shape is: (57729, 10)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
UA_MAX_FEATURES = 10

# Encode values from CLIENT_USERAGENT
ua_tfidf = TfidfVectorizer(max_features=UA_MAX_FEATURES)

ua_col_data_extractor = lambda col_data: ua_tfidf.fit_transform(col_data).toarray()
extract_column_features(data_df, ua_col_data_extractor, 'CLIENT_USERAGENT')

08:09:29 INFO (4182382029:6): The resulting X shape is: (57729, 20)


In [13]:
# Convert IS_USERAGENT_VALID into integers and add to X
iuv_col_data_extractor = lambda col_data: np.reshape(col_data.astype('int').values, (-1, 1))
extract_column_features(data_df, iuv_col_data_extractor, 'IS_USERAGENT_VALID')

08:09:29 INFO (4182382029:6): The resulting X shape is: (57729, 21)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

MVSS_MAX_FEATURES = 1

# Encode labels from MATCHED_VARIABLE_SRC and MATCHED_VARIABLE_SRC_SEC
mvss_tfidf = TfidfVectorizer(max_features=MVSS_MAX_FEATURES)

mvss_tfidf.fit(data_df['MATCHED_VARIABLE_SRC'])
mvss_tfidf.fit(data_df['MATCHED_VARIABLE_SRC_SEC'])

mvss_col_data_extractor = lambda col_data: mvss_tfidf.transform(col_data).toarray()
extract_column_features(data_df, mvss_col_data_extractor, 'MATCHED_VARIABLE_SRC')
extract_column_features(data_df, mvss_col_data_extractor, 'MATCHED_VARIABLE_SRC_SEC')

08:09:30 INFO (4182382029:6): The resulting X shape is: (57729, 22)
08:09:30 INFO (4182382029:6): The resulting X shape is: (57729, 23)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

MVN_MAX_FEATURES = 11 # This value is derived from the data exploration, can also be re-discovered from data but is taken as a constant for simplicity

# Encode variable names from MATCHED_VARIABLE_NAME
mvn_tfidf = TfidfVectorizer(max_features=MVN_MAX_FEATURES)

#TODO: Implement proper lambda ua_tfidf.fit_transform(col_data).toarray()
#mvn_col_data_extractor = lambda col_data: ...
#extract_column_features(data_df, mvn_col_data_extractor, 'MATCHED_VARIABLE_NAME')

In [16]:
# TODO: Encode values from MATCHED_VARIABLE_VALUE

In [17]:
from sklearn import preprocessing

# Normalize the data
scaler = preprocessing.StandardScaler().fit(X)
X_norm = scaler.transform(X)

In [18]:
from sklearn import decomposition

EXPLAINED_VARIANCE_PCA=0.999999

# Run PCA analysis on the provided data keep the features to explain 99% of the data variance
pca = decomposition.PCA(n_components=EXPLAINED_VARIANCE_PCA)
pca.fit(X_norm)
logger.info(f'The variance explained:\n{pca.explained_variance_ratio_}')

logger.info(f'The initial X shape: {X_norm.shape}')
X_norm_pca = pca.transform(X_norm)
logger.info(f'The X shape after PCA: {X_norm_pca.shape}')

# Report preserved feature names
rem_pca_features = pca.get_feature_names_out(input_features)
feature_map = {f'pca{idx}' : input_features[idx] for idx in range(len(input_features))}
rem_features = [feature_map[pca_name] for pca_name in rem_pca_features]
logger.info(f'The variance explained:\n{rem_features}')

08:09:30 INFO (3613636168:8): The variance explained:
[2.23923418e-01 1.27374299e-01 1.02100344e-01 6.16890502e-02
 5.60100232e-02 5.30648542e-02 4.52489871e-02 4.36878691e-02
 4.28942133e-02 4.17421258e-02 3.83847833e-02 3.11456307e-02
 2.98089043e-02 2.30760973e-02 2.07008233e-02 1.58855002e-02
 1.55895692e-02 1.22741455e-02 1.02198605e-02 3.96982418e-03
 1.02074550e-03 1.80839077e-04 8.09354335e-06]
08:09:30 INFO (3613636168:10): The initial X shape: (57729, 23)
08:09:30 INFO (3613636168:12): The X shape after PCA: (57729, 23)
08:09:30 INFO (3613636168:18): The variance explained:
['CLIENT_IP_0', 'CLIENT_IP_1', 'CLIENT_IP_2', 'CLIENT_IP_3', 'CLIENT_IP_4', 'CLIENT_IP_5', 'CLIENT_IP_6', 'CLIENT_IP_7', 'REQUEST_SIZE', 'RESPONSE_CODE', 'CLIENT_USERAGENT_0', 'CLIENT_USERAGENT_1', 'CLIENT_USERAGENT_2', 'CLIENT_USERAGENT_3', 'CLIENT_USERAGENT_4', 'CLIENT_USERAGENT_5', 'CLIENT_USERAGENT_6', 'CLIENT_USERAGENT_7', 'CLIENT_USERAGENT_8', 'CLIENT_USERAGENT_9', 'IS_USERAGENT_VALID', 'MATCHED_VARI

In [19]:
# TODO: Apply DBSCAN for clustering of the provided data (tune hyperparameters to find the optimal number of clusters)

In [20]:
# TODO: For each cluster compute the central point and the radius

In [21]:
# TODO: Store the trained models (TfidfVectorizer, PCA and etc) along with the Classifier information: Class Id, Central point, Radius