In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt

In [2]:
# Read the provided test data from its CSV file into a DataFrame
wrangled_csv_path = '../data/part_10_wrangled.csv'
data_df = pd.read_csv(wrangled_csv_path)

In [3]:
# Let pandas infer the best available dtype for every column and rebind
# data_df to the converted frame
data_df = data_df.convert_dtypes()
# Display the per-column dtypes to check what was inferred
data_df.dtypes

EVENT_ID                     string
CLIENT_IP                    string
CLIENT_USERAGENT             string
IS_USERAGENT_VALID          boolean
REQUEST_SIZE                  Int64
RESPONSE_CODE                 Int64
MATCHED_VARIABLE_SRC         string
MATCHED_VARIABLE_SRC_SEC     string
MATCHED_VARIABLE_NAME        string
MATCHED_VARIABLE_VALUE       string
dtype: object

In [4]:
# Start the two dimensional feature matrix with one row per record in
# data_df and zero columns; feature columns are appended to it later
row_count = len(data_df)
X = np.empty((row_count, 0))

In [5]:
# Names of the registered feature columns, in registration order
input_features = []


def register_features(name, data):
    """Record one feature name per column of ``data`` in ``input_features``.

    Args:
        name: Base name of the feature.
        data: Two dimensional array-like; only ``data.shape[1]`` is read.

    A single-column block is registered under ``name`` itself; a block with
    several columns is registered as ``name_0``, ``name_1``, ...
    """
    width = data.shape[1]
    if width > 1:
        labels = [f'{name}_{col}' for col in range(width)]
    else:
        # width is 0 or 1 here, giving either no label or the bare name
        labels = [name] * width
    input_features.extend(labels)

In [6]:
# Common wrapper used by every feature extraction cell
def extract_column_features(data_df, data_extractor, col_name):
    """Extract features from one column and append them to the global ``X``.

    Args:
        data_df: DataFrame holding the source column.
        data_extractor: Callable applied to ``data_df[col_name]``; its result
            is joined to ``X`` along axis 1.
        col_name: Name of the column to extract; also used as the base name
            when registering the new feature columns.

    Side effects: rebinds the global ``X``, prints its new shape and calls
    ``register_features`` for the extracted block.
    """
    global X
    extracted = data_extractor(data_df[col_name])
    # Joining along axis 1 adds the extracted block as extra columns
    X = np.concatenate((X, extracted), axis=1)
    print(f'The resulting X shape is: {X.shape}')

    # Keep the feature name list in step with the columns of X
    register_features(col_name, extracted)

In [7]:
# Convert CLIENT_IP into 8 distinct IP int value feature columns
def ip_col_data_extractor(col_data):
    """Turn a column of ':'-separated hex strings into a 2D integer array.

    Each value is split on ':' and every piece is parsed as a base-16
    integer; the per-row arrays are then stacked into one array with a row
    per value.
    """
    def to_groups(address):
        pieces = address.split(':')
        return np.array([int(piece, 16) for piece in pieces])

    per_row = col_data.apply(to_groups)
    return np.stack(per_row.values)


extract_column_features(data_df, ip_col_data_extractor, 'CLIENT_IP')

The resulting X shape is: (57729, 8)


In [8]:
# Add the REQUEST_SIZE
def rs_col_data_extractor(col_data):
    """Return the column as an (n, 1) float64 NumPy array.

    REQUEST_SIZE has the pandas nullable ``Int64`` dtype, so ``.values`` is a
    pandas extension array rather than a NumPy array. Converting explicitly
    with ``to_numpy`` keeps X a plain numeric array regardless of how the
    installed pandas version exposes extension arrays to NumPy.

    NOTE(review): missing values are not handled here - confirm the wrangled
    data contains none in this column.
    """
    return col_data.to_numpy(dtype='float64').reshape(-1, 1)


extract_column_features(data_df, rs_col_data_extractor, 'REQUEST_SIZE')

The resulting X shape is: (57729, 9)


In [9]:
# Add the RESPONSE_CODE
def rc_col_data_extractor(col_data):
    """Return the column as an (n, 1) float64 NumPy array.

    RESPONSE_CODE has the pandas nullable ``Int64`` dtype, so ``.values`` is a
    pandas extension array rather than a NumPy array. Converting explicitly
    with ``to_numpy`` keeps X a plain numeric array regardless of how the
    installed pandas version exposes extension arrays to NumPy.

    NOTE(review): missing values are not handled here - confirm the wrangled
    data contains none in this column.
    """
    return col_data.to_numpy(dtype='float64').reshape(-1, 1)


extract_column_features(data_df, rc_col_data_extractor, 'RESPONSE_CODE')

The resulting X shape is: (57729, 10)


In [10]:
# TODO: Encode labels from MATCHED_VARIABLE_SRC and MATCHED_VARIABLE_SRC_SEC

In [11]:
# TODO: Encode variable names from MATCHED_VARIABLE_NAME

In [12]:
# TODO: Encode values from MATCHED_VARIABLE_VALUE

In [13]:
# TODO: Encode values from CLIENT_USERAGENT

In [14]:
# Convert IS_USERAGENT_VALID into integers and add to X
def iuv_col_data_extractor(col_data):
    """Cast the column to ``int`` and return it as a single-column 2D array."""
    as_ints = col_data.astype('int')
    return as_ints.values.reshape(-1, 1)


extract_column_features(data_df, iuv_col_data_extractor, 'IS_USERAGENT_VALID')

The resulting X shape is: (57729, 11)


In [15]:
from sklearn import preprocessing

# Normalize the data: fit a StandardScaler on X and transform X with it.
# The fitted scaler is kept in `scaler`.
scaler = preprocessing.StandardScaler()
X_norm = scaler.fit_transform(X)

In [16]:
from sklearn import decomposition

# Run PCA on the normalized data, keeping as many components as are needed
# to explain 99% of the data variance
pca = decomposition.PCA(n_components=0.99)
pca.fit(X_norm)
print(f'The variance explained:\n{pca.explained_variance_ratio_}')
# These are the names of the PCA output components, not variances
print(f'The PCA feature names:\n{pca.get_feature_names_out(input_features)}')

print(f'The initial X shape: {X.shape}')
X_norm_pca = pca.transform(X_norm)
# Report the shape of the projected data, not of the untouched input X
print(f'The X shape after PCA: {X_norm_pca.shape}')

The variance explained:
[0.26657768 0.11803594 0.11550613 0.09063994 0.08922088 0.08316607
 0.07238139 0.06252017 0.04344444 0.03274857 0.02575877]
The variance explained:
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
 'pca10']
The initial X shape: (57729, 11)
The X shape after PCA: (57729, 11)


In [17]:
# TODO: Apply DBSCAN for clustering of the provided data (tune hyperparameters to find the optimal clusters)

In [18]:
# TODO: For each cluster compute the central point and the radius

In [19]:
# TODO: Store the trained models (TfidfVectorizer, PCA, etc.) along with the classifier information: class ID, central point, radius