In [11]:
import time

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [2]:
# Read the train/test CSV files and discard the "id" column from each
# (presumably a row identifier with no predictive value — confirm).
df_train = pd.read_csv("UNSW_NB15_training-set.csv").drop(columns="id")
df_test = pd.read_csv("UNSW_NB15_testing-set.csv").drop(columns="id")

In [3]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate, integer-encode and standardize a labelled frame.

    Steps (the caller's frame is never modified):
      1. drop exact duplicate rows;
      2. replace every object-dtype feature column with ``LabelEncoder`` codes;
      3. standardize *all* feature columns, encoded ones included, with
         ``StandardScaler``;
      4. re-attach the unscaled ``'label'`` column as the last column.

    The encoders and the scaler are created and fitted inside this call, so
    two frames processed by separate calls do not share a category-to-integer
    mapping or scaling statistics.

    Args:
        df: Frame holding the feature columns plus a ``'label'`` column.

    Returns:
        A new frame with the same feature columns and row index as the
        deduplicated input, followed by ``'label'``.
    """
    deduped = df.drop_duplicates()
    features = deduped.drop(columns='label')
    target = deduped['label']

    # Categorical (object-dtype) columns -> integer codes, one encoder per column
    text_columns = [name for name in features.columns if features[name].dtype == object]
    for name in text_columns:
        features[name] = LabelEncoder().fit_transform(features[name])

    # Zero-mean / unit-variance scaling of every feature column
    scaled_values = StandardScaler().fit_transform(features)
    result = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)
    result['label'] = target

    return result

def feature_select_with_corr(df: pd.DataFrame, threshold: float) -> list:
    """List the feature columns that are redundant because of high correlation.

    The absolute pairwise correlation matrix of every column except
    ``'label'`` is computed, and only its strict upper triangle is inspected.
    A feature is therefore reported when its absolute correlation with at
    least one feature positioned *after* it is strictly greater than
    ``threshold``; the last feature of a correlated group is never reported.

    Args:
        df: Frame holding the feature columns plus a ``'label'`` column.
            It is only read, never modified.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        The names of the features to drop, in column order. Note this is a
        list of column names, not a DataFrame.
    """
    # No defensive copy is needed: nothing below mutates ``df``.
    corr_matrix = df.drop('label', axis=1).corr().abs()

    # k=1 zeroes the diagonal and the lower triangle, so each pair is seen once
    upper_triangle = np.triu(corr_matrix, k=1)

    # Row i holds the correlations of feature i with the features after it.
    # NaN correlations (e.g. from constant columns) compare as False.
    is_redundant = (upper_triangle > threshold).any(axis=1)
    return corr_matrix.columns[is_redundant].tolist()

def resample_df(df: pd.DataFrame) -> pd.DataFrame:
    """Oversample a labelled frame with SMOTE.

    The feature columns and the ``'label'`` column are passed to
    ``SMOTE(random_state=42, k_neighbors=5).fit_resample`` and the resampled
    result is reassembled into a single frame. The caller's frame is not
    modified.

    Args:
        df: Frame holding the feature columns plus a ``'label'`` column.

    Returns:
        A new frame with the resampled features followed by ``'label'``.
    """
    working = df.copy(deep=True)
    target = working.pop('label')  # leaves only the feature columns in `working`

    # Fixed seed keeps the resampling reproducible
    oversampler = SMOTE(random_state=42, k_neighbors=5)
    features_res, target_res = oversampler.fit_resample(working, target)

    # Stitch features and labels back together
    balanced = pd.DataFrame(features_res, columns=features_res.columns)
    balanced['label'] = target_res

    return balanced


def pca_transform_df(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
    """Project the feature columns of a labelled frame with ``pca``.

    Only ``pca.transform`` is called, so ``pca`` has to be fitted by the
    caller beforehand. The caller's frame is not modified.

    Args:
        df: Frame holding the feature columns plus a ``'label'`` column.
        pca: Transformer whose ``transform`` maps the feature matrix to the
            component matrix.

    Returns:
        A new frame with columns ``PC1`` ... ``PCn`` (one per output
        component), the same row index as ``df``, and ``'label'`` last.
    """
    working = df.copy(deep=True)
    target = working.pop('label')  # leaves only the feature columns in `working`

    components = pca.transform(working)
    n_components = components.shape[1]
    component_names = [f'PC{k}' for k in range(1, n_components + 1)]

    # Keep the original row index so the labels line up again
    projected = pd.DataFrame(components, columns=component_names, index=working.index)
    projected['label'] = target

    return projected

In [4]:
# Deduplicate, encode and standardize both datasets.
# NOTE(review): preprocess_df fits fresh encoders and a fresh scaler on every
# call, so the train and test frames are encoded and scaled independently
# here; their category-to-integer mappings and scaling statistics can differ.
df_train, df_test = preprocess_df(df_train), preprocess_df(df_test)

# Pick the redundant features on the training set only, then drop the same
# columns from both frames so they keep an identical feature layout.
highly_correlated_features = feature_select_with_corr(df_train, 0.8)
df_train = df_train.drop(columns=highly_correlated_features)
df_test = df_test.drop(columns=highly_correlated_features)

# Oversample the training set only; the test set is left as it is
df_train = resample_df(df_train)

# Fit PCA on the training features and project both frames with it.
# random_state is pinned (42, the seed SMOTE already uses) because PCA's
# automatic solver choice can select a randomized SVD, which would otherwise
# make the components differ from run to run.
pca = PCA(n_components=10, random_state=42).fit(df_train.drop("label", axis=1))
df_train, df_test = pca_transform_df(df_train, pca), pca_transform_df(df_test, pca)

In [5]:
# Separate the feature matrix from the target column for each split
X_train, y_train = df_train.drop(columns='label'), df_train['label']
X_test, y_test = df_test.drop(columns='label'), df_test['label']

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report

In [7]:
# Baseline: logistic regression with default settings, trained on the
# training split and scored on the held-out test split.
existing_model = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator
existing_model_preds = existing_model.predict(X_test)

print("\nExisting model performance:")
print(classification_report(y_true=y_test, y_pred=existing_model_preds))


Existing model performance:
              precision    recall  f1-score   support

           0       0.89      0.68      0.77     34206
           1       0.63      0.86      0.73     21739

    accuracy                           0.75     55945
   macro avg       0.76      0.77      0.75     55945
weighted avg       0.79      0.75      0.75     55945



Picking the best models using k-fold cross-validation

In [12]:
# Candidate base models, all with default hyperparameters.
# The estimators that draw random numbers are seeded (42, the seed SMOTE
# already uses) so that the ranking below is reproducible across runs.
model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = SVC()
model4 = DecisionTreeClassifier(random_state=42)
model5 = RandomForestClassifier(random_state=42)
model6 = GradientBoostingClassifier(random_state=42)
model7 = AdaBoostClassifier(random_state=42)

# List of models
models = [model1, model2, model3, model4, model5, model6, model7]

# Maps estimator class name -> mean cross-validated accuracy
scores = {}

# Evaluate each model with 2-fold cross-validation on the training split.
# NOTE(review): X_train was oversampled with SMOTE before this point, so the
# validation folds also contain synthetic samples; these scores may be
# optimistic compared with the untouched test set.
for model in models:
    model_name = type(model).__name__
    print(model_name)  # progress indicator, some models take a while
    score = cross_val_score(model, X_train, y_train, cv=2, scoring='accuracy').mean()
    scores[model_name] = score

# Rank models from best to worst mean accuracy
sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

for i, (model_name, score) in enumerate(sorted_scores):
    print(f"Model {i+1}: {model_name}, Score: {score}")

LogisticRegression
KNeighborsClassifier
SVC
DecisionTreeClassifier
RandomForestClassifier
GradientBoostingClassifier
AdaBoostClassifier
Model 1: KNeighborsClassifier, Score: 0.9283169203222918
Model 2: RandomForestClassifier, Score: 0.9158012533572069
Model 3: DecisionTreeClassifier, Score: 0.9120680393912265
Model 4: GradientBoostingClassifier, Score: 0.8734377797672337
Model 5: SVC, Score: 0.865550581915846
Model 6: AdaBoostClassifier, Score: 0.8612175470008953
Model 7: LogisticRegression, Score: 0.8344762757385855


In [10]:
# PICKING THE BEST HYPERPARAMETERS WITH GRIDSEARCH

# Base models for the voting ensemble.
# The stochastic ones are seeded (42, the seed SMOTE already uses) so that
# the selected hyperparameters are reproducible across runs.
chosen_model_1 = KNeighborsClassifier()
chosen_model_2 = RandomForestClassifier(random_state=42)
chosen_model_3 = DecisionTreeClassifier(random_state=42)
chosen_model_4 = GradientBoostingClassifier(random_state=42)

# Majority-vote ensemble; the short names below are the prefixes used in the
# parameter grid ("<name>__<parameter>").
ensemble = VotingClassifier(estimators=[('knn', chosen_model_1), ('rf', chosen_model_2), ('dt', chosen_model_3), ('gb', chosen_model_4)], voting='hard')

# Hyperparameters to tune: 2 * 2 * 3 * 2 * 2 = 48 combinations
params = {
    'knn__n_neighbors': [3, 5],
    'rf__n_estimators': [3, 5],
    'dt__max_depth': [None, 2, 5],
    'gb__n_estimators': [2, 5],
    'gb__learning_rate': [0.1, 0.01]
}

# Exhaustive search with 2-fold cross-validation (96 ensemble fits, plus the
# final refit on the full training split)
grid = GridSearchCV(estimator=ensemble, param_grid=params, cv=2)
grid.fit(X_train, y_train)

# Print best parameters
print(grid.best_params_)

{'dt__max_depth': None, 'gb__learning_rate': 0.1, 'gb__n_estimators': 5, 'knn__n_neighbors': 3, 'rf__n_estimators': 3}


The grid search took 8 minutes 51.4 seconds to run.

In [12]:
# Ensemble of models, configured with the hyperparameters reported by the
# grid search. The stochastic estimators are seeded (42, the seed SMOTE
# already uses) so that the reported metrics are reproducible across runs.
chosen_model_1 = KNeighborsClassifier(n_neighbors=3)
chosen_model_2 = RandomForestClassifier(n_estimators=3, random_state=42)
chosen_model_3 = DecisionTreeClassifier(max_depth=None, random_state=42)
chosen_model_4 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=5, random_state=42)

# Time training plus prediction. perf_counter() is a monotonic,
# high-resolution clock, so the interval is not affected by system clock
# adjustments the way time.time() can be.
start_time = time.perf_counter()

ensemble = VotingClassifier(estimators=[('knn', chosen_model_1), ('rf', chosen_model_2), ('dt', chosen_model_3), ('gb', chosen_model_4)], voting='hard')
ensemble.fit(X_train, y_train)
ensemble_preds = ensemble.predict(X_test)

end_time = time.perf_counter()

print(f"\nExecution took {end_time - start_time} seconds.")
print("\nEnsemble Model Performance:")
print(classification_report(y_test, ensemble_preds))


Execution took 11.883054256439209 seconds.

Ensemble Model Performance:
              precision    recall  f1-score   support

           0       0.98      0.84      0.91     34206
           1       0.80      0.98      0.88     21739

    accuracy                           0.90     55945
   macro avg       0.89      0.91      0.89     55945
weighted avg       0.91      0.90      0.90     55945

