In [1]:
import time

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [2]:
# Read the training and testing partitions; the 'id' column is discarded from each
# immediately after loading.
# NOTE(review): the published UNSW-NB15 partitions also ship an `attack_cat` column
# next to `label` -- confirm whether these files contain it. If they do, it is kept
# as a feature and would leak the target.
df_train = pd.read_csv("UNSW_NB15_training-set.csv").drop(columns='id')
df_test = pd.read_csv("UNSW_NB15_testing-set.csv").drop(columns='id')

In [3]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate ``df``, integer-encode its non-numeric features and standardize all features.

    Args:
        df: Frame that contains a 'label' column plus feature columns. It is
            not modified.

    Returns:
        A new frame with the same feature column names and row index as the
        deduplicated input. Every feature has been passed through
        ``StandardScaler`` and the original 'label' column is appended
        unchanged.

    Raises:
        KeyError: if ``df`` has no 'label' column.

    Note:
        The encoders and the scaler are fitted inside this call on the frame
        that is passed in. Two frames processed by two separate calls are
        therefore not guaranteed to share category codes or scaling statistics.
    """
    # drop_duplicates() returns a new frame, so no defensive deep copy is needed.
    deduped = df.drop_duplicates()

    X, y = deduped.drop(columns='label'), deduped['label']

    # Encode categorical columns as integer codes. Testing for "not numeric"
    # instead of `dtype == object` also catches pandas string / categorical
    # dtypes, which would otherwise reach the scaler as raw strings.
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            X[col] = LabelEncoder().fit_transform(X[col])

    # Standardize every feature column (encoded ones included); keep names and index.
    scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns, index=X.index)
    scaled['label'] = y

    return scaled

def feature_select_with_corr(df: pd.DataFrame, threshold: float) -> list:
    """Return the names of features that are highly correlated with a later feature.

    The absolute pairwise correlation matrix of every column except 'label' is
    computed and only its strict upper triangle is inspected. For each pair of
    columns whose absolute correlation exceeds ``threshold``, the column that
    comes first in column order is the one reported.

    Args:
        df: Frame with a 'label' column plus the feature columns to compare.
            It is only read, never modified.
        threshold: Absolute-correlation cut-off; the comparison is strict
            (``> threshold``).

    Returns:
        A list of column names, in column order, that callers can drop.

    Raises:
        KeyError: if ``df`` has no 'label' column.
    """
    # Nothing below mutates df, so no copy is taken.
    corr_matrix = df.drop(columns='label').corr().abs()

    # k=1 zeroes the diagonal and the lower triangle, leaving each pair once.
    upper_triangle = np.triu(corr_matrix.to_numpy(), k=1)

    # Row i is flagged when any entry to the right of the diagonal is above the cut-off.
    flagged = (upper_triangle > threshold).any(axis=1)
    return corr_matrix.columns[flagged].tolist()

def resample_df(df: pd.DataFrame, random_state: int = 42, k_neighbors: int = 5) -> pd.DataFrame:
    """Oversample ``df`` with SMOTE and return the resampled frame.

    Args:
        df: Frame with a 'label' column (the resampling target) plus feature
            columns.
        random_state: Seed forwarded to ``SMOTE``. Defaults to the previously
            hard-coded value 42.
        k_neighbors: Neighbour count forwarded to ``SMOTE``. Defaults to the
            previously hard-coded value 5.

    Returns:
        A new frame holding the resampled features under the original feature
        column names, with the resampled target stored in 'label'.

    Raises:
        KeyError: if ``df`` has no 'label' column.
    """
    features, target = df.drop(columns='label'), df['label']

    sampler = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    features_res, target_res = sampler.fit_resample(features, target)

    # Column names are taken from the input features, so the rebuild does not
    # depend on the sampler handing back a DataFrame.
    resampled = pd.DataFrame(features_res, columns=features.columns)
    resampled['label'] = target_res

    return resampled


def pca_transform_df(df: pd.DataFrame, pca: PCA) -> pd.DataFrame:
    """Project the feature columns of ``df`` through ``pca`` and re-attach 'label'.

    Args:
        df: Frame with a 'label' column plus the feature columns to project.
            The caller's frame is left untouched.
        pca: Object whose ``transform`` method is applied to the feature
            columns.

    Returns:
        A frame indexed like ``df`` whose columns are 'PC1', 'PC2', ... (one
        per column returned by ``pca.transform``) followed by 'label'.
    """
    working = df.copy()  # deep copy by default

    # Separate features from the target before projecting.
    features = working.drop(columns='label')
    target = working['label']

    components = pca.transform(features)
    n_components = components.shape[1]
    component_names = [f'PC{i+1}' for i in range(n_components)]

    # Rebuild a labelled frame on the original row index.
    projected = pd.DataFrame(components, index=features.index, columns=component_names)
    projected['label'] = target

    return projected

In [4]:
# Feature pipeline: preprocess -> correlation filter -> SMOTE (train only) -> PCA.
CORR_THRESHOLD = 0.5      # absolute-correlation cut-off for dropping features
N_PCA_COMPONENTS = 10     # number of principal components kept
PCA_RANDOM_STATE = 42     # seeds PCA's randomized solver, when that solver is selected

# Preprocess both partitions.
# NOTE(review): preprocess_df fits its encoders and scaler inside each call, so the
# test partition is encoded and scaled independently of the training partition here.
df_train, df_test = preprocess_df(df_train), preprocess_df(df_test)

# Choose the features to drop from the training partition only, then remove the
# same columns from both partitions (rebinding instead of mutating in place).
highly_correlated_features = feature_select_with_corr(df_train, CORR_THRESHOLD)
df_train = df_train.drop(columns=highly_correlated_features)
df_test = df_test.drop(columns=highly_correlated_features)

# Resample the training partition with SMOTE; the test partition is left as-is.
df_train = resample_df(df_train)

# Fit PCA on the training features and project both partitions with it.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE).fit(df_train.drop(columns='label'))
df_train, df_test = pca_transform_df(df_train, pca), pca_transform_df(df_test, pca)

In [5]:
# Split each partition into its feature matrix and its 'label' target.
y_train, y_test = df_train['label'], df_test['label']
X_train = df_train.drop(columns='label')
X_test = df_test.drop(columns='label')

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

In [7]:
# Baseline: a default-configured LogisticRegression stands in for an existing
# reference model that the ensemble is compared against.
existing_model = LogisticRegression().fit(X_train, y_train)
existing_preds = existing_model.predict(X_test)

print("\nExisting Model Performance:")
print(classification_report(y_test, existing_preds))


Existing Model Performance:
              precision    recall  f1-score   support

           0       0.89      0.74      0.81     34206
           1       0.68      0.85      0.75     21739

    accuracy                           0.78     55945
   macro avg       0.78      0.80      0.78     55945
weighted avg       0.81      0.78      0.79     55945



In [8]:
# Hard-voting ensemble over seven heterogeneous classifiers.
# The tree-based and boosting estimators are randomized, so they are seeded to keep
# the reported metrics reproducible across Restart & Run All.
ENSEMBLE_RANDOM_STATE = 42

model1 = LogisticRegression()
model2 = KNeighborsClassifier()
model3 = SVC()
model4 = DecisionTreeClassifier(random_state=ENSEMBLE_RANDOM_STATE)
model5 = RandomForestClassifier(random_state=ENSEMBLE_RANDOM_STATE)
model6 = GradientBoostingClassifier(random_state=ENSEMBLE_RANDOM_STATE)
model7 = AdaBoostClassifier(random_state=ENSEMBLE_RANDOM_STATE)

# perf_counter() is meant for measuring durations and, unlike time.time(), is not
# affected by wall-clock adjustments.
start_time = time.perf_counter()

ensemble = VotingClassifier(
    estimators=[
        ('lr', model1),
        ('knn', model2),
        ('svc', model3),
        ('dt', model4),
        ('rf', model5),
        ('gb', model6),
        ('ab', model7),
    ],
    voting='hard',
)
ensemble.fit(X_train, y_train)
ensemble_preds = ensemble.predict(X_test)

end_time = time.perf_counter()

print(f"\nExecution took {end_time - start_time} seconds.")
print("\nEnsemble Model Performance:")
print(classification_report(y_test, ensemble_preds))


Execution took 218.0678834915161 seconds.

Ensemble Model Performance:
              precision    recall  f1-score   support

           0       0.99      0.86      0.92     34206
           1       0.82      0.99      0.90     21739

    accuracy                           0.91     55945
   macro avg       0.91      0.92      0.91     55945
weighted avg       0.92      0.91      0.91     55945



In [9]:
### PICKING THE BEST HYPERPARAMETERS WITH GRIDSEARCH

# X = df.drop('label', axis=1)
# y = df['label']

# # Define base models
# model1 = LogisticRegression()
# model2 = KNeighborsClassifier()
# model3 = SVC()
# model4 = DecisionTreeClassifier()
# model5 = RandomForestClassifier()
# model6 = AdaBoostClassifier()
# model7 = GradientBoostingClassifier()

# # Define ensemble model
# ensemble = VotingClassifier(estimators=[('lr', model1), ('knn', model2), ('svc', model3), ('dt', model4), ('rf', model5), ('ab', model6), ('gb', model7)], voting='hard')

# # Define hyperparameters to tune
# params = {
#     'lr__C': [1, 10], 
#     'rf__n_estimators': [10, 50], 
#     'svc__C': [1, 10], 
#     'knn__n_neighbors': [3, 5, 7], 
#     'dt__max_depth': [None, 5, 10],
#     'ab__n_estimators': [50, 100],
#     'gb__n_estimators': [50, 100],
#     'gb__learning_rate': [0.1, 0.01]
# }

# # Perform grid search
# grid = GridSearchCV(estimator=ensemble, param_grid=params, cv=5)
# grid.fit(X, y)

# # Print best parameters
# print(grid.best_params_)

In [10]:
### PICKING TOP MODELS


# X = df.drop('label', axis=1)
# y = df['label']

# # Define base models
# model1 = LogisticRegression()
# model2 = KNeighborsClassifier()
# model3 = SVC()
# model4 = DecisionTreeClassifier()
# # ... Add more models

# # List of models
# models = [model1, model2, model3, model4]

# # Dictionary to hold model names and their scores
# scores = {}

# # Evaluate each model
# for model in models:
#     model_name = type(model).__name__
#     print(model_name)
#     score = cross_val_score(model, X, y, cv=2, scoring='accuracy').mean()
#     scores[model_name] = score

# # Sort models based on score
# sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

# # Select top k models
# top_k = 4
# top_models = sorted_scores[:top_k]

# # Print top k models
# for i, (model_name, score) in enumerate(top_models):
#     print(f"Model {i+1}: {model_name}, Score: {score}")