## Libraries

In [None]:
# Base Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.figure as fgr
from matplotlib.pyplot import figure
import seaborn as sns
import time
import os

# Data Analysis Libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler,StandardScaler, RobustScaler, MaxAbsScaler, LabelEncoder, OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score
import pylab
from scipy.stats import skew



#Ensemble Technique
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

##Neural Network


## Dataset

In [None]:
# Collect the parquet shard paths for each split. os.walk yields directories and
# files in arbitrary, filesystem-dependent order, so both are sorted here to keep
# the discovery order (and anything built from these lists) reproducible.
dfps_tr = []
dfps_ts = []
for dirname, dirnames, filenames in os.walk('/kaggle/input/cicddos2019/'):
    dirnames.sort()  # in-place sort steers the order in which os.walk descends
    for filename in sorted(filenames):
        if filename.endswith('-training.parquet'):
            split_paths = dfps_tr
        elif filename.endswith('-testing.parquet'):
            split_paths = dfps_ts
        else:
            continue  # neither a training nor a testing shard
        dfp = os.path.join(dirname, filename)
        split_paths.append(dfp)
        print(dfp)

In [None]:
# Read every discovered shard and stack the frames into one table per split,
# renumbering the rows from zero.
train_df = pd.concat(list(map(pd.read_parquet, dfps_tr)), ignore_index=True)
test_df = pd.concat(list(map(pd.read_parquet, dfps_ts)), ignore_index=True)

In [None]:
# Write the combined training frame to 'output.csv' (relative path), omitting the index column.
train_df.to_csv('output.csv', index=False)

In [None]:
train_df.shape, test_df.shape

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.columns

## Attributes

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and high-cardinality groups.

    Rules (``object`` means the column dtype compares equal to ``"O"``):

    * object columns with more than ``car_th`` unique values are reported
      separately as high-cardinality columns;
    * non-object columns with fewer than ``cat_th`` unique values are treated
      as categorical;
    * ``cat_cols`` holds the remaining object columns followed by those
      low-cardinality non-object columns;
    * ``num_cols`` holds every other non-object column.

    A short summary (row count, column count and the size of each group) is
    printed as a side effect.

    Args:
        dataframe: pandas DataFrame to inspect.
        cat_th: non-object columns with fewer unique values than this count
            as categorical.
        car_th: object columns with more unique values than this count as
            high-cardinality.

    Returns:
        Tuple ``(cat_cols, num_cols, cat_but_cat)`` of lists of column names.
    """
    columns = list(dataframe.columns)

    # Both facts are needed by several rules below; computing them once per
    # column avoids repeated full-column scans on large frames.
    is_object = {col: dataframe[col].dtypes == "O" for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # cat_cols, cat_but_car
    num_but_cat = [col for col in columns if n_unique[col] < cat_th and not is_object[col]]
    cat_but_cat = [col for col in columns if n_unique[col] > car_th and is_object[col]]
    cat_cols = [col for col in columns if is_object[col]] + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_cat]

    # num_cols
    num_cols = [col for col in columns if not is_object[col] and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_cat: {len(cat_but_cat)}')

    return cat_cols, num_cols, cat_but_cat


# Group the training columns, then echo the three lists as the cell output.
column_groups = grab_col_names(train_df)
cat_cols, num_cols, cat_but_cat = column_groups
column_groups

In [None]:
# Show the distinct values of every categorical column.
for cat_col in cat_cols:
    distinct_values = train_df[cat_col].unique()
    print(cat_col, distinct_values)

## Missing Values

In [None]:
# Report the number of null entries per column.
for column_name in train_df.columns:
    n_missing = train_df[column_name].isnull().sum()
    print(column_name, n_missing)

# EDA

## Categorical Columns

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts of one column and optionally chart them.

    The printed table has one row per distinct value, with the count under
    ``col_name`` and its share of all rows (in percent) under ``Ratio``.

    Args:
        dataframe: pandas DataFrame holding the column.
        col_name: name of the column to summarise; it is concatenated into
            the plot titles, so it must be a string when ``plot`` is true.
        plot: when true, also draw a count plot and a pie chart side by side
            and show the figure.
    """
    # Count once and reuse for the table, the pie and the legend.
    counts = dataframe[col_name].value_counts()
    print(pd.DataFrame({col_name: counts,
                        "Ratio": 100 * counts / len(dataframe)}))

    if plot:
        total = counts.sum()

        # Draw on explicit axes rather than mixing plt.subplots() with the
        # plt.subplot() state machine.
        fig, (ax_count, ax_pie) = plt.subplots(1, 2, figsize=(8, 6))

        sns.countplot(x=dataframe[col_name], ax=ax_count)
        ax_count.set_title("Frequency of " + col_name)
        ax_count.tick_params(axis='x', labelrotation=90)

        ax_pie.pie(x=counts, labels=counts.index,
                   autopct=lambda p: '{:.2f}% ({:.0f})'.format(p, p / 100 * total))
        ax_pie.set_title("Frequency of " + col_name)
        ax_pie.legend(labels=['{} - {:.2f}%'.format(index, value / total * 100)
                              for index, value in zip(counts.index, counts)],
                      loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=1)
        plt.show(block=True)

# Summarise and chart each categorical column of the training frame.
for cat_col in cat_cols:
    cat_summary(train_df, cat_col, plot=True)

## Numerical Columns

In [None]:
# Distribution Plots:
def my_histplot(df, col, ax):
    """Draw a histogram with a KDE overlay for ``df[col]`` on ``ax`` and title it."""
    column_values = df[col]
    sns.histplot(data=column_values, kde=True, ax=ax)
    ax.set_title(f'Histogram Plot of {col}')
def my_distplot(df, col, ax):
    """Draw a density-scaled histogram with a KDE curve for ``df[col]`` on ``ax``.

    ``sns.distplot`` is deprecated in seaborn and scheduled for removal; a
    density histogram with ``kde=True`` is the replacement seaborn documents
    for it. ``cut=3`` lets the KDE curve extend past the data range.
    """
    sns.histplot(data=df[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
    ax.set_title(f'Distribution Plot of {col}')
def my_kdeplot(df, col, ax):
    """Draw a filled kernel-density estimate of ``df[col]`` on ``ax`` and title it."""
    column_values = df[col]
    sns.kdeplot(data=column_values, fill=True, ax=ax)
    ax.set_title(f'KDE Plot of {col}')

# Relational Plots:
def my_scatterplot(df, col, ax):
    """Scatter the values of ``df[col]`` (passed to seaborn as ``data``) on ``ax`` and title it."""
    column_values = df[col]
    sns.scatterplot(data=column_values, ax=ax)
    ax.set_title(f'Scatter Plot of {col}')
def my_lineplot(df, col, ax):
    """Draw ``df[col]`` (passed to seaborn as ``data``) as a line on ``ax`` and title it."""
    column_values = df[col]
    sns.lineplot(data=column_values, ax=ax)
    ax.set_title(f'Line Plot of {col}')
    
# Categorical Plots:
def my_pie_chart(df, col, ax):
    """Draw the value counts of ``df[col]`` as a pie on ``ax``.

    Each wedge is labelled with the value it stands for and annotated with
    its percentage to one decimal place.
    """
    counts = df[col].value_counts()
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%')
    ax.set_title(f'Pie Chart of {col}')
def my_countplot(df, col, ax):
    """Draw a count plot of ``df[col]`` on ``ax`` with vertical x tick labels."""
    column_values = df[col]
    sns.countplot(x=column_values, ax=ax)
    ax.set_title(f'Count Plot of {col}')
    # Re-apply the existing tick labels, rotated by 90 degrees.
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=90)
def my_boxplot(df, col, ax):
    """Draw a vertical box plot of ``df[col]`` on ``ax`` (no title is set)."""
    column_values = df[col]
    sns.boxplot(y=column_values, ax=ax)
def my_violinplot(df, col, ax):
    """Draw a vertical violin plot of ``df[col]`` on ``ax`` (no title is set)."""
    column_values = df[col]
    sns.violinplot(y=column_values, ax=ax)
    
# Matrix Plots:
def my_heatmap(df, size=None):
    """Show an annotated heatmap of the pairwise correlations of ``df``.

    Args:
        df: DataFrame whose ``corr()`` matrix is plotted.
        size: optional ``(width, height)`` passed to ``plt.figure`` as
            ``figsize``. When omitted or falsy, no new figure is created and
            the heatmap is drawn on whatever figure pyplot provides.
    """
    if size:
        plt.figure(figsize=size)
    sns.heatmap(df.corr(), annot=True, fmt=".1f", cmap='Blues', annot_kws={"size": 12})
    plt.title('Correlation Heatmap')
    plt.show()
    
#vsplot
def my_vsplot(df, normal_col, label_col, positive_label=1):
    """Overlay two bar charts of ``normal_col`` counts: all rows vs. flagged rows.

    Red bars show how often each value of ``normal_col`` occurs in ``df``;
    blue bars, drawn on top, show the counts restricted to rows where
    ``label_col`` equals ``positive_label``.

    Args:
        df: DataFrame holding both columns.
        normal_col: column whose values are counted and placed on the x axis.
        label_col: column used to select the flagged rows; also used as the
            second legend entry.
        positive_label: value of ``label_col`` that marks a flagged row.
            Defaults to ``1``, the value that was previously hard-coded.
    """
    # Count once per series instead of once per plt.bar() argument.
    all_counts = df[normal_col].value_counts()
    flagged_counts = df.loc[df[label_col] == positive_label, normal_col].value_counts()

    plt.figure(figsize=(10, 6), dpi=80)
    plt.bar(list(all_counts.index), list(all_counts.values), color='r')
    plt.bar(list(flagged_counts.index), list(flagged_counts.values), color='b')

    plt.xlabel(normal_col)
    plt.ylabel('Count')
    plt.legend(['All', label_col])
    # plt.title('The number of requests from different protocols')
    
def plot_charts_grid_single_feature(df, plot_func, size=(12, 4), n_col=1):
    """Draw one chart per column of ``df`` on a grid that is ``n_col`` panels wide.

    Args:
        df: DataFrame whose columns are plotted, one panel each, in row-major
            order. Nothing is drawn when it has no columns.
        plot_func: callable invoked as ``plot_func(df, column_label, ax)``.
        size: ``(width, height)`` of a single panel; the figure size is this
            scaled by the number of grid columns and rows.
        n_col: number of panels per grid row.

    After each call the panel's x label is set to the column label; panels
    left over in the last row are switched off, then the figure is shown.
    """
    labels = list(df.columns)
    if not labels:
        return

    n_rows = -(-len(labels) // n_col)  # ceiling division
    figure_size = (size[0] * n_col, size[1] * n_rows)
    # squeeze=False always yields a 2-D array, whatever the grid shape.
    fig, grid = plt.subplots(n_rows, n_col, figsize=figure_size, squeeze=False)
    panels = grid.ravel()

    for panel, label in zip(panels, labels):
        plot_func(df, label, panel)
        panel.set_xlabel(label)

    for spare_panel in panels[len(labels):]:
        spare_panel.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
plot_charts_grid_single_feature(train_df[num_cols], my_distplot)

In [None]:
# cnt = 1
# for i in train_df.columns:
#     skew_val = skew(train_df[i], axis=0, bias=True)
        
#     print(cnt, ". ", i, " = ", skew_val)
#     cnt=cnt+1


In [None]:
plot_charts_grid_single_feature(train_df[num_cols], my_boxplot, size=(2, 4), n_col=6)

## Packet Size and Flow Duration Analysis

In [None]:
# Boxplot of Flow Duration by Attack Label
# Flow Duration spread for each value of Label
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df, x='Label', y='Flow Duration', ax=ax)
ax.set_title('Flow Duration Distribution for DDoS vs Normal Traffic')
plt.show()

# Packet Length Mean spread per Protocol, split by Label
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_df, x='Protocol', y='Packet Length Mean', hue='Label', ax=ax)
ax.set_title('Packet Length Mean by Protocol and Attack Label')
plt.show()


## Packet Flags and Attack Patterns

In [None]:
# Counting the number of occurrences of each flag in attacks vs normal traffic
# One count plot per TCP flag counter, with bars split by Label.
flag_columns = ['SYN Flag Count', 'ACK Flag Count', 'FIN Flag Count', 'RST Flag Count']

for flag in flag_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=train_df, x=flag, hue='Label', ax=ax)
    ax.set_title(f'{flag} Distribution by Attack Label')
    plt.show()


In [None]:
# Requests per protocol: every row in red, overdrawn in blue by the rows whose Label equals 1.
# NOTE(review): this compares Label with the integer 1 — confirm the column is encoded that way here.
protocol_counts = train_df.Protocol.value_counts()
malicious_counts = train_df[train_df.Label == 1].Protocol.value_counts()

figure(figsize=(10, 6), dpi=80)
plt.bar(list(protocol_counts.index), list(protocol_counts.values), color='r')
plt.bar(list(malicious_counts.index), list(malicious_counts.values), color='b')

plt.xlabel('Protocol')
plt.ylabel('Count')
plt.legend(['All', 'malicious'])
plt.title('The number of requests from different protocols')

In [None]:
# sns.pairplot(train_df[['Flow Duration', 'Flow Packets/s', 'Flow Bytes/s', 'Label']], hue='Label', palette='Set2')
# plt.show()

## Visualization of Flow and Packet Trends

In [None]:
# # Boxplot for Flow Packets/s by Attack Label
# plt.figure(figsize=(12, 6))
# sns.boxplot(x='Label', y='Flow Packets/s', data=train_df)
# plt.title('Flow Packets per Second by Attack Label')
# plt.show()

# # Boxplot for Flow Bytes/s by Protocol and Attack Label
# plt.figure(figsize=(12, 6))
# sns.boxplot(x='Protocol', y='Flow Bytes/s', hue='Label', data=train_df)
# plt.title('Flow Bytes per Second by Protocol and Attack Label')
# plt.show()


## Correlation

In [None]:
# Scale the square figure with the number of numeric columns (two thirds of it, plus one).
n_numeric_cols = train_df.select_dtypes(include=[np.number]).shape[1] // 3 * 2
heatmap_size = (n_numeric_cols + 1, n_numeric_cols + 1)
my_heatmap(train_df.select_dtypes(include=[np.number]), size=heatmap_size)

# Feature Engineering

## Feature Selection

In [None]:
# Hand-picked columns to discard from the training frame.
remove_cols = [
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'FIN Flag Count',
    'Fwd Avg Bytes/Bulk',
    'Fwd Avg Packets/Bulk',
    'Fwd Avg Bulk Rate',
    'Bwd Avg Bytes/Bulk',
    'Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'ECE Flag Count',
    'PSH Flag Count',
]

In [None]:
# Remove the listed columns; like the in-place form, this raises if any of them is absent.
train_df = train_df.drop(columns=remove_cols)

In [None]:
# Select only numerical columns
# Restrict the analysis to the numeric columns.
numerical_df = train_df.select_dtypes(include=[np.number])

# Absolute pairwise correlations.
corr_matrix = numerical_df.corr().abs()

# Keep the strict upper triangle only, so every pair of columns is inspected once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(mask)

# A column is flagged when its correlation with any column before it is strictly above 0.8.
to_drop = upper_tri.columns[(upper_tri > 0.8).any(axis=0)].tolist()

# Remove the flagged columns from the numeric frame.
numerical_df = numerical_df.drop(columns=to_drop)

In [None]:
to_drop

In [None]:
# Second batch of columns to discard from the training frame.
# NOTE(review): this literal looks like a pasted copy of the `to_drop` list computed by the
# correlation step — confirm it is still in sync if the data or the 0.8 threshold changes.
remove_col1 = ['Bwd Packets Length Total',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow IAT Std',
 'Flow IAT Max',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Fwd Packets/s',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'RST Flag Count',
 'Avg Packet Size',
 'Avg Fwd Segment Size',
 'Avg Bwd Segment Size',
 'Subflow Fwd Packets',
 'Subflow Fwd Bytes',
 'Subflow Bwd Packets',
 'Subflow Bwd Bytes',
 'Fwd Act Data Packets',
 'Fwd Seg Size Min',
 'Active Max',
 'Active Min',
 'Idle Mean',
 'Idle Max',
 'Idle Min']

In [None]:
# Remove the second batch of columns; raises if any of them is absent.
train_df = train_df.drop(columns=remove_col1)

In [None]:
# Redraw the correlation heatmap for the numeric columns that remain, sized as before.
n_numeric_cols = train_df.select_dtypes(include=[np.number]).shape[1] // 3 * 2
heatmap_size = (n_numeric_cols + 1, n_numeric_cols + 1)
my_heatmap(train_df.select_dtypes(include=[np.number]), size=heatmap_size)

## Feature Transformation

In [None]:
train_df.head()

In [None]:
train_df['Label'].unique()

In [None]:
# Apply one-hot encoding to the 'Label' column: it is replaced by one indicator
# column per distinct value, each named with the 'Label' prefix.
# NOTE(review): after this step there is no single 'Label' column left; the commented-out
# split further down still reads 'Label' — confirm which form the target should take.
train_df = pd.get_dummies(train_df, columns=['Label'], prefix='Label')

In [None]:
train_df.head()

## Splitting

In [None]:
# X = df.drop(columns='Label')
# y = df['Label']

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# MODELS

## Optuna

In [None]:
# import optuna
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# import xgboost as xgb
# import lightgbm as lgb
# from catboost import CatBoostClassifier


# # Objective functions for each model

# # 1. Random Forest
# def objective_rf(trial):
#     # Define hyperparameter space
#     n_estimators = trial.suggest_int('n_estimators', 100, 1000)
#     max_depth = trial.suggest_int('max_depth', 5, 30)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    
#     # Train model
#     model = RandomForestClassifier(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         random_state=42
#     )
#     model.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = model.predict(X_test)
#     return f1_score(y_test, y_pred, average='weighted')

# # 2. XGBoost
# def objective_xgb(trial):
#     # Define hyperparameter space
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 30),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
#     }
    
#     model = xgb.XGBClassifier(**params, use_label_encoder=False, random_state=42)
#     model.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = model.predict(X_test)
#     return f1_score(y_test, y_pred, average='weighted')

# # 3. LightGBM
# def objective_lgb(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 30),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'num_leaves': trial.suggest_int('num_leaves', 31, 512),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
#     }

#     model = lgb.LGBMClassifier(**params, random_state=42)
#     model.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = model.predict(X_test)
#     return f1_score(y_test, y_pred, average='weighted')

# # 4. CatBoost
# def objective_catboost(trial):
#     params = {
#         'iterations': trial.suggest_int('iterations', 100, 1000),
#         'depth': trial.suggest_int('depth', 3, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0),
#         'border_count': trial.suggest_int('border_count', 32, 255)
#     }

#     model = CatBoostClassifier(**params, verbose=0, random_state=42)
#     model.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = model.predict(X_test)
#     return f1_score(y_test, y_pred, average='weighted')

# # 5. Gradient Boosting
# def objective_gb(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'max_depth': trial.suggest_int('max_depth', 3, 30),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0)
#     }

#     model = GradientBoostingClassifier(**params, random_state=42)
#     model.fit(X_train, y_train)
    
#     # Evaluate
#     y_pred = model.predict(X_test)
#     return f1_score(y_test, y_pred, average='weighted')

# # Run the Optuna study for each model
# def run_study(objective_func, n_trials=50):
#     study = optuna.create_study(direction='maximize')
#     study.optimize(objective_func, n_trials=n_trials)
#     return study

# # Optimize each model
# print("Optimizing Random Forest...")
# rf_study = run_study(objective_rf)

# print("Optimizing XGBoost...")
# xgb_study = run_study(objective_xgb)

# print("Optimizing LightGBM...")
# lgb_study = run_study(objective_lgb)

# print("Optimizing CatBoost...")
# catboost_study = run_study(objective_catboost)

# print("Optimizing Gradient Boosting...")
# gb_study = run_study(objective_gb)

# # Show the best hyperparameters for each model
# print("Best parameters for Random Forest:", rf_study.best_params)
# print("Best parameters for XGBoost:", xgb_study.best_params)
# print("Best parameters for LightGBM:", lgb_study.best_params)
# print("Best parameters for CatBoost:", catboost_study.best_params)
# print("Best parameters for Gradient Boosting:", gb_study.best_params)

## Random Forest

In [None]:
# ## Random Forest

# rf_params = {
#     'n_estimators': [100, 200, 300],
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
# }

# skf = RepeatedStratifiedKFold(n_splits=5)

# RF = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
#                               param_distributions=rf_params, cv=skf, n_iter=2, n_jobs=2)
                            
# RF_model = RF.fit(X_train, y_train)
# RF_pred = RF_model.predict(X_test)
# accuracy_score(y_test, RF_pred)

## XGBoost

## LightGBM

In [None]:
# lgbm_params = {
#         'num_leaves': [249], 
#         'learning_rate': [0.02636616162598401], 
#         'n_estimators': [546],
#         'subsample_for_bin': [50], 
#         'min_child_samples': [77], 
#         'lambda_l1': [8.242410039948067e-07],
#         'lambda_l2': [0.4063299210212167],
#         'colsample_bytree': [0.8107657422421071], 
#         'subsample': [0.8727733774586144], 
#         'max_depth': [10],
#     }

# skf = RepeatedStratifiedKFold(n_splits=5)

# lgbm = RandomizedSearchCV(estimator=LGBMClassifier(random_state=42, verbosity=-1),
#                               param_distributions=lgbm_params, cv=skf, n_jobs=3, verbose=-1)
                      
# lgbm_model = lgbm.fit(X_train, y_train)
# lgbm_pred = lgbm_model.predict(X_test)
# accuracy_score(y_test, lgbm_pred)

## CatBoosting

## AutoGluon

In [None]:
# from IPython.display import clear_output
# !pip install autogluon --user
# clear_output()
# from autogluon.tabular import TabularPredictor
# ID="ID"
# TARGET = "Crime_Category"
# TRAIN_PATH = "/kaggle/input/crime-cast-forecasting-crime-categories/train.csv"
# TEST_PATH = "/kaggle/input/crime-cast-forecasting-crime-categories/test.csv"
# SAMPLE_SUBISSION_PATH = "/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv"
# SUBMISSION_PATH = "submission.csv"
# predictor = TabularPredictor(label=TARGET).fit(train_data=TRAIN_PATH)
# pred_test = predictor.predict(TEST_PATH)