In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/firewall-threats/threat.csv


In [39]:
!pip install --upgrade scikit-learn



In [40]:
df = pd.read_csv('/kaggle/input/firewall-threats/threat.csv')
df

Unnamed: 0,type,subtype,level,srccountry,srcintf,srcintfrole,dstintf,dstintfrole,action,proto,service,policyid,appcat,duration,sentbyte,rcvdbyte,crscore,threat
0,0,0,0,4,0,0,0,0,4,6.0,2,72.0,1,0.0,0.0,0.0,30.0,2
1,0,0,0,6,0,0,0,0,4,6.0,4,89.0,1,0.0,0.0,0.0,30.0,2
2,0,0,0,4,0,0,0,0,4,6.0,2,72.0,1,0.0,0.0,0.0,30.0,2
3,0,0,0,4,0,0,0,0,4,6.0,2,72.0,1,0.0,0.0,0.0,30.0,2
4,0,0,0,0,0,0,0,0,5,6.0,0,42.0,1,25.0,60.0,320.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17076,0,0,0,4,0,0,0,0,4,6.0,933,72.0,1,0.0,0.0,0.0,30.0,2
17077,0,0,0,4,0,0,0,0,4,6.0,934,89.0,1,0.0,0.0,0.0,30.0,2
17078,0,0,0,4,0,0,0,0,4,6.0,933,72.0,1,0.0,0.0,0.0,30.0,2
17079,0,0,0,4,0,0,0,0,4,6.0,933,72.0,1,0.0,0.0,0.0,30.0,2


# Initial Trial
As a first attempt, we use `resample` to upsample the minority classes, and then train a random forest classifier on the upsampled data.

In [41]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


X = df.drop('threat', axis=1)  
y = df['threat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=666)

train_data = pd.concat([X_train, y_train], axis=1)

majority_class = train_data[train_data['threat'] == 1]  

minority_class_1 = train_data[train_data['threat'] == 2] 
minority_class_2 = train_data[train_data['threat'] == 3]  

minority_upsampled_1 = minority_class_1

minority_upsampled_2 = resample(minority_class_2,
                                replace=True,  
                                n_samples=len(majority_class),  
                                random_state=42)


upsampled_data = pd.concat([majority_class, minority_upsampled_1, minority_upsampled_2])

X_train_upsampled = upsampled_data.drop('threat', axis=1)
y_train_upsampled = upsampled_data['threat']
print(y_train_upsampled.value_counts())
X_train_upsampled
y_test.value_counts()

threat
2    7975
1    4833
3    4833
Name: count, dtype: int64


threat
2    2601
1    1667
3       3
Name: count, dtype: int64

In [42]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_upsampled, y_train_upsampled)

model.score(X_test, y_test)

0.9995317255911964

As we can see, the model is performing well at first glance. However, for a more detailed analysis, we will use the `classification_report` and `confusion_matrix` functions.

---

In [43]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1667    0    0]
 [   0 2601    0]
 [   0    2    1]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1667
           2       1.00      1.00      1.00      2601
           3       1.00      0.33      0.50         3

    accuracy                           1.00      4271
   macro avg       1.00      0.78      0.83      4271
weighted avg       1.00      1.00      1.00      4271



Upon further analysis, we can see that the model is not performing well for the third level of threat, which being the most dangerous level, does not bode well for our objective.

We will take a detailed look at the feature importances of the model, in order to better understand the importance of each feature, in an attempt to improve our model.

---

In [44]:
feature_importances = model.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': range(1, X.shape[1] + 1), 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df.Feature= feature_importance_df.Feature.apply(lambda x: X.columns[x-1])

# Print and plot the feature importances
print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
        Feature  Importance
8        action    0.205822
13     duration    0.121824
2         level    0.095075
11     policyid    0.093002
0          type    0.085418
1       subtype    0.084629
16      crscore    0.080706
14     sentbyte    0.069511
10      service    0.053434
12       appcat    0.048886
15     rcvdbyte    0.048643
3    srccountry    0.009366
4       srcintf    0.001796
5   srcintfrole    0.001097
6       dstintf    0.000310
9         proto    0.000289
7   dstintfrole    0.000192


# Feature-Tuning
We can see that the features which have less than 1% of their importance are `proto`, `dstintfrole`, `srcintfrole`, `dstintf`, `srcintf`. Hence, we can remove these features from our dataset.

In [45]:
X = df.drop(['proto','dstintfrole','srcintfrole','dstintf','srcintf','threat'], axis=1)  # replace 'target_column' with the actual column name
y = df['threat'].apply(lambda x: x-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=214)

train_data = pd.concat([X_train, y_train], axis=1)

majority_class = train_data[train_data['threat'] == 0]  
minority_class_1 = train_data[train_data['threat'] == 1]  
minority_class_2 = train_data[train_data['threat'] == 2]  

minority_upsampled_1 = minority_class_1

minority_upsampled_2 = resample(minority_class_2,
                                replace=True, 
                                n_samples=len(majority_class),  
                                random_state=42)


upsampled_data = pd.concat([majority_class, minority_upsampled_1, minority_upsampled_2])


X_train_upsampled = upsampled_data.drop('threat', axis=1)
y_train_upsampled = upsampled_data['threat']
print(y_train_upsampled.value_counts())
X_train_upsampled
y_test.value_counts()

threat
1    6841
0    4258
2    4258
Name: count, dtype: int64


threat
1    3735
0    2242
2       2
Name: count, dtype: int64

# Model Ensemble
We will use bagging to improve our model's performance. For this, we will use the `RandomForestClassifier` model, `XGBoost` model, `SVC` model, and `LightGBM` model and compare their individual performances, in order to form an ensemble model.

## Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_upsampled, y_train_upsampled)

rf.score(X_test, y_test)

0.9998327479511624

As we can see, the model is performing well at first glance. However, for a more detailed analysis, we will use the `classification_report` and `confusion_matrix` functions.

---

In [47]:
from sklearn.metrics import confusion_matrix, classification_report

y_predrf = rf.predict(X_test)
print(confusion_matrix(y_test, y_predrf))
print(classification_report(y_test, y_predrf))

[[2242    0    0]
 [   0 3734    1]
 [   0    0    2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2242
           1       1.00      1.00      1.00      3735
           2       0.67      1.00      0.80         2

    accuracy                           1.00      5979
   macro avg       0.89      1.00      0.93      5979
weighted avg       1.00      1.00      1.00      5979



We can see that the model is performing considerably better after feature selection. It has misclassified a level 2 threat as a level 3 threat, but this error can be neglected since the model is not misclassifying any threat level to be lower than it actually is.

---

# XGBoost

In [48]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train_upsampled, y_train_upsampled)

xgb.score(X_test, y_test)

0.9998327479511624

As we can see, the model is performing well at first glance. However, for a more detailed analysis, we will use the `classification_report` and `confusion_matrix` functions.

---

In [49]:
from sklearn.metrics import confusion_matrix, classification_report

y_predxgb = xgb.predict(X_test)
print(confusion_matrix(y_test, y_predxgb))
print(classification_report(y_test, y_predxgb))

[[2242    0    0]
 [   0 3734    1]
 [   0    0    2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2242
           1       1.00      1.00      1.00      3735
           2       0.67      1.00      0.80         2

    accuracy                           1.00      5979
   macro avg       0.89      1.00      0.93      5979
weighted avg       1.00      1.00      1.00      5979



We can see that it has misclassified a level 2 threat as a level 3 threat. Hence, we can assume that the model is not misclassifying any threat level to be lower than it actually is.

---

## SVC

In [50]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train_upsampled, y_train_upsampled)

svc.score(X_test, y_test)

0.9533366783743101

As we can see the model is not performing well at all. However, for a more detailed analysis, we will use the `classification_report` and `confusion_matrix` functions.

In [51]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = svc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2173    4   65]
 [   1 3525  209]
 [   0    0    2]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      2242
           1       1.00      0.94      0.97      3735
           2       0.01      1.00      0.01         2

    accuracy                           0.95      5979
   macro avg       0.67      0.97      0.66      5979
weighted avg       1.00      0.95      0.98      5979



Even though the model is not performing well, it is still able to correctly classify the level 3 threats. However, a lot of misclassifications are there, and hence, we will not be using this model.

---

## LightGBM Classifier

In [52]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()

lgbm.fit(X_train_upsampled, y_train_upsampled)

lgbm.score(X_test, y_test)

0.9998327479511624

As we can see, the model is performing well at first glance. However, for a more detailed analysis, we will use the `classification_report` and `confusion_matrix` functions.

---

In [53]:
from sklearn.metrics import confusion_matrix, classification_report

y_predlgbm = lgbm.predict(X_test)
print(confusion_matrix(y_test, y_predlgbm))
print(classification_report(y_test, y_predlgbm))

[[2242    0    0]
 [   0 3734    1]
 [   0    0    2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2242
           1       1.00      1.00      1.00      3735
           2       0.67      1.00      0.80         2

    accuracy                           1.00      5979
   macro avg       0.89      1.00      0.93      5979
weighted avg       1.00      1.00      1.00      5979



We can see that it has misclassified a level 2 threat as a level 3 threat. Hence, we can assume that the model is not misclassifying any threat level to be lower than it actually is.

---

## Comparing Misclassifications

In [54]:
misclassified_indices = np.where(y_test != y_predrf)[0]
print(X_test.loc[misclassified_indices])
misclassified_indices = np.where(y_test != y_predxgb)[0]
print(X_test.loc[misclassified_indices])
misclassified_indices = np.where(y_test != y_predlgbm)[0]
print(X_test.loc[misclassified_indices])

      type  subtype  level  srccountry  action  service  policyid  appcat  \
2634     0        0      0           0       5        0      47.0       1   

      duration  sentbyte  rcvdbyte  crscore  
2634      25.0      52.0     260.0      5.0  
      type  subtype  level  srccountry  action  service  policyid  appcat  \
2634     0        0      0           0       5        0      47.0       1   

      duration  sentbyte  rcvdbyte  crscore  
2634      25.0      52.0     260.0      5.0  
      type  subtype  level  srccountry  action  service  policyid  appcat  \
2634     0        0      0           0       5        0      47.0       1   

      duration  sentbyte  rcvdbyte  crscore  
2634      25.0      52.0     260.0      5.0  


Upon comparing the misclassified indices, we can see that the models are missclassifying the same level 2 threats as level 3 threats.

---

# Voting Classifier

We will be using the Voting Classifier to combine the models. We will be using `soft` voting, since multi-class classification is involved.

In [55]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb), ('lgbm', lgbm)], voting='soft')

model.fit(X_train_upsampled, y_train_upsampled)

model.score(X_test, y_test)

0.9998327479511624

In [56]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[2242    0    0]
 [   0 3734    1]
 [   0    0    2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2242
           1       1.00      1.00      1.00      3735
           2       0.67      1.00      0.80         2

    accuracy                           1.00      5979
   macro avg       0.89      1.00      0.93      5979
weighted avg       1.00      1.00      1.00      5979



# Model Export

We will be using pickle to export the model, so that it can be used in the Threat Detection pipeline.

In [57]:
import pickle

with open('raksha_ultra_xlf.pkl', 'wb') as f:
    pickle.dump(model, f)