In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import warnings

# Silence only pandas' PerformanceWarning; every other warning category still surfaces
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Importing one of the 13 datasets
# Folder holding the CTU-13 CSV exports; edit this single value to match your machine.
DATA_DIR = Path("C:\\Users\\Hp\\Downloads\\CTU-13")
df = pd.read_csv(DATA_DIR / "1-Neris-20110810.binetflow.csv")

In [4]:
# Preview the first rows to check that the file parsed into the expected columns
df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/10 09:46:59.607825,1.026539,tcp,94.44.127.113,1577,->,147.32.84.59,6881,S_RA,0.0,0.0,4,276,156,flow=Background-Established-cmpgw-CVUT
1,2011/08/10 09:47:00.634364,1.009595,tcp,94.44.127.113,1577,->,147.32.84.59,6881,S_RA,0.0,0.0,4,276,156,flow=Background-Established-cmpgw-CVUT
2,2011/08/10 09:47:48.185538,3.056586,tcp,147.32.86.89,4768,->,77.75.73.33,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
3,2011/08/10 09:47:48.230897,3.111769,tcp,147.32.86.89,4788,->,77.75.73.33,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
4,2011/08/10 09:47:48.963351,3.083411,tcp,147.32.86.89,4850,->,77.75.73.33,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt


In [5]:
# (number of rows, number of columns) of the loaded capture
df.shape

(2824636, 15)

In [6]:
# Conversion to date-time: parse the StartTime strings so the column holds
# pandas datetime values instead of plain text
df["StartTime"] = pd.to_datetime(df["StartTime"])

In [7]:
# List every column together with the dtype pandas assigned to it
for col_name, col_dtype in df.dtypes.items():
    print(f"{col_name} -> {col_dtype}")

StartTime -> datetime64[ns]
Dur -> float64
Proto -> object
SrcAddr -> object
Sport -> object
Dir -> object
DstAddr -> object
Dport -> object
State -> object
sTos -> float64
dTos -> float64
TotPkts -> int64
TotBytes -> int64
SrcBytes -> int64
Label -> object


### An exploration on variables

* Categorical Columns -> Proto (15), Dir (7), State (230) (nan=1), Sport (nan=9379),  Dport (nan = 4390)
* Numerical columns -> Dur, sTos (nan = 10590), dTos (nan = 195190), TotPkts, TotBytes, SrcBytes
* IP addresses (structured like numbers but stored as strings) -> SrcAddr,  DstAddr
* Time Series -> StartTime

In [8]:
# Dropping the only row with a nan State value
df = df.dropna(subset=["State"])

# Fill the remaining gaps in one pass and bind the result back to `df`.
# Calling fillna(inplace=True) on a column selection such as df['Sport'] operates on an
# intermediate object: pandas 2.x warns about it, and with Copy-on-Write enabled the
# parent frame is left unchanged.
df = df.fillna({
    # Filling "unknown" in place of nan values in Sport and Dport
    "Sport": "unknown",
    "Dport": "unknown",
    # Replacing the nan values in sTos and dTos with the median of respective columns
    "sTos": df["sTos"].median(),
    "dTos": df["dTos"].median(),
})

# Checking for any leftover nan values in each columns
for column in df:
    print(f"{column} -> {df[column].isna().sum()}")

StartTime -> 0
Dur -> 0
Proto -> 0
SrcAddr -> 0
Sport -> 0
Dir -> 0
DstAddr -> 0
Dport -> 0
State -> 0
sTos -> 0
dTos -> 0
TotPkts -> 0
TotBytes -> 0
SrcBytes -> 0
Label -> 0


Hence, we have dealt with all the NaN values in our dataset.

In [9]:
# Label-encode the categorical features and the target with scikit-learn
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Columns that hold categories rather than quantities
categorical_cols = ["Proto", "Dir", "State", "Sport", "Dport"]

# The single encoder is re-fitted for each column in turn; "Label" goes last, so after
# the loop the encoder holds the mapping of the target column
for col in categorical_cols + ["Label"]:
    df[col] = label_encoder.fit_transform(df[col])

# Derived features: packets per second and bytes per second of each flow
flow_seconds = df['Dur']
df['PacketsPerSec'] = df['TotPkts'] / flow_seconds
df['BytesPerSec'] = df['TotBytes'] / flow_seconds

# Zero-duration flows divide by zero: turn the resulting +/-inf into NaN,
# then set every NaN to 0
df.replace([float('inf'), -float('inf')], float('nan'), inplace=True)
df.fillna(0, inplace=True)

df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,PacketsPerSec,BytesPerSec
0,2011-08-10 09:46:59.607825,1.026539,11,94.44.127.113,6373,0,147.32.84.59,70633,203,0.0,0.0,4,276,156,3,3.896588,268.864602
1,2011-08-10 09:47:00.634364,1.009595,11,94.44.127.113,6373,0,147.32.84.59,70633,203,0.0,0.0,4,276,156,3,3.961985,273.376948
2,2011-08-10 09:47:48.185538,3.056586,11,147.32.86.89,41591,0,77.75.73.33,71762,193,0.0,0.0,3,182,122,4,0.981487,59.543556
3,2011-08-10 09:47:48.230897,3.111769,11,147.32.86.89,41812,0,77.75.73.33,71762,193,0.0,0.0,3,182,122,4,0.964082,58.487632
4,2011-08-10 09:47:48.963351,3.083411,11,147.32.86.89,42496,0,77.75.73.33,71762,193,0.0,0.0,3,182,122,4,0.972948,59.02554


In [10]:
# Target: the encoded Label column. Features: every numeric column except the target
# (non-numeric columns are left out by select_dtypes)
y = df['Label']
X = df.select_dtypes(include='number').drop(columns='Label', errors='ignore')

## Model Building

In [11]:
# Import necessary ML packages from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB


# Import other necessary packages from scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Hold out 30% of the rows for testing (70% for training); the fixed seed keeps the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Deploying different models and checking their accuracy

### Decision Tree

A decision stump (a decision tree of depth 1) is a good starting point for a decision tree model, since it gives us a simple baseline against which to compare the main model.

In [26]:
# Baseline: a depth-1 tree (decision stump)
stump = DecisionTreeClassifier(random_state=42, max_depth=1).fit(X_train, y_train)
print(f"Accuracy on Decision Stump -> {stump.score(X_test, y_test)}")

# Unrestricted tree; kept in `clf` for the cells that follow
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print(f"Accuracy on full Decision Tree -> {clf.score(X_test, y_test)}")

Accuracy on Decision Stump -> 0.7344354613159687
Accuracy on full Decision Tree -> 0.9629993710105489


In [27]:
# Depth of the tree currently held in `clf` (the unrestricted tree fitted above)
clf.get_depth()

54

The full decision tree has a depth of 54.

Let's look at the accuracies of decision trees of different depths.

In [14]:
# Sweep max_depth in steps of 5, finishing at 54 (the depth of the unrestricted tree)
depth_grid = [*range(5, 51, 5), 54]
for i in depth_grid:
    clf = DecisionTreeClassifier(max_depth=i, random_state=42)
    clf.fit(X_train, y_train)
    print(f"Accuracy on Decision Tree with depth {i}-> {clf.score(X_test, y_test)}")

Accuracy on Decision Tree with depth 5-> 0.9117278800459292
Accuracy on Decision Tree with depth 10-> 0.9400383058116029
Accuracy on Decision Tree with depth 15-> 0.9545534469920025
Accuracy on Decision Tree with depth 20-> 0.9603665840208357
Accuracy on Decision Tree with depth 25-> 0.9629344659077097
Accuracy on Decision Tree with depth 30-> 0.9633947020914785
Accuracy on Decision Tree with depth 35-> 0.9632755127208101
Accuracy on Decision Tree with depth 40-> 0.9630878779689659
Accuracy on Decision Tree with depth 45-> 0.9631693043707096
Accuracy on Decision Tree with depth 50-> 0.9630725367628403
Accuracy on Decision Tree with depth 54-> 0.9629993710105489


Beyond a depth of about 20, the accuracies do not deviate much from that of the full decision tree. We will choose a depth small enough to reduce computation and big enough to keep the accuracy high. A depth between 15 and 20 fits that description. Let's go with 17.

In [15]:
# Refit at the chosen depth and report its test accuracy.
# The label names the depth explicitly: this tree is capped at 17 levels, so it is
# not the "full" (unrestricted) tree.
clf = DecisionTreeClassifier(random_state=42, max_depth = 17)
clf.fit(X_train, y_train)
print(f"Accuracy on Decision Tree with depth 17 -> {clf.score(X_test, y_test)}")

Accuracy on full Decision Tree -> 0.9574151719808212


### Random Forest

Let's see how Random Forest performs on the dataset instead of the Decision Tree

In [16]:
# Ten trees of depth <= 17, each split drawn from 5 candidate features
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=17,
    max_features=5,
    random_state=42,
).fit(X_train, y_train)
print(f"Accuracy on Random Forest Classifier -> {clf.score(X_test, y_test)}")

Accuracy on Random Forest Classifier -> 0.9574092715169267


The accuracy of Random Forest is very similar to that of Decision Tree. However, the Random Forest took more time, since it fits ten trees (each split considering a random subset of 5 features) instead of a single tree. Hence we can say that the overall performance of Decision Tree seems to be better.

**Note** - We will not rely on a boosting algorithm such as AdaBoost for the final model, for two reasons:
1) The dataset is itself very large. So it will take a large amount of time to fit subsequent models on the dataset each time.
2) The accuracy from simple Decision Trees and random Forest are good enough.

Hence applying AdaBoost is unlikely to give us any significant advantage once both execution time and model accuracy are considered. The quick experiment below illustrates this.

In [17]:
from sklearn.ensemble import AdaBoostClassifier

In [18]:
# Small AdaBoost run on decision stumps, kept short because of the dataset size.
# random_state is set on the ensemble itself: AdaBoost hands its own seed to the base
# estimator at every boosting round, so seeding only the stump does not make the run
# repeatable.
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, random_state=42),
    n_estimators=5,
    learning_rate = 2,
    random_state=42,
)
clf.fit(X_train, y_train)
print(f"Accuracy on AdaBoost Classifier -> {clf.score(X_test, y_test)}")

Accuracy on AdaBoost Classifier -> 0.8142002924269907


The model ran for a long time even though it used only Decision Stumps and a small number of estimators, and the accuracy is still not high enough. The run time will increase a lot with larger base models and a higher number of estimators. Thus we will not pursue AdaBoost any further.

Just for completion, we will deploy some other models on the dataset and compare their accuracies.

### GaussianNB and MultinomialNB

In [19]:
# Gaussian naive Bayes on the raw features
clf = GaussianNB().fit(X_train, y_train)
print(f"Accuracy on GaussianNB -> {clf.score(X_test, y_test)}")

# Multinomial naive Bayes on the same features
clf = MultinomialNB().fit(X_train, y_train)
print(f"Accuracy on MultinomialNB -> {clf.score(X_test, y_test)}")

Accuracy on GaussianNB -> 0.46950817273254025
Accuracy on MultinomialNB -> 0.20206020597339364


The Naive Bayes Classifiers seem to perform much worse.

### SVC

In [20]:
# Intentionally disabled: the SVC loop is wrapped in a string literal, so nothing is
# fitted here; the cell only echoes the string.
# NOTE(review): per the scikit-learn docs, kernel='precomputed' expects a kernel matrix
# rather than raw features - confirm before re-enabling this loop.
'''
kernels = ['linear', 'poly', 'sigmoid', 'precomputed', 'rbf']

for k in kernels:
    clf = SVC(random_state = 42, kernel = k)
    clf.fit(X_train, y_train)
    print(f"Accuracy on SVC (with kernel = {k}) -> {clf.score(X_test, y_test)}")
'''

'\nkernels = [\'linear\', \'poly\', \'sigmoid\', \'precomputed\', \'rbf\']\n\nfor k in kernels:\n    clf = SVC(random_state = 42, kernel = k)\n    clf.fit(X_train, y_train)\n    print(f"Accuracy on SVC (with kernel = {k}) -> {clf.score(X_test, y_test)}")\n'

The SVC models above have been commented out because none of the kernels was able to finish fitting the data. Even after running for 
around 12 hours on a Kaggle Notebook, training had not completed. This is most likely because the dataset is too large for Support Vector Classifiers, whose training time grows rapidly with the number of samples.

### Deploying model on other datasets.

Hence we can conclude that Decision Tree gives the best accuracy in comparatively less time. We will now apply a Decision Tree to the remaining 12 datasets and check its accuracy on each.

In [21]:
# Folder holding the CTU-13 CSV exports; edit this single value to match your machine.
DATA_DIR = Path("C:\\Users\\Hp\\Downloads\\CTU-13")

# The remaining 12 captures. Position in this list (starting at 1) is the key under
# which the capture is stored in df_dict, so the order must not change.
OTHER_CAPTURES = [
    "9-Neris-20110817",
    "8-Murlo-20110816-3",
    "6-Menti-20110816",
    "7-Sogou-20110816-2",
    "4-Rbot-20110815",
    "5-Virut-20110815-2",
    "3-Rbot-20110812",
    "2-Neris-20110811",
    "13-Virut-20110815-3",
    "12-NsisAy-20110819",
    "10-Rbot-20110818",
    "11-Rbot-20110818-2",
]

# key (1-12) -> raw DataFrame read from the matching CSV file
df_dict = {
    key: pd.read_csv(DATA_DIR / f"{capture}.binetflow.csv")
    for key, capture in enumerate(OTHER_CAPTURES, start=1)
}

In [22]:
def preprocessing(c):
    """Clean and encode dataset ``c`` and store it back as a ``(X, y)`` pair.

    Reads the raw DataFrame at ``df_dict[c]``, fills missing values, label-encodes
    the categorical columns and the target, adds two per-second rate features, and
    replaces ``df_dict[c]`` with a tuple ``(X, y)``: ``X`` holds every numeric
    column except ``Label``, ``y`` is the encoded ``Label`` column.

    Parameters
    ----------
    c : hashable
        Key of the dataset inside the module-level ``df_dict``.

    Returns
    -------
    None
        The result is written to ``df_dict[c]``. Calling the function again for a
        key that already holds a tuple is a no-op.
    """
    from sklearn.preprocessing import LabelEncoder

    df = df_dict[c]
    if isinstance(df, tuple):
        # Already converted to (X, y) by an earlier call: re-running the cell must not
        # fail on a tuple that has no columns.
        return

    # Fill all gaps in one pass and bind the result to a new frame. Calling
    # fillna(inplace=True) on a column selection such as df['State'] acts on an
    # intermediate object: pandas 2.x warns about it, and with Copy-on-Write enabled
    # the parent frame is left unchanged.
    df = df.fillna({
        # Impute missing 'State' values with the mode (most frequent value) of the column
        "State": df["State"].mode()[0],
        # Filling "unknown" in place of nan values in Sport and Dport
        "Sport": "unknown",
        "Dport": "unknown",
        # Replacing the nan values in sTos and dTos with the median of respective columns
        "sTos": df["sTos"].median(),
        "dTos": df["dTos"].median(),
    })

    # Conversion to date-time
    df["StartTime"] = pd.to_datetime(df["StartTime"])

    # Encoding Categorical Features using Label Encoding from scikit-learn;
    # the single encoder is re-fitted for every column
    label_encoder = LabelEncoder()

    # The Categorical variables we have to encode
    categorical_cols = ["Proto", "Dir", "State", "Sport", "Dport"]

    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])

    # Encoding the Labels to integers
    df["Label"] = label_encoder.fit_transform(df["Label"])

    # Creating New Features
    # Rate of packets per second
    df['PacketsPerSec'] = df['TotPkts'] / df['Dur']
    # Bytes per second
    df['BytesPerSec'] = df['TotBytes'] / df['Dur']

    # Replace any NaN or infinite values (with zero) generated during the feature engineering
    df = df.replace([float('inf'), -float('inf')], float('nan')).fillna(0)

    # Drop non-numeric columns and separate features (X) and target (y) variables
    X = df.select_dtypes(include=['number']).drop(['Label'], axis=1, errors='ignore')
    y = df['Label']

    df_dict[c] = (X, y)

In [23]:
# Run the preprocessing for dataset keys 1 through 12
for dataset_key in range(1, 13):
    preprocessing(dataset_key)

In [24]:
def model(c, max_depth=17):
    """Fit a depth-limited decision tree on dataset ``c`` and report its test accuracy.

    Parameters
    ----------
    c : hashable
        Key into the module-level ``df_dict``; the value must be the ``(X, y)``
        pair stored by ``preprocessing``.
    max_depth : int or None, default 17
        Maximum depth passed to ``DecisionTreeClassifier``. The default reproduces
        the depth used throughout this notebook.

    Returns
    -------
    float
        Accuracy on the 30% hold-out split. The same value is also printed.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split

    features, target = df_dict[c]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    clf = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    clf.fit(X_train, y_train)

    # Score once, then both print and return it so callers can collect the results
    accuracy = clf.score(X_test, y_test)
    print(f"Accuracy on dataset {c} = {accuracy}")
    return accuracy

In [25]:
# Fit and score a Decision Tree on dataset keys 1 through 12
for dataset_key in range(1, 13):
    model(dataset_key)

Accuracy on dataset 1 = 0.8945873313181734
Accuracy on dataset 2 = 0.9568900638519456
Accuracy on dataset 3 = 0.9234595290918199
Accuracy on dataset 4 = 0.9342566619915849
Accuracy on dataset 5 = 0.9158398325419315
Accuracy on dataset 6 = 0.9185365853658537
Accuracy on dataset 7 = 0.9749177747963476
Accuracy on dataset 8 = 0.9601299321395849
Accuracy on dataset 9 = 0.9765039953596689
Accuracy on dataset 10 = 0.9352839966407899
Accuracy on dataset 11 = 0.8569189032366429
Accuracy on dataset 12 = 0.9039656887120835


A simple model like Decision Tree seems to fit all the datasets reasonably well, although a few of them show lower accuracy. This suggests that building machine learning models that classify botnet traffic with significant accuracy is possible. Hence we can build more complex models and train them for longer, to get high accuracies when deployed in the real world.

For making an overall good model, we can create a model for each of the datasets separately, fine-tune them and use a voting algorithm on all of them and then deploy this model on the real world. Now we can decide whether to use one model type each time or a collection of models. We could also combine these datasets, shuffle and divide them into a desired number of smaller datasets and then apply the above steps on them. The list of ideas goes on and on.