In [1]:
import numpy as np 
from numpy import dtype
import time
import pandas as pd 

%matplotlib inline


In [2]:
# Column name -> NumPy dtype for the flow-feature columns plus the string
# 'Label' column. Codes: 'int64' / 'float64' are numeric, 'O' is a Python object
# (string) column.
# NOTE(review): presumably meant to be passed as `dtype=dtypes` to
# `pd.read_csv`; the read_csv calls visible in this notebook do not use it —
# TODO confirm.
dtypes = {
    column: dtype(code)
    for column, code in [
        ('Dst Port', 'int64'),
        ('Protocol', 'int64'),
        ('Flow Duration', 'int64'),
        ('Tot Fwd Pkts', 'int64'),
        ('Tot Bwd Pkts', 'int64'),
        ('TotLen Fwd Pkts', 'float64'),
        ('TotLen Bwd Pkts', 'float64'),
        ('Fwd Pkt Len Max', 'float64'),
        ('Fwd Pkt Len Min', 'float64'),
        ('Fwd Pkt Len Mean', 'float64'),
        ('Fwd Pkt Len Std', 'float64'),
        ('Bwd Pkt Len Max', 'float64'),
        ('Bwd Pkt Len Min', 'float64'),
        ('Bwd Pkt Len Mean', 'float64'),
        ('Bwd Pkt Len Std', 'float64'),
        ('Flow Byts/s', 'float64'),
        ('Flow Pkts/s', 'float64'),
        ('Flow IAT Mean', 'float64'),
        ('Flow IAT Std', 'float64'),
        ('Flow IAT Max', 'float64'),
        ('Flow IAT Min', 'float64'),
        ('Fwd IAT Tot', 'float64'),
        ('Fwd IAT Mean', 'float64'),
        ('Fwd IAT Std', 'float64'),
        ('Fwd IAT Max', 'float64'),
        ('Fwd IAT Min', 'float64'),
        ('Bwd IAT Tot', 'float64'),
        ('Bwd IAT Mean', 'float64'),
        ('Bwd IAT Std', 'float64'),
        ('Bwd IAT Max', 'float64'),
        ('Bwd IAT Min', 'float64'),
        ('Fwd PSH Flags', 'int64'),
        ('Bwd PSH Flags', 'int64'),
        ('Fwd URG Flags', 'int64'),
        ('Bwd URG Flags', 'int64'),
        ('Fwd Header Len', 'int64'),
        ('Bwd Header Len', 'int64'),
        ('Fwd Pkts/s', 'float64'),
        ('Bwd Pkts/s', 'float64'),
        ('Pkt Len Min', 'float64'),
        ('Pkt Len Max', 'float64'),
        ('Pkt Len Mean', 'float64'),
        ('Pkt Len Std', 'float64'),
        ('Pkt Len Var', 'float64'),
        ('FIN Flag Cnt', 'int64'),
        ('SYN Flag Cnt', 'int64'),
        ('RST Flag Cnt', 'int64'),
        ('PSH Flag Cnt', 'int64'),
        ('ACK Flag Cnt', 'int64'),
        ('URG Flag Cnt', 'int64'),
        ('CWE Flag Count', 'int64'),
        ('ECE Flag Cnt', 'int64'),
        ('Down/Up Ratio', 'float64'),
        ('Pkt Size Avg', 'float64'),
        ('Fwd Seg Size Avg', 'float64'),
        ('Bwd Seg Size Avg', 'float64'),
        ('Fwd Byts/b Avg', 'float64'),
        ('Fwd Pkts/b Avg', 'float64'),
        ('Fwd Blk Rate Avg', 'float64'),
        ('Bwd Byts/b Avg', 'float64'),
        ('Bwd Pkts/b Avg', 'float64'),
        ('Bwd Blk Rate Avg', 'float64'),
        ('Subflow Fwd Pkts', 'int64'),
        ('Subflow Fwd Byts', 'int64'),
        ('Subflow Bwd Pkts', 'int64'),
        ('Subflow Bwd Byts', 'int64'),
        ('Init Fwd Win Byts', 'int64'),
        ('Init Bwd Win Byts', 'int64'),
        ('Fwd Act Data Pkts', 'int64'),
        ('Fwd Seg Size Min', 'int64'),
        ('Active Mean', 'float64'),
        ('Active Std', 'float64'),
        ('Active Max', 'float64'),
        ('Active Min', 'float64'),
        ('Idle Mean', 'float64'),
        ('Idle Std', 'float64'),
        ('Idle Max', 'float64'),
        ('Idle Min', 'float64'),
        ('Label', 'O'),
    ]
}

# In every column that contains inf, infinite values are replaced with NaN, and all NaN values in that column are then imputed with the column mean.

def replace_infinity_with_mean(df):
    """Impute infinite values with the mean of the remaining values of the column.

    For every column that contains ``+inf`` or ``-inf``, the infinite entries
    are turned into NaN and then *all* NaN entries of that column (including
    ones that were already missing) are filled with the column mean computed
    over the remaining values.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, to allow call chaining.

    Notes:
        If a column holds nothing but infinite/missing values its mean is NaN,
        so the column stays NaN.
    """
    infinities = [np.inf, -np.inf]
    for col in df.columns:
        # `isin` works for any dtype (it is simply all-False for string
        # columns) and, unlike `== np.inf`, also catches negative infinity.
        is_inf = df[col].isin(infinities)
        if not is_inf.any():
            continue
        cleaned = df[col].mask(is_inf)
        # Assign the column back explicitly: `df[col].replace(..., inplace=True)`
        # acts on a temporary Series and is a no-op under pandas Copy-on-Write.
        df[col] = cleaned.fillna(cleaned.mean())
    return df

def replace_negative_values_with_mean(df):
    """Impute negative values with the mean of the non-negative values of the column.

    For every numeric column that contains at least one negative value, the
    negative entries are turned into NaN and then *all* NaN entries of that
    column (including ones that were already missing) are filled with the mean
    of the remaining values.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, to allow call chaining.

    Notes:
        Integer columns that contain negatives become float columns, because
        the imputed mean is a float.
    """
    for col in df.select_dtypes(include=[np.number]).columns:
        negative = df[col] < 0
        if not negative.any():
            continue
        # Build the cleaned column as a new Series and assign it back. This
        # avoids writing NaN into an integer column via `.loc` (deprecated
        # implicit upcast) and the chained `df[col].fillna(..., inplace=True)`,
        # which is a no-op under pandas Copy-on-Write.
        cleaned = df[col].mask(negative)
        df[col] = cleaned.fillna(cleaned.mean())
    return df

def preprocessing(df):
    """Clean a raw flow DataFrame and encode its string labels as integers.

    Runs ``replace_infinity_with_mean`` and then
    ``replace_negative_values_with_mean`` on ``df``, drops the 'Timestamp'
    column and replaces every 'Label' string with its integer class id.

    Args:
        df: DataFrame with at least a 'Timestamp' and a 'Label' column. The
            cleaning helpers operate on this object directly, so the caller's
            frame may be modified.

    Returns:
        A DataFrame without 'Timestamp' whose 'Label' column holds the integer
        ids (0 = 'Benign', 1-11 = attack types).

    Raises:
        KeyError: if 'Timestamp' is missing or a label is not in the mapping.
    """
    # Label spellings must match the dataset exactly (e.g. 'Infilteration').
    label_to_id = {
        'DDoS attacks-LOIC-HTTP': 1,
        'Benign': 0,
        'DDOS attack-HOIC': 2,
        'DDOS attack-LOIC-UDP': 3,
        'DoS attacks-SlowHTTPTest': 4,
        'DoS attacks-Hulk': 5,
        'Bot': 6,
        'FTP-BruteForce': 7,
        'SSH-Bruteforce': 8,
        'Infilteration': 9,
        'DoS attacks-GoldenEye': 10,
        'DoS attacks-Slowloris': 11,
    }
    cleaned = replace_negative_values_with_mean(replace_infinity_with_mean(df))
    cleaned = cleaned.drop(columns=['Timestamp'])
    cleaned['Label'] = cleaned['Label'].apply(lambda name: label_to_id[name])
    return cleaned



In [39]:
from sklearn.linear_model import SGDClassifier

# Load the training split. The timer starts after this read, so the reported
# time excludes loading the training CSV but includes loading the test CSV.
df_train = pd.read_csv("shuffled_1000_train.csv")
start = time.time()
df_train = preprocessing(df_train)
y_train = df_train['Label']
x_train = df_train.drop('Label', axis=1)

# NOTE(review): the features are fed to SGDClassifier unscaled; scikit-learn's
# SGD guide recommends standardizing features first — consider a
# StandardScaler pipeline and re-running the cells below.
model = SGDClassifier(random_state=0).fit(x_train, y_train)

# The test split goes through the same cleaning / label encoding.
df_test = preprocessing(pd.read_csv("shuffled_1000_test.csv"))
y_test = df_test['Label']
x_test = df_test.drop('Label', axis=1)

# Predict on both splits so train and test scores can be compared later.
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
end = time.time()

print("Processed within: ", end - start, "sec")



Processed within:  0.92728590965271 sec


In [40]:
from sklearn.metrics import accuracy_score, precision_score

# Score the predictions on the training and the test split.
# NOTE(review): for single-label multiclass targets, micro-averaged precision
# equals accuracy (the recorded output shows identical values); consider
# average='macro' or 'weighted' if a distinct metric is wanted.
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
precision_train = precision_score(y_true=y_train, y_pred=y_pred_train, average='micro')
precision_test = precision_score(y_true=y_test, y_pred=y_pred_test, average='micro')

# Each line prints the test score first, then the train score.
print("Accuracy score:", accuracy_test, " ", accuracy_train)
print("Precision:", precision_test, " ", precision_train)

Accuracy score: 0.5266666666666666   0.5171428571428571
Precision: 0.5266666666666666   0.5171428571428571
