# Libraries for the Project

In [1]:
import os

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error, roc_curve, classification_report,auc)


# Reading the Dataset

In [2]:
# Directory that holds KDDTrain+.csv and KDDTest+.csv.  Set the KDD_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
DATA_DIR = os.environ.get("KDD_DATA_DIR", "/Users/jeremyperez/Jupyter/Dataset")

# Load both comma-separated files into pandas DataFrames.  The `with` blocks
# guarantee the file handles are closed even if read_csv raises part-way
# through (the previous explicit close() calls were skipped on error).
with open(os.path.join(DATA_DIR, "KDDTrain+.csv"), "r") as trainData_handler:
    trainData = pd.read_csv(trainData_handler, sep=",")

with open(os.path.join(DATA_DIR, "KDDTest+.csv"), "r") as testData_handler:
    testData = pd.read_csv(testData_handler, sep=",")

# Converting String Values to Unique Numbers

In [3]:
# Integer codes for the four string-valued columns.  Each category must own a
# distinct code, otherwise different values become indistinguishable once the
# columns are encoded.
protocol = {'tcp': 0, 'udp': 1, 'icmp': 2}

service = {
    'http': 0, 'domain_u': 1, 'sunrpc': 2, 'smtp': 3, 'ecr_i': 4,
    'iso_tsap': 5, 'private': 6, 'finger': 7, 'ftp': 8, 'telnet': 9,
    'other': 10, 'discard': 11, 'courier': 12, 'pop_3': 13, 'ldap': 14,
    'eco_i': 15, 'ftp_data': 16, 'klogin': 17, 'auth': 18, 'mtp': 19,
    'name': 20, 'netbios_ns': 21, 'remote_job': 22, 'supdup': 23,
    'uucp_path': 24, 'Z39_50': 25, 'csnet_ns': 26, 'uucp': 27,
    'netbios_dgm': 28, 'urp_i': 29, 'domain': 30, 'bgp': 31, 'gopher': 32,
    'vmnet': 33, 'systat': 34, 'http_443': 35, 'efs': 36, 'whois': 37,
    'imap4': 38, 'echo': 39, 'link': 40, 'login': 41, 'kshell': 42,
    'sql_net': 43, 'time': 44, 'hostnames': 45, 'exec': 46, 'ntp_u': 47,
    'nntp': 48, 'ctf': 49, 'ssh': 50, 'daytime': 51, 'shell': 52,
    'netstat': 53, 'nnsp': 54, 'IRC': 55, 'pop_2': 56, 'printer': 57,
    'tim_i': 58, 'pm_dump': 59, 'red_i': 60, 'netbios_ssn': 61, 'rje': 62,
    'X11': 63, 'urh_i': 64, 'http_8001': 65, 'aol': 66, 'http_2784': 67,
    'tftp_u': 68, 'harvest': 69,
}

server_error = {
    'REJ': 0, 'SF': 1, 'S0': 2, 'RSTR': 3, 'RSTO': 4, 'SH': 5,
    'S1': 6, 'RSTOS0': 7, 'S3': 8, 'S2': 9, 'OTH': 10,
}

# Fixes relative to the previous table:
#   * 'back' was listed twice (5 and 10).  A dict keeps the last value, so 10
#     is what was actually applied; that value is kept and code 5 stays unused.
#   * 'warezmaster' shared code 21 with 'buffer_overflow' and 'phf' shared
#     code 22 with 'imap'.  They now get the next free codes (39 and 40) so
#     every other attack keeps the number it had before.
attacks = {
    'normal': 0, 'neptune': 1, 'warezclient': 2, 'ipsweep': 3, 'mscan': 4,
    'smurf': 6, 'mailbomb': 7, 'apache2': 8, 'rootkit': 9, 'back': 10,
    'satan': 11, 'processtable': 12, 'guess_passwd': 13, 'saint': 14,
    'portsweep': 15, 'teardrop': 16, 'nmap': 17, 'pod': 18, 'ftp_write': 19,
    'multihop': 20, 'buffer_overflow': 21, 'imap': 22, 'land': 23,
    'loadmodule': 24, 'spy': 25, 'perl': 26, 'snmpgetattack': 27,
    'httptunnel': 28, 'ps': 29, 'snmpguess': 30, 'named': 31, 'sendmail': 32,
    'xterm': 33, 'worm': 34, 'xlock': 35, 'xsnoop': 36, 'sqlattack': 37,
    'udpstorm': 38, 'warezmaster': 39, 'phf': 40,
}

# Column name -> code table, in the order the columns are encoded.
_CATEGORY_CODES = {
    'Protocol': protocol,
    'Service': service,
    'Server_error': server_error,
    'Attacks': attacks,
}

# Guard against the table regressing: two categories must never share a code.
for _column, _codes in _CATEGORY_CODES.items():
    assert len(set(_codes.values())) == len(_codes), \
        "duplicate integer code in the %s table" % _column


def _encode_categoricals(frame):
    """Replace the string categories of ``frame`` with their integer codes.

    The Protocol, Service, Server_error and Attacks columns are overwritten
    in place, so later cells see the numeric version of the frame.

    Raises:
        KeyError: if a column holds a value that has no entry in its code
            table (this also happens when the cell is run a second time on
            an already encoded frame).
    """
    for column, codes in _CATEGORY_CODES.items():
        unknown = set(frame[column].unique()) - set(codes)
        if unknown:
            raise KeyError("no code defined for %s value(s): %s"
                           % (column, sorted(map(str, unknown))))
        frame[column] = frame[column].map(codes)


# Encode the train and the test set with the same tables.
_encode_categoricals(trainData)
_encode_categoricals(testData)

# Preparing the Data and Training the Logistic Regression Model

In [None]:
# Split both frames by position: column 0 is used as the target and
# columns 1-41 as the features.
# NOTE(review): assumes column 0 holds a binary 0/1 label - the evaluation
# cells score with average="binary"; confirm against the CSV header.
X = trainData.iloc[:,1:42]
Y = trainData.iloc[:,0]
C = testData.iloc[:,0]
T = testData.iloc[:,1:42]

# Normalizer rescales every sample (row) to unit norm on its own, so nothing
# is learned from the data and fitting it on the test frame leaks nothing.
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
scaler = Normalizer().fit(T)
testT = scaler.transform(T)

# Plain NumPy arrays consumed by every model in the cells below.
traindata = np.array(trainX)
trainlabel = np.array(Y)
testdata = np.array(testT)
testlabel = np.array(C)

# Fit the logistic regression once.  The model used to be created and fitted
# twice in a row with identical arguments, which only doubled the run time.
model = LogisticRegression()
model.fit(traindata, trainlabel)
model



In [None]:
# Seed shared by the stochastic models so a fresh run reproduces the numbers.
RANDOM_STATE = 42

# np.savetxt does not create missing directories.
os.makedirs('res', exist_ok=True)


def _rate(hits, total):
    """Return hits / total as a float, or NaN when the row is empty."""
    return float(hits) / total if total else float('nan')


def evaluate_classifier(estimator, tag):
    """Score a fitted binary classifier on the test split and print a report.

    Predictions are written to ``res/predicted<tag>.txt``.  Precision, recall
    and F1 use ``average="binary"``, i.e. scikit-learn's default positive
    label 1, and the rates below follow the same convention.

    scikit-learn lays the confusion matrix out with true labels as rows and
    predictions as columns, in sorted label order, so for 0/1 labels
    ``cm = [[TN, FP], [FN, TP]]``.  The earlier code reported ``cm[0][0]/row0``
    as tpr (that is the specificity) and ``cm[1][1]/row1`` as fpr (that is the
    recall); both are corrected here.

    Args:
        estimator: fitted model exposing ``predict``.
        tag: short model name used in the output file name.

    Returns:
        dict with the tag and the accuracy, precision, recall, f1, tpr and
        fpr values.
    """
    expected = testlabel
    predicted = estimator.predict(testdata)
    np.savetxt('res/predicted%s.txt' % tag, predicted, fmt='%01d')

    accuracy = accuracy_score(expected, predicted)
    recall = recall_score(expected, predicted, average="binary")
    precision = precision_score(expected, predicted, average="binary")
    f1 = f1_score(expected, predicted, average="binary")

    cm = metrics.confusion_matrix(expected, predicted)
    tpr = _rate(cm[1][1], np.sum(cm[1]))   # TP / (TP + FN)
    fpr = _rate(cm[0][1], np.sum(cm[0]))   # FP / (FP + TN)

    print(estimator)
    print(cm)
    print("Accuracy")
    print("%.3f" % accuracy)
    print("precision")
    print("%.3f" % precision)
    print("recall")
    print("%.3f" % recall)
    print("f-score")
    print("%.3f" % f1)
    print("fpr")
    print("%.3f" % fpr)
    print("tpr")
    print("%.3f" % tpr)
    print("***************************************************************")

    return {'model': tag, 'accuracy': accuracy, 'precision': precision,
            'recall': recall, 'f1': f1, 'tpr': tpr, 'fpr': fpr}


# `model` is the logistic regression fitted in the training cell.  It is not
# rebound below, so re-running this cell cannot store another model's
# predictions under the LR file name.
results = [evaluate_classifier(model, 'LR')]

# Remaining classifiers, evaluated in the same order as before.
classifiers = [
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('ABoost', AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE)),
    ('RF', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
]

for tag, estimator in classifiers:
    estimator.fit(traindata, trainlabel)
    results.append(evaluate_classifier(estimator, tag))

# One row per model for a side-by-side comparison.
results_summary = pd.DataFrame(results).set_index('model')
print(results_summary)



# Hyper-parameter grid for the SVM: an RBF kernel over gamma x C and a linear
# kernel over C.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Each entry is tuned separately using its macro-averaged scorer.
scores = ['precision', 'recall']
X_train = traindata
y_train = trainlabel
X_test = testdata
y_test = testlabel

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    # 5-fold cross-validated search; the best estimator is refitted on the
    # whole training set (GridSearchCV default) and used by clf.predict below.
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print("----------------------------------------------")
    # These are the mean CV values of the metric being tuned, not accuracy.
    print("cross-validation %s_macro scores of train data set" % score)
    print(means)

    print("----------------------------------------------")
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    # Computed once and reused for both the printout and the rates below.
    cm = confusion_matrix(y_true, y_pred)
    print("confusion matrix")
    print(cm)
    print("Classification report")
    print(classification_report(y_true, y_pred))
    print()
    print("***************************************************************************")
    print("for now")
    print("accuracy score")
    print(accuracy_score(y_true, y_pred))
    print("precision")
    print(precision_score(y_true, y_pred , average="binary"))
    print("recall")
    print(recall_score(y_true, y_pred , average="binary"))
    print("F-score")
    print(f1_score(y_true, y_pred , average="binary"))
    print("best parameters")
    print(clf.best_params_)
    print("***************************************************************************")
    predicted = y_pred
    expected = y_true
    print("==============================================")
    print(cm)
    # Rows are true labels and columns are predictions, so with label 1 as the
    # positive class (the convention of average="binary") cm is
    # [[TN, FP], [FN, TP]].  The previous formulas printed specificity as tpr
    # and recall as fpr.
    tpr = float(cm[1][1])/np.sum(cm[1])
    fpr = float(cm[0][1])/np.sum(cm[0])
    print("tpr")
    print("%.3f" %tpr)
    print("fpr")
    print("%.3f" %fpr)


# KMeans

In [None]:
# Cluster the encoded training frame into 20 groups.  The seed fixes the
# centroid initialisation so the cluster labels are the same on every run.
# NOTE(review): trainData is passed whole, i.e. including column 0 (used as
# the target above) and the Attacks column - confirm the labels are meant to
# take part in the clustering.
kmeans = KMeans(n_clusters = 20, random_state = 42)

KMmodel = kmeans.fit(trainData)

# Cluster index assigned to each training row.
KMmodel.labels_

# Analyzing Results

In [None]:
# Count how many training rows of each encoded attack type fall into each
# KMeans cluster.  The original call had no arguments and did not parse.
# NOTE(review): attack-vs-cluster is the presumed intent - confirm.
pd.crosstab(trainData.Attacks, KMmodel.labels_,
            rownames=['attack'], colnames=['cluster'])