# CSC 215 - MiniProject 3
## Khoi Hoang

In [1]:
import os
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import sklearn.feature_extraction.text as sk_text
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib.pyplot import figure, show
%matplotlib inline

In [2]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)    

# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()    
    
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Plot an ROC. pred - the predictions, y - the expected output.
def plot_roc(pred,y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

In [3]:
df = pd.read_csv('network_intrusion_data.csv')
df.columns = [
 'duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'outcome'
]

In [4]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [5]:
df.shape

(494020, 42)

In [6]:
# df = df.loc[:, (df != 0).any(axis=0)] #drop columns with all 0
unique_val = df.apply(pd.Series.nunique)
cols_to_drop = unique_val[unique_val==1].index
df = df.drop(cols_to_drop, axis=1)

In [7]:
df.shape

(494020, 40)

In [8]:
np.unique(df['outcome'].values,return_counts=True)

(array(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.',
        'imap.', 'ipsweep.', 'land.', 'loadmodule.', 'multihop.',
        'neptune.', 'nmap.', 'normal.', 'perl.', 'phf.', 'pod.',
        'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.',
        'warezclient.', 'warezmaster.'], dtype=object),
 array([  2203,     30,      8,     53,     12,   1247,     21,      9,
             7, 107201,    231,  97277,      3,      4,    264,   1040,
            10,   1589, 280790,      2,    979,   1020,     20]))

In [9]:
# binary encode the outcome, 0 is good/normal and 1 is an attack
df['outcome'] = df['outcome'].apply(lambda x: '0' if x == 'normal.' else '1')

In [10]:
np.unique(df['outcome'].values,return_counts=True)

(array(['0', '1'], dtype=object), array([ 97277, 396743]))

In [11]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [13]:
df.drop_duplicates(keep='first', inplace=True)

In [16]:
df.shape

(145584, 40)

In [20]:
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate                float64
rerror_rate                    float64
srv_rerror_rate          

In [28]:
#from sklearn.preprocessing import StandardScaler

scaler = preprocessing.StandardScaler()

cols_to_normalize = df.select_dtypes(include=['int64', 'float64']).columns
numeric_features = scaler.fit_transform(df.select_dtypes(include=['float64','int64']))

numeric_features = pd.DataFrame(numeric_features, columns=cols_to_normalize)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [48]:
numeric_features.shape

(145584, 36)

In [64]:
scaler = preprocessing.LabelEncoder()

cols_to_encode = df.select_dtypes(include=['object']).columns
label_features = df.select_dtypes(include=['object']).apply(scaler.fit_transform)

label_features = pd.DataFrame(label_features, columns=cols_to_encode)

In [65]:
label_features.head()

Unnamed: 0,protocol_type,service,flag,outcome
0,1,22,9,0
1,1,22,9,0
2,1,22,9,0
3,1,22,9,0
4,1,22,9,0


In [66]:
label_features.drop(columns=['outcome'], inplace=True)

In [67]:
label_features.shape

(145584, 3)

In [68]:
numeric_features.reset_index(drop=True, inplace=True)
label_features.reset_index(drop=True, inplace=True)
input_data = pd.concat([numeric_features, label_features], axis=1)
input_data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type,service,flag
0,-0.107851,-0.004261,-0.039036,-0.011722,-0.084394,-0.004737,-0.07021,-0.018022,1.017168,-0.007905,...,-0.417559,-0.177588,-0.315312,-0.644435,-0.641711,-0.36088,-0.353944,1,22,9
1,-0.107851,-0.004263,-0.025042,-0.011722,-0.084394,-0.004737,-0.07021,-0.018022,1.017168,-0.007905,...,-0.417559,-0.260536,-0.315312,-0.644435,-0.641711,-0.36088,-0.353944,1,22,9
2,-0.107851,-0.004272,-0.025042,-0.011722,-0.084394,-0.004737,-0.07021,-0.018022,1.017168,-0.007905,...,-0.417559,-0.260536,-0.315312,-0.644435,-0.641711,-0.36088,-0.353944,1,22,9
3,-0.107851,-0.004273,-0.013613,-0.011722,-0.084394,-0.004737,-0.07021,-0.018022,1.017168,-0.007905,...,-0.417559,-0.302011,-0.315312,-0.644435,-0.641711,-0.36088,-0.353944,1,22,9
4,-0.107851,-0.004273,-0.013613,-0.011722,-0.084394,-0.004737,-0.07021,-0.018022,1.017168,-0.007905,...,-0.417559,-0.302011,-0.315312,-0.644435,-0.641711,-0.36088,-0.353944,1,22,9


In [69]:
output_data = df['outcome']
output_data.shape

(145584,)

In [71]:
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.25, random_state=43)

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Initiate and train all the models
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

svc = LinearSVC(dual=False)
svc.fit(x_train, y_train)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)

gnb = GaussianNB()
gnb.fit(x_train, y_train)



GaussianNB(priors=None, var_smoothing=1e-09)

In [76]:
y_pred_Logistic = logreg.predict(x_test)
logistic_accuracy = metrics.accuracy_score(y_test, y_pred_Logistic)
print("Accuracy score: {}".format(logistic_accuracy))

Accuracy score: 0.9879107594241126


In [77]:
y_svc = svc.predict(x_test)
svc_accuracy = metrics.accuracy_score(y_test, y_svc)
print("Accuracy score: {}".format(svc_accuracy))

Accuracy score: 0.9901912298054731
