# Step 1: Prepare Datasets

In [43]:
#disable auto save, this sometimes hangs the browser
%autosave 0
import pandas as pd
import time
from pandas.tools.plotting import scatter_matrix
import numpy
import sklearn
from sklearn import preprocessing
from sklearn_pandas import DataFrameMapper
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import operator 

# to suppress printing of exponential notation in pandas
pd.options.display.float_format = '{:20,.2f}'.format

# avoid data truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Autosave disabled


## Helper Functions

### Function to return true if array contains binary (zero and one) values only

In [44]:
def is_only_zero_and_one(array):
    """Tell whether ``array`` holds exactly two entries that are 0 and 1.

    ``array`` must support ``len()`` and positional indexing (``array[0]``,
    ``array[1]``); the two entries may appear in either order.  Anything
    with a length other than two is rejected outright.
    """
    if len(array) != 2:
        return False

    first, second = array[0], array[1]
    return (first == 0 and second == 1) or (first == 1 and second == 0)

### Function to convert categorical features into binary
#### use this in future instead: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
#### need to do this because: https://stackoverflow.com/questions/24715230/can-sklearn-random-forest-directly-handle-categorical-features

In [45]:
def convert_categorical_to_binary(data, categorical_columns):
    """Return a copy of ``data`` with label-binarized columns added.

    The input frame is not modified.  Each column named in
    ``categorical_columns`` is run through its own ``LabelBinarizer`` (via
    ``DataFrameMapper`` with ``df_out=True``) and every column of the
    mapper's output is written into the copy under the mapper's column name.
    The source categorical columns are never dropped explicitly; a mapper
    output column that shares a name with an existing column overwrites it.

    data: pandas DataFrame containing every column in ``categorical_columns``.
    categorical_columns: iterable of column names to binarize.

    Prints a sanity check comparing the expected and the actual number of
    columns, then returns the new frame.
    """
    temp_data = data.copy()

    # one (column, transformer) pair per categorical column
    label_binarizer = [(col, sklearn.preprocessing.LabelBinarizer())
                       for col in categorical_columns]

    # df_out=True: output a data frame
    mapper_df = DataFrameMapper(label_binarizer, df_out=True)
    # binarized holds only the newly generated columns
    binarized = mapper_df.fit_transform(temp_data)

    # numpy.array(...) drops the index so values are assigned positionally
    for col in binarized.columns:
        temp_data[col] = numpy.array(binarized[col])

    # Expected width: every original column plus one new column per distinct
    # value of each categorical column.
    # NOTE(review): LabelBinarizer presumably emits a single column for a
    # two-valued input, in which case this estimate over-counts - confirm.
    expected_column_count = len(data.columns)
    for col in categorical_columns:
        expected_column_count += len(data[col].unique())

    # The original message printed these two numbers in swapped positions.
    print('new column count should be ' + str(expected_column_count)
          + ' and is ' + str(len(temp_data.columns)))
    return temp_data

### Function to rescale all non-binary features between 0 and 1

In [46]:
def rescale_non_binary_columns(data, categorical_columns):
    """Return a copy of ``data`` with its non-binary columns min-max scaled.

    The input frame is not modified.  A column is left as it is when it is
    listed in ``categorical_columns`` or when its distinct values are exactly
    0 and 1 (see ``is_only_zero_and_one``).  Every other column is passed,
    one at a time, through ``MinMaxScaler.fit_transform``.
    """
    rescaled = data.copy()
    min_max = preprocessing.MinMaxScaler()

    # decide up front which columns need scaling; `data` itself never changes
    columns_to_scale = [
        name for name in data.columns
        if name not in categorical_columns
        and not is_only_zero_and_one(data[name].unique())
    ]

    for name in columns_to_scale:
        # the scaler is refitted for every column (fit_transform)
        rescaled[name] = min_max.fit_transform(rescaled[[name]])

    return rescaled

### Function to print confusion matrix

In [47]:
def print_confusion_matrix(confusion_matrix, labels):
    """Print a per-class breakdown of a square confusion matrix.

    confusion_matrix: indexable as ``confusion_matrix[row][column]`` with at
        least ``len(labels)`` rows and columns, e.g. the result of
        ``sklearn.metrics.confusion_matrix``.
    labels: display label of each row/column, in matrix order.  Labels may be
        of any type; they are converted with ``str()`` before printing, so
        integer class indices work as well as strings.

    For every row this prints the row total, the diagonal ("correct") count
    and every non-zero off-diagonal count under the label of its column.
    """
    records = len(labels)
    for row in range(records):
        counts = confusion_matrix[row]
        print("-------------" + str(labels[row]) + "-------------")
        print('total: ' + str(sum(counts[column] for column in range(records))))
        print('correct: ' + str(counts[row]))
        for column in range(records):
            # only report actual confusions, never the diagonal
            if counts[column] != 0 and row != column:
                print(str(labels[column]) + ': ' + str(counts[column]))

### Function to print summary statistics

In [48]:
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; NaN when ``whole`` is 0."""
    if whole == 0:
        return float('nan')
    # * by 1.0 to make the denominator a float, so the result is one too
    return part * 100 / (1.0 * whole)


def print_summary_statistics(confusion_matrix, normal_class_index):
    """Print normal-vs-anomalous accuracy figures for a confusion matrix.

    confusion_matrix: square, indexable as ``confusion_matrix[row][col]``.
    normal_class_index: row/column position of the class treated as
        "normal"; every other row is counted as anomalous.

    A record in an anomalous row only counts as correctly identified when it
    sits on the diagonal, i.e. it was assigned to its exact class.  If a
    group has no records at all, its percentage is printed as ``nan`` rather
    than raising ``ZeroDivisionError``.
    """
    class_label_count = len(confusion_matrix)
    total_records = 0
    total_normal = 0
    total_anomalous = 0
    total_normal_correctly_identified = 0
    total_anomalous_correctly_identified = 0

    for row in range(class_label_count):
        row_total = sum(confusion_matrix[row][col] for col in range(class_label_count))
        total_records += row_total
        if row == normal_class_index:
            total_normal += row_total
            total_normal_correctly_identified = confusion_matrix[row][row]
        else:
            total_anomalous += row_total
            total_anomalous_correctly_identified += confusion_matrix[row][row]

    total_correctly_identified = total_normal_correctly_identified + total_anomalous_correctly_identified
    correct_normal_percentage = _percentage(total_normal_correctly_identified, total_normal)
    correct_anomalous_percentage = _percentage(total_anomalous_correctly_identified, total_anomalous)
    correct_total_percentage = _percentage(total_correctly_identified, total_records)

    print('total: ' + str(total_records))
    print('normal: ' + str(total_normal))
    print('anomalous: ' + str(total_anomalous))

    print('total correctly identified: ' + str(total_correctly_identified) + '(' + str(correct_total_percentage) + '%)')
    print('normal correctly identified: ' + str(total_normal_correctly_identified) + '(' + str(correct_normal_percentage) + '%)')
    print('anomalous correctly identified: ' + str(total_anomalous_correctly_identified) + '(' + str(correct_anomalous_percentage) + '%)')
            

### Function to print F scores

In [49]:
def print_f_scores(actual_labels, predictions, unique_labels):
    """Print the micro, macro and weighted F1 scores, one per line.

    All three arguments are handed straight to ``metrics.f1_score`` (as
    ``y_true``, ``y_pred`` and ``labels=``).  See
    http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score

    micro:    metrics are computed globally by counting the total true
              positives, false negatives and false positives.
    macro:    metrics are computed per label and averaged without weights,
              so label imbalance is not taken into account.
    weighted: metrics are computed per label and averaged weighted by
              support (the number of true instances of each label).  This
              alters 'macro' to account for label imbalance; it can result
              in an F-score that is not between precision and recall.
    """
    for averaging in ('micro', 'macro', 'weighted'):
        score = metrics.f1_score(actual_labels, predictions,
                                 labels=unique_labels, average=averaging)
        print(averaging + ': ' + str(score))

# Step 2: Load train and cross validation set

In [50]:
# Read the training and cross-validation splits from CSV.
# Note: the name `cross_validation` rebinds the `sklearn.cross_validation`
# module imported at the top of the notebook; from here on it is a DataFrame.
train = pd.read_csv("/Users/haris/Desktop/kdd_datasets/train.csv")
cross_validation = pd.read_csv("/Users/haris/Desktop/kdd_datasets/cross_validation.csv")
print("csvs loaded")
print('{0} train rows'.format(len(train)))
print('{0} cross validation rows'.format(len(cross_validation)))


csvs loaded
644994 train rows
214999 cross validation rows


### Converting string class labels to int for Neural Networks

In [51]:
# Distinct class labels across both splits, in order of first appearance
# (train first, then cross validation).  Only the label columns are
# concatenated, so the full feature frames are not copied.
labels = pd.concat([train['label'], cross_validation['label']]).unique()
print(len(labels))

# Bidirectional mapping between a label and its integer index.
index_to_label_map = dict(enumerate(labels))
label_to_index_map = dict((label, index) for index, label in index_to_label_map.items())

# Replace the label column of both frames with the integer index.
# Note: this overwrites the column in place, so the cell is not idempotent -
# running it a second time derives the mapping from the already-encoded values.
train['label'] = train['label'].map(lambda item: label_to_index_map[item])
cross_validation['label'] = cross_validation['label'].map(lambda item: label_to_index_map[item])
print(train['label'].unique())
print(cross_validation['label'].unique())

23
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[ 0  2  3  5  8  6  4  7  1  9 18 14 12 15 13 10 11 19 17 16 21 20]


### Prepare train and cross validation set

In [55]:
# Split each frame into the feature matrix (everything except 'label')
# and the target vector (the 'label' column).
X_train, y_train = train.drop('label', axis=1), train['label']
X_cross_validation, y_cross_validation = (
    cross_validation.drop('label', axis=1),
    cross_validation['label'],
)

### Train several Neural Networks using different activation/alpha/hidden_layers

In [13]:
# hidden_layer_sizes: one entry per hidden layer, e.g. (10,10,10) for 3 hidden
#   layers with 10 hidden units each; () means no hidden layer and a bare int
#   is accepted by MLPClassifier as a single hidden layer of that size
# activation: relu, logistic
# alpha (regularization)
# random_state
columns_count = len(X_train.columns)
print('columns: ' + str(columns_count))
labels_count = len(labels)
print('labels: ' + str(labels_count))
# floor division keeps the neuron counts integral on both Python 2 and 3
neurons_1 = columns_count // labels_count
neurons_2 = columns_count * 2 // 3 + labels_count
hidden_layers = [(), neurons_1, neurons_2]
random_state = 19
activations = ['relu', 'logistic']
alphas = [0.0001, 0.0005, 0.001, 0.02, 0.2]

# Explicit, ascending list of class indices: dict key order is not guaranteed,
# and print_summary_statistics(..., 0) relies on row 0 being class index 0.
label_indices = sorted(index_to_label_map)

for hidden_layer in hidden_layers:
    for activation in activations:
        for alpha in alphas:
            print('------------------------------')
            start_time = time.time()
            print('hidden_layer: ' + str(hidden_layer))
            print('activation: ' + str(activation))
            print('alphas: ' + str(alpha))
            classifier = MLPClassifier(hidden_layer_sizes=hidden_layer, random_state=random_state,
                                       alpha=alpha, activation=activation)
            classifier.fit(X_train, y_train)
            predictions = classifier.predict(X_cross_validation)
            confusion_matrix = metrics.confusion_matrix(y_cross_validation, predictions, labels=label_indices)
            # class index 0 (the first label seen in the data) is treated as
            # the normal class - presumably 'normal' traffic; verify
            print_summary_statistics(confusion_matrix, 0)
            print_f_scores(y_cross_validation, predictions, label_indices)

            # the measured span covers fitting, predicting and scoring
            total_time = time.time() - start_time
            print('total prediction time: ' + str(total_time) + ' seconds')

columns: 122
labels: 23
------------------------------
hidden_layer: ()
activation: relu
alphas: 0.0001
total: 214999
normal: 162563
anomalous: 52436
total correctly identified: 214495(99.765580305%)
normal correctly identified: 162505(99.9643215246%)
anomalous correctly identified: 51990(99.1494393165%)
micro: 0.99765580305
macro: 0.495800111127
weighted: 0.997151003655
total prediction time: 46.4818029404 seconds
------------------------------
hidden_layer: ()
activation: relu
alphas: 0.0005


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


total: 214999
normal: 162563
anomalous: 52436
total correctly identified: 214405(99.7237196452%)
normal correctly identified: 162515(99.9704729859%)
anomalous correctly identified: 51890(98.9587306431%)
micro: 0.997237196452
macro: 0.457858951859
weighted: 0.996530909473
total prediction time: 42.3355970383 seconds
------------------------------
hidden_layer: ()
activation: relu
alphas: 0.001
total: 214999
normal: 162563
anomalous: 52436
total correctly identified: 214367(99.7060451444%)
normal correctly identified: 162499(99.9606306478%)
anomalous correctly identified: 51868(98.9167747349%)
micro: 0.997060451444
macro: 0.455286033162
weighted: 0.996346931121
total prediction time: 37.9416491985 seconds
------------------------------
hidden_layer: ()
activation: relu
alphas: 0.02
total: 214999
normal: 162563
anomalous: 52436
total correctly identified: 213950(99.5120907539%)
normal correctly identified: 162506(99.9649366707%)
anomalous correctly identified: 51444(98.1081699596%)
micro:

In [37]:
def _parse_result_lines(lines):
    """Turn the saved grid-search log into a list of per-run dicts.

    Runs are delimited by lines starting with a row of dashes.  Every other
    line of the form ``key: value`` becomes an entry of the current run, with
    spaces in the key replaced by underscores; the ``weighted`` value is
    converted to float, everything else stays a string.

    Lines without ``': '`` (blank lines, warning fragments) are ignored, empty
    runs are not recorded, and the final run is kept even when the log does
    not end with a separator line.
    """
    records = []
    record = {}
    for line in lines:
        if line.startswith('-------------------------'):
            if record:
                records.append(record)
            record = {}
            continue

        # split on the first ': ' only, so values may themselves contain it
        key, separator, value = line.partition(': ')
        if not separator:
            continue

        key = key.replace(' ', '_')
        value = value.replace('\n', '')
        record[key] = float(value) if key == 'weighted' else value

    # flush the last run, which has no separator line after it
    if record:
        records.append(record)
    return records


with open('/Users/haris/Desktop/NN') as f:
    content = f.readlines()

# NOTE: `list` shadows the builtin; the name is kept because the comparison
# cell further down reads the parsed runs from this variable.
list = _parse_result_lines(content)
        

### Compare accuracy of various neural networks

In [40]:
# Order the parsed runs in place by weighted F-score, ascending, so the
# best-performing configuration is printed last.
list.sort(key=operator.itemgetter('weighted'))

for item in list:
    print('--------')
    print('weighted: ' + str(item['weighted']))
    # the remaining fields were kept as strings when the log was parsed
    for field in ('hidden_layer', 'activation', 'alphas'):
        print(field + ': ' + item[field])

--------
weighted: 0.972082315961
hidden_layer: 5
activation: logistic
alphas: 0.2
--------
weighted: 0.983623682519
hidden_layer: 104
activation: logistic
alphas: 0.2
--------
weighted: 0.988387873005
hidden_layer: ()
activation: relu
alphas: 0.2
--------
weighted: 0.988387873005
hidden_layer: ()
activation: logistic
alphas: 0.2
--------
weighted: 0.990609117787
hidden_layer: 5
activation: relu
alphas: 0.2
--------
weighted: 0.991900377612
hidden_layer: 104
activation: relu
alphas: 0.2
--------
weighted: 0.992993444672
hidden_layer: 5
activation: logistic
alphas: 0.02
--------
weighted: 0.993775658956
hidden_layer: ()
activation: relu
alphas: 0.02
--------
weighted: 0.993775658956
hidden_layer: ()
activation: logistic
alphas: 0.02
--------
weighted: 0.993876658763
hidden_layer: 104
activation: logistic
alphas: 0.02
--------
weighted: 0.994377821091
hidden_layer: 5
activation: relu
alphas: 0.02
--------
weighted: 0.996346931121
hidden_layer: ()
activation: relu
alphas: 0.001
--------
w

Conclusion: smaller alpha (regularization strength, often written as lambda) values are more effective<br>
Best performing parameters: <br>hidden_layer: 104<br>activation: relu<br>alphas: 0.0001

### Load test set

In [52]:
# Read the held-out test split and encode its labels with the mapping that
# was built from the train and cross-validation labels.
test = pd.read_csv("/Users/haris/Desktop/kdd_datasets/test.csv")
print("csv loaded")
print('{0} test rows'.format(len(test)))
# a label that is missing from label_to_index_map raises KeyError here
test['label'] = test['label'].map(lambda item: label_to_index_map[item])

# features (everything except 'label') and target
X_test, y_test = test.drop('label', axis=1), test['label']

csv loaded
214999 test rows


### Train a neural network using best parameters and run on test set

In [57]:
# Refit with the best configuration found on the cross-validation set
# (hidden layer of neurons_2 units, relu, alpha=0.0001) and score it on the
# test set.  neurons_2 and random_state come from the grid-search cell.
final_classifier = MLPClassifier(hidden_layer_sizes=neurons_2, random_state=random_state,
                                 alpha=0.0001, activation='relu')
final_classifier.fit(X_train, y_train)
test_predictions = final_classifier.predict(X_test)

# Explicit, ascending list of class indices: dict key order is not guaranteed,
# and print_summary_statistics(..., 0) relies on row 0 being class index 0.
test_label_indices = sorted(index_to_label_map)
test_confusion_matrix = metrics.confusion_matrix(y_test, test_predictions, labels=test_label_indices)
print_summary_statistics(test_confusion_matrix, 0)
print_f_scores(y_test, test_predictions, test_label_indices)

total: 214999
normal: 162563
anomalous: 52436
total correctly identified: 214836(99.9241856939%)
normal correctly identified: 162532(99.98093047%)
anomalous correctly identified: 52304(99.7482645511%)
micro: 0.999241856939
macro: 0.561487757204
weighted: 0.999196020092


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
