In [1]:
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn import svm, tree
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier, RidgeCV, Lasso
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import statsmodels.api as sm


##Reading in the test and train files of network data

In [2]:
train = np.loadtxt('train_id.csv',dtype=str,delimiter=',')
test = np.loadtxt('test_id_unlabeled.csv',dtype=str,delimiter=',')


###Checking the number of columns in train and test

In [3]:
print len(train[0])
print len(test[0])

43
42


###Encoding non-numeric columns

In [4]:
le = LabelEncoder()
train[:,2] = le.fit_transform(train[:,2])
train[:,3] = le.fit_transform(train[:,3])
train[:,4] = le.fit_transform(train[:,4])

test[:,2] = le.fit_transform(test[:,2])
test[:,3] = le.fit_transform(test[:,3])
test[:,4] = le.fit_transform(test[:,4])


###Converting the labels into numbers

In [5]:
le.fit(train[:,42])
print list(le.classes_)
train[:,42]=le.transform(train[:,42])

['back', 'ipsweep', 'neptune', 'normal', 'portsweep', 'satan', 'smurf', 'teardrop', 'warezclient']


In [6]:
train =  train.astype(float)
test = test.astype(float)
train[:,0:43]
test[:,0:]

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.00000000e+00,   0.00000000e+00,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  2.86086000e+05,   0.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.86087000e+05,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.86088000e+05,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

###Standardizing the features

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(train[:,1:41])
y_train = train[:,42]
x_test = scaler.fit_transform(test[:,1:41])


###Checking different classifiers to find the best performing classifier

In [8]:
X_train, X_test,Y_train, Y_test= cross_validation.train_test_split(x_train,y_train,test_size=0.5)

# clfs=[DecisionTreeClassifier(max_depth=3),
#       RandomForestClassifier(max_depth=3, n_estimators=10, max_features=11),
#       RandomForestClassifier(max_depth=3, n_estimators=10, max_features=13),
clfs=[linear_model.SGDClassifier(),
      linear_model.SGDClassifier(loss="hinge", penalty="l2"),
      linear_model.SGDClassifier(loss="modified_huber", penalty="l2"),
      linear_model.SGDClassifier(loss="log", penalty="l2"),
      linear_model.SGDClassifier(loss="hinge", penalty="l1"),
      linear_model.SGDClassifier(loss="modified_huber", penalty="l1"),
      linear_model.SGDClassifier(loss="log", penalty="l1"),
      linear_model.SGDClassifier(loss="hinge", penalty="elasticnet"),
      linear_model.SGDClassifier(loss="modified_huber", penalty="elasticnet"),
      linear_model.SGDClassifier(loss="log", penalty="elasticnet")]

for clf in clfs:
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=10)
    print scores
    print("Accuracy: %0.7f (+/- %0.7f)" % (scores.mean(), scores.std() * 2))


[ 0.99748693  0.99760833  0.99764887  0.99724339  0.99687855  0.99744608
  0.9975676   0.99760804  0.99724306  0.99732382]
Accuracy: 0.9974055 (+/- 0.0004530)
[ 0.99732479  0.99675706  0.99768941  0.99777039  0.99781093  0.99744608
  0.99728382  0.99712154  0.9972836   0.99744546]
Accuracy: 0.9973933 (+/- 0.0006057)
[ 0.99700053  0.99748672  0.99700028  0.9970407   0.99712178  0.99752716
  0.99700004  0.99764859  0.99712143  0.99708053]
Accuracy: 0.9972028 (+/- 0.0004741)
[ 0.99708159  0.99760833  0.99716243  0.99695962  0.99691909  0.99712178
  0.99744598  0.99744588  0.99716197  0.99732382]
Accuracy: 0.9972230 (+/- 0.0004277)
[ 0.99760853  0.99805424  0.99817585  0.99805416  0.99829739  0.99821631
  0.99833786  0.9985405   0.9982161   0.99825643]
Accuracy: 0.9981757 (+/- 0.0004627)
[ 0.99825706  0.99793263  0.99772994  0.99805416  0.99793254  0.99797308
  0.99383792  0.99837833  0.99797284  0.99829698]
Accuracy: 0.9976365 (+/- 0.0025601)
[ 0.99708159  0.99813531  0.99809477  0.997243

In [None]:
### Running the best model on the training data

In [30]:
# clf = linear_model.SGDClassifier()
# clf = RandomForestClassifier(max_depth=3, n_estimators=10, max_features=17)
clf = linear_model.SGDClassifier(loss="hinge", penalty="l1")
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# glm_gauss = sm.GLM(y_train, x_train, family=sm.families.Gaussian())
# res = glm_gauss.fit()
# print(res.summary())

In [32]:
len(y_pred)

286088

In [33]:
y_pred.astype(int)

array([6, 2, 2, ..., 3, 6, 6])

In [34]:
result = list(le.inverse_transform(y_pred.astype(int)))

In [35]:
# out=open("output.csv", "wb")
j=1
results=[]
for i in range(0,286088):
    results.append([j,result[i]])
    j=j+1
  

In [36]:
import csv
out=open("output_sgdl1h.csv", "wb")
header = ["ID", "Class"]


# for i in result:
#     out.writerow(i) 
    
a = csv.writer(out, delimiter=',')
a.writerow(header)
a.writerows(results)
out.close()