# Load

In [1]:
"""
Load dataset
"""
import pandas as pd
import numpy as np
# Location of the KDD training set on disk.
path_to_file = "~/Desktop/new.KDD.train.csv"

# Read the CSV and discard the 'Unnamed: 0' column in a single chained step.
data = pd.read_csv(path_to_file).drop('Unnamed: 0', axis=1)

# Preview the first two rows.
data.head(2)

Unnamed: 0,duration,aol,auth,bgp,courier,csnet_ns,ctf,daytime,discard,domain,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome.response
0,0,0,0,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0


# Split Prep

In [2]:
import sklearn.cross_validation as cv
# Every column except the last is a feature; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# 70% of the rows go to the training split; random_state is fixed so the
# partition is the same on every run.
x_train, x_test, y_train, y_test = cv.train_test_split(
    X, y, train_size=0.7, random_state=0
)

In [3]:
import sklearn.metrics as met

def get_error(x_train, y_train, x_test, y_test, model, show=True):
    """Fit ``model`` on the training split and return its error on both splits.

    The error is defined as ``1 - model.score(X, y)``, so its meaning follows
    whatever ``model.score`` reports.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``model.fit``
        and then to ``model.score``.
    x_test, y_test : held-out features and labels passed to ``model.score``.
    model : object exposing ``fit(X, y)`` and ``score(X, y)``.
        It is fitted in place, so the caller's object is modified.
    show : bool, default True
        When true, both error values are printed with five decimals.

    Returns
    -------
    list
        ``[train_error, test_error]``.
    """
    model.fit(x_train, y_train)
    train_error = 1 - model.score(x_train, y_train)
    test_error = 1 - model.score(x_test, y_test)
    # TODO: AUC reporting is not implemented. The earlier sketch passed (X, y)
    # to met.roc_auc_score, which expects (y_true, y_score) instead.
    if show:
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("The training error is: %.5f" % train_error)
        print("The test     error is: %.5f" % test_error)
    return [train_error, test_error]

# Logistic Regression

In [5]:
import sklearn.linear_model as lm

In [11]:
# L1-penalised logistic regression scored by accuracy with plain 5-fold CV.
# Only one C candidate (1e5) is supplied, so there is no grid to choose from.
logit = lm.LogisticRegressionCV(
    Cs=[1e5],
    cv=5,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=100,
    n_jobs=-1,
    random_state=0
)
get_error(x_train, y_train, x_test, y_test, logit)

The training error is: 0.12424
The test     error is: 0.12476


[0.12424445175264509, 0.1247618543607113]

In [12]:
# Same single-C logistic regression, but the folds come from a shuffled,
# stratified 5-fold split built on the training labels.
stratify_divide = cv.StratifiedKFold(
    y=y_train,
    n_folds=5,
    shuffle=True,
    random_state=0
)
logit = lm.LogisticRegressionCV(
    Cs=[1e5],
    cv=stratify_divide,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=100,
    n_jobs=-1,
    random_state=0
)
get_error(x_train, y_train, x_test, y_test, logit)

The training error is: 0.12424
The test     error is: 0.12471


[0.12424445175264509, 0.12470893310753595]

In [13]:
# L1-penalised logistic regression with plain 5-fold CV, this time searching
# 100 log-spaced C candidates between 1e-5 and 1e5.
logit = lm.LogisticRegressionCV(
    Cs=np.logspace(-5, 5, 100),
    cv=5,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=100,
    n_jobs=-1,
    random_state=0
)
get_error(x_train, y_train, x_test, y_test, logit)

The training error is: 0.12367
The test     error is: 0.12381


[0.12366609587099264, 0.12380927180355628]

In [None]:
# Search 100 log-spaced C candidates (1e-5 .. 1e5) using a shuffled,
# stratified 5-fold split of the training labels.
stratify_divide = cv.StratifiedKFold(
    y=y_train,
    n_folds=5,
    shuffle=True,
    random_state=0
)
logit = lm.LogisticRegressionCV(
    Cs=np.logspace(-5, 5, 100),
    cv=stratify_divide,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=100,
    n_jobs=-1,
    random_state=0
)
get_error(x_train, y_train, x_test, y_test, logit)

# Ridge Classification

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html

In [None]:
# Ridge classifier tuned over 100 log-spaced alphas (1e-2 .. 1e5), scored by
# accuracy on a shuffled, stratified 5-fold split of the training labels.
stratify_divide = cv.StratifiedKFold(
    y=y_train,
    n_folds=5,
    shuffle=True,
    random_state=0
)
ridge = lm.RidgeClassifierCV(
    alphas=np.logspace(-2, 5, 100),
    fit_intercept=True,
    normalize=True,
    scoring='accuracy',
    cv=stratify_divide
)
get_error(x_train, y_train, x_test, y_test, ridge)

In [None]:
# Inspect the fitted ridge classifier.
# RidgeClassifierCV only sets `cv_values_` when it runs its built-in scheme
# (cv=None together with store_cv_values=True). `ridge` is constructed with an
# explicit cv object, so the attribute may be missing; fall back to None
# rather than raising AttributeError.
# Single-argument print(...) gives identical output on Python 2 and 3.
print(getattr(ridge, 'cv_values_', None))
print(ridge.coef_)
print(ridge.intercept_)
print(ridge.alpha_)

# Lasso Classification