## Support Vector Machine with different parameters

In [24]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import Image
warnings.filterwarnings('ignore')

In [25]:
train_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("compete.csv")

In [26]:
train_df = train_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);
test_df = test_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);

In [27]:
train_df = pd.get_dummies(train_df, columns=['protocol_type'])
test_df = pd.get_dummies(test_df, columns=['protocol_type'])

In [28]:
from sklearn import preprocessing

cat_cols = ['service', 'flag', 'protocol_type']
for col in cat_cols:
    if col in train_df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train_df[col].astype(str).values) + list(test_df[col].astype(str).values))
        train_df[col] = le.transform(list(train_df[col].astype(str).values))
        test_df[col] = le.transform(list(test_df[col].astype(str).values))   

In [29]:
numerical_features = list(train_df.columns[train_df.dtypes != object].values[:-1])
categorical_features = list(train_df.columns[train_df.dtypes == object].values)

corr_table = train_df.corr()
triu = corr_table.where(np.triu(np.ones(corr_table.shape) ,k=1).astype(np.bool))
to_drop = [feat for feat in triu.columns if any(triu[feat] > 0.95)]

train_df = train_df.drop(to_drop, axis=1)

for feat in to_drop:
    if feat in categorical_features:
        categorical_features.remove(feat)
    else:
        numerical_features.remove(feat)

print(f'\nFeatures dropped: {to_drop}')


Features dropped: ['num_root', 'srv_serror_rate', 'srv_rerror_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp']


In [30]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
y = train_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

* Gamma parameter **auto**

In [31]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc.fit(X_train, y_train)

preds = svc.predict(X_test)

acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100

print("MLP Classifier prediction accuracy is: %3.2f" % (acc_svc))

MLP Classifier prediction accuracy is: 99.92


* Gamma parameter **scale**

In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svc = make_pipeline(StandardScaler(), SVC(gamma='scale'))
svc.fit(X_train, y_train)

preds = svc.predict(X_test)

acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100

print("MLP Classifier prediction accuracy is: %3.2f" % (acc_svc))

MLP Classifier prediction accuracy is: 99.92


* Gamma parameter **auto** and kernel **linear**

### SVC with Standard Scaler for standardize the values

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svc = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel='linear'))
svc.fit(X_train, y_train)

preds = svc.predict(X_test)

acc_svc = (preds == y_test).sum().astype(float) / len(preds)*100

print("MLP Classifier prediction accuracy is: %3.2f" % (acc_svc))

MLP Classifier prediction accuracy is: 99.69


In [34]:
from sklearn.metrics import roc_auc_score

# Calculate roc auc
roc_value = roc_auc_score(y_test, preds)
roc_value

0.9962656776516654

In [35]:
test_id = test_df.Id.values
test_df = test_df.drop("Id", axis=1)

In [36]:
test_df = test_df.drop(to_drop, axis=1)

In [37]:
preds_SVC = svc.predict(test_df)

In [38]:
submit = pd.DataFrame({'Id': test_id, 'class': preds_SVC})
submit.to_csv('SVC.csv', index=False)