## Pipeline using Backward feature selection

In [47]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import Image
warnings.filterwarnings('ignore')

In [48]:
train_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("compete.csv")

In [49]:
train_df = train_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);
test_df = test_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);

In [50]:
train_df = pd.get_dummies(train_df, columns=['protocol_type'])
test_df = pd.get_dummies(test_df, columns=['protocol_type'])

In [51]:
from sklearn import preprocessing

cat_cols = ['service', 'flag']
for col in cat_cols:
    if col in train_df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train_df[col].astype(str).values) + list(test_df[col].astype(str).values))
        train_df[col] = le.transform(list(train_df[col].astype(str).values))
        test_df[col] = le.transform(list(test_df[col].astype(str).values))   

In [52]:
numerical_features = list(train_df.columns[train_df.dtypes != object].values[:-1])
categorical_features = list(train_df.columns[train_df.dtypes == object].values)

corr_table = train_df.corr()
triu = corr_table.where(np.triu(np.ones(corr_table.shape) ,k=1).astype(np.bool))
to_drop = [feat for feat in triu.columns if any(triu[feat] > 0.95)]

train_df = train_df.drop(to_drop, axis=1)

for feat in to_drop:
    if feat in categorical_features:
        categorical_features.remove(feat)
    else:
        numerical_features.remove(feat)

print(f'\nFeatures dropped: {to_drop}')
# plt.figure(figsize=(50, 30))
# _ = sns.heatmap(corr_table, annot=True, fmt='.2f')


Features dropped: ['num_root', 'srv_serror_rate', 'srv_rerror_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp']


In [53]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
y = train_df['class']

### BACKWARD Feature Selection

In [54]:
import statsmodels.api as sm

X=sm.add_constant(X)
model=sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,class,R-squared:,0.944
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,182800.0
Date:,"Sun, 16 May 2021",Prob (F-statistic):,0.0
Time:,14:13:33,Log-Likelihood:,327140.0
No. Observations:,345815,AIC:,-654200.0
Df Residuals:,345782,BIC:,-653900.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3456,0.009,40.554,0.000,0.329,0.362
duration,-1.073e-05,2.7e-07,-39.760,0.000,-1.13e-05,-1.02e-05
service,-0.0001,3.08e-05,-3.476,0.001,-0.000,-4.67e-05
flag,0.0777,0.001,87.368,0.000,0.076,0.079
src_bytes,1.76e-07,2.74e-09,64.159,0.000,1.71e-07,1.81e-07
dst_bytes,1.394e-07,4.86e-09,28.708,0.000,1.3e-07,1.49e-07
land,0.2644,0.024,11.168,0.000,0.218,0.311
wrong_fragment,0.2846,0.001,226.823,0.000,0.282,0.287
urgent,0.0075,0.026,0.282,0.778,-0.044,0.059

0,1,2,3
Omnibus:,288605.372,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36982337.638
Skew:,3.385,Prob(JB):,0.0
Kurtosis:,53.208,Cond. No.,10300000.0


### Remove the columns with P value greater than 0.05

In [55]:
X=X.drop(['urgent'],axis=1)
model=sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,class,R-squared:,0.944
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,188700.0
Date:,"Sun, 16 May 2021",Prob (F-statistic):,0.0
Time:,14:13:34,Log-Likelihood:,327140.0
No. Observations:,345815,AIC:,-654200.0
Df Residuals:,345783,BIC:,-653900.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3456,0.009,40.563,0.000,0.329,0.362
duration,-1.073e-05,2.7e-07,-39.760,0.000,-1.13e-05,-1.02e-05
service,-0.0001,3.08e-05,-3.478,0.001,-0.000,-4.68e-05
flag,0.0777,0.001,87.371,0.000,0.076,0.079
src_bytes,1.76e-07,2.74e-09,64.159,0.000,1.71e-07,1.81e-07
dst_bytes,1.395e-07,4.86e-09,28.716,0.000,1.3e-07,1.49e-07
land,0.2644,0.024,11.168,0.000,0.218,0.311
wrong_fragment,0.2846,0.001,226.823,0.000,0.282,0.287
hot,0.0728,0.000,191.997,0.000,0.072,0.074

0,1,2,3
Omnibus:,288612.641,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36985573.29
Skew:,3.385,Prob(JB):,0.0
Kurtosis:,53.21,Cond. No.,10100000.0


In [56]:
X=X.drop(['num_file_creations'],axis=1)
model=sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,class,R-squared:,0.944
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,195000.0
Date:,"Sun, 16 May 2021",Prob (F-statistic):,0.0
Time:,14:13:35,Log-Likelihood:,327140.0
No. Observations:,345815,AIC:,-654200.0
Df Residuals:,345784,BIC:,-653900.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3456,0.009,40.566,0.000,0.329,0.362
duration,-1.074e-05,2.69e-07,-39.922,0.000,-1.13e-05,-1.02e-05
service,-0.0001,3.08e-05,-3.497,0.000,-0.000,-4.73e-05
flag,0.0777,0.001,87.371,0.000,0.076,0.079
src_bytes,1.76e-07,2.74e-09,64.169,0.000,1.71e-07,1.81e-07
dst_bytes,1.395e-07,4.86e-09,28.714,0.000,1.3e-07,1.49e-07
land,0.2644,0.024,11.169,0.000,0.218,0.311
wrong_fragment,0.2846,0.001,226.825,0.000,0.282,0.287
hot,0.0728,0.000,192.050,0.000,0.072,0.074

0,1,2,3
Omnibus:,288600.933,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36998039.917
Skew:,3.384,Prob(JB):,0.0
Kurtosis:,53.218,Cond. No.,10100000.0


In [57]:
X=X.drop(['num_shells'],axis=1)
model=sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,class,R-squared:,0.944
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,201700.0
Date:,"Sun, 16 May 2021",Prob (F-statistic):,0.0
Time:,14:13:37,Log-Likelihood:,327140.0
No. Observations:,345815,AIC:,-654200.0
Df Residuals:,345785,BIC:,-653900.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3456,0.009,40.567,0.000,0.329,0.362
duration,-1.074e-05,2.69e-07,-39.908,0.000,-1.13e-05,-1.02e-05
service,-0.0001,3.08e-05,-3.479,0.001,-0.000,-4.68e-05
flag,0.0777,0.001,87.377,0.000,0.076,0.079
src_bytes,1.76e-07,2.74e-09,64.179,0.000,1.71e-07,1.81e-07
dst_bytes,1.395e-07,4.86e-09,28.728,0.000,1.3e-07,1.49e-07
land,0.2645,0.024,11.172,0.000,0.218,0.311
wrong_fragment,0.2846,0.001,226.836,0.000,0.282,0.287
hot,0.0728,0.000,192.045,0.000,0.072,0.074

0,1,2,3
Omnibus:,288592.896,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36990521.341
Skew:,3.384,Prob(JB):,0.0
Kurtosis:,53.213,Cond. No.,10100000.0


In [58]:
X = X.drop(['const'], axis=1)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [60]:
import lightgbm as lgb

clf = lgb.LGBMClassifier(bagging_fraction=0.8446682014322044, feature_fraction= 0.35294922284424557,
 learning_rate= 0.5683390979599586,
 max_bin= 21,
 max_depth= 28,
 min_data_in_leaf= 77,
 min_sum_hessian_in_leaf= 0.2693920974409014,
 num_leaves= 35,
 subsample= 0.2095972365619459,
 objective= 'binary',
 metric= 'auc',
 is_unbalance = True,
 boost_from_average = False)
clf.fit(X_train, y_train)



LGBMClassifier(bagging_fraction=0.8446682014322044, boost_from_average=False,
               feature_fraction=0.35294922284424557, is_unbalance=True,
               learning_rate=0.5683390979599586, max_bin=21, max_depth=28,
               metric='auc', min_data_in_leaf=77,
               min_sum_hessian_in_leaf=0.2693920974409014, num_leaves=35,
               objective='binary', subsample=0.2095972365619459)

In [61]:
preds = clf.predict(X_test)

acc_lgbm = (preds == y_test).sum().astype(float) / len(preds)*100

print("LGBM Classifier prediction accuracy is: %3.2f" % (acc_lgbm))

LGBM Classifier prediction accuracy is: 99.98


In [62]:
from sklearn.metrics import roc_auc_score

# Calculate roc auc
roc_value = roc_auc_score(y_test, preds)
roc_value

0.9996512180612531

In [63]:
test_df = test_df.drop(to_drop, axis=1)
test_id = test_df.Id.values
test_df = test_df.drop("Id", axis=1)

In [64]:
test_df = test_df.drop(['num_shells', 'num_file_creations', 'urgent'], axis=1)

In [66]:
preds = clf.predict(test_df)
preds_lgbm = clf.predict(test_df)

In [69]:
submit = pd.DataFrame({'Id': test_id, 'class':preds})
submit.to_csv('backward_lgbm.csv', index=False)