In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import Image
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("compete.csv")

In [3]:
train_df = train_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);
test_df = test_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);

In [4]:
train_df = pd.get_dummies(train_df, columns=['protocol_type'])
test_df = pd.get_dummies(test_df, columns=['protocol_type'])

In [5]:
from sklearn import preprocessing

cat_cols = ['service', 'flag']
for col in cat_cols:
    if col in train_df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train_df[col].astype(str).values) + list(test_df[col].astype(str).values))
        train_df[col] = le.transform(list(train_df[col].astype(str).values))
        test_df[col] = le.transform(list(test_df[col].astype(str).values))   

In [6]:
numerical_features = list(train_df.columns[train_df.dtypes != object].values[:-1])
categorical_features = list(train_df.columns[train_df.dtypes == object].values)

corr_table = train_df.corr()
triu = corr_table.where(np.triu(np.ones(corr_table.shape) ,k=1).astype(np.bool))
to_drop = [feat for feat in triu.columns if any(triu[feat] > 0.95)]

train_df = train_df.drop(to_drop, axis=1)

for feat in to_drop:
    if feat in categorical_features:
        categorical_features.remove(feat)
    else:
        numerical_features.remove(feat)

print(f'\nFeatures dropped: {to_drop}')
# plt.figure(figsize=(50, 30))
# _ = sns.heatmap(corr_table, annot=True, fmt='.2f')


Features dropped: ['num_root', 'srv_serror_rate', 'srv_rerror_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp']


In [7]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

X = train_df.drop('class', axis=1)
y = train_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
lgb_model = lgb.LGBMClassifier(bagging_fraction=0.8446682014322044, feature_fraction= 0.35294922284424557,
 learning_rate= 0.5683390979599586,
 max_bin= 21,
 max_depth= 28,
 min_data_in_leaf= 77,
 min_sum_hessian_in_leaf= 0.2693920974409014,
 num_leaves= 35,
 subsample= 0.2095972365619459,
 objective= 'binary',
 metric= 'auc',
 is_unbalance = True,
 boost_from_average = False)
lgb_model.fit(X_train, y_train)
pred1 = lgb_model.predict(X_test)



In [9]:
from sklearn.model_selection import KFold

kfold = 15
skf = KFold(n_splits=kfold)
pred2 = pd.DataFrame()
pred2['class'] = np.zeros(len(X_test))

In [10]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

#Instantiate CatBoostClassifier
cbc = CatBoostClassifier()

#create the grid
grid = {'max_depth': [3,4,5],'n_estimators':[100, 200, 300], 'learning_rate': [0.005, 0.001, 0.05, 0.01, 0.1]}

#Instantiate GridSearchCV
gscv = GridSearchCV (estimator = cbc, param_grid = grid, scoring ='accuracy', cv = 5)

#fit the model
gscv.fit(X,y)

#returns the estimator with the best performance
print(gscv.best_estimator_)

#returns the best score
print(gscv.best_score_)

#returns the best parameters
print(gscv.best_params_)

0:	learn: 0.6726406	total: 187ms	remaining: 18.5s
1:	learn: 0.6579079	total: 215ms	remaining: 10.5s
2:	learn: 0.6401551	total: 243ms	remaining: 7.87s
3:	learn: 0.6218689	total: 270ms	remaining: 6.48s
4:	learn: 0.6091719	total: 296ms	remaining: 5.62s
5:	learn: 0.5919561	total: 321ms	remaining: 5.03s
6:	learn: 0.5800906	total: 347ms	remaining: 4.61s
7:	learn: 0.5623245	total: 373ms	remaining: 4.29s
8:	learn: 0.5464373	total: 399ms	remaining: 4.03s
9:	learn: 0.5350705	total: 421ms	remaining: 3.79s
10:	learn: 0.5233801	total: 444ms	remaining: 3.59s
11:	learn: 0.5084946	total: 466ms	remaining: 3.42s
12:	learn: 0.4989954	total: 489ms	remaining: 3.27s
13:	learn: 0.4891586	total: 510ms	remaining: 3.13s
14:	learn: 0.4742057	total: 533ms	remaining: 3.02s
15:	learn: 0.4615273	total: 555ms	remaining: 2.91s
16:	learn: 0.4483782	total: 580ms	remaining: 2.83s
17:	learn: 0.4392750	total: 603ms	remaining: 2.75s
18:	learn: 0.4267576	total: 625ms	remaining: 2.66s
19:	learn: 0.4186653	total: 646ms	remaini

68:	learn: 0.1592080	total: 1.61s	remaining: 723ms
69:	learn: 0.1571699	total: 1.63s	remaining: 700ms
70:	learn: 0.1533167	total: 1.65s	remaining: 676ms
71:	learn: 0.1498581	total: 1.68s	remaining: 652ms
72:	learn: 0.1464957	total: 1.7s	remaining: 628ms
73:	learn: 0.1444064	total: 1.72s	remaining: 604ms
74:	learn: 0.1416849	total: 1.74s	remaining: 580ms
75:	learn: 0.1396544	total: 1.76s	remaining: 556ms
76:	learn: 0.1383005	total: 1.78s	remaining: 533ms
77:	learn: 0.1368166	total: 1.81s	remaining: 510ms
78:	learn: 0.1342750	total: 1.83s	remaining: 486ms
79:	learn: 0.1323265	total: 1.85s	remaining: 463ms
80:	learn: 0.1298671	total: 1.87s	remaining: 439ms
81:	learn: 0.1283432	total: 1.89s	remaining: 416ms
82:	learn: 0.1270266	total: 1.92s	remaining: 393ms
83:	learn: 0.1250727	total: 1.94s	remaining: 369ms
84:	learn: 0.1237500	total: 1.96s	remaining: 346ms
85:	learn: 0.1221853	total: 1.98s	remaining: 323ms
86:	learn: 0.1203702	total: 2.01s	remaining: 300ms
87:	learn: 0.1183931	total: 2.03

33:	learn: 0.2994552	total: 792ms	remaining: 1.54s
34:	learn: 0.2939431	total: 814ms	remaining: 1.51s
35:	learn: 0.2885893	total: 836ms	remaining: 1.49s
36:	learn: 0.2832300	total: 857ms	remaining: 1.46s
37:	learn: 0.2785650	total: 878ms	remaining: 1.43s
38:	learn: 0.2717750	total: 900ms	remaining: 1.41s
39:	learn: 0.2674867	total: 922ms	remaining: 1.38s
40:	learn: 0.2605307	total: 946ms	remaining: 1.36s
41:	learn: 0.2562356	total: 968ms	remaining: 1.34s
42:	learn: 0.2521801	total: 990ms	remaining: 1.31s
43:	learn: 0.2482156	total: 1.01s	remaining: 1.29s
44:	learn: 0.2447067	total: 1.03s	remaining: 1.26s
45:	learn: 0.2407539	total: 1.05s	remaining: 1.24s
46:	learn: 0.2366833	total: 1.08s	remaining: 1.21s
47:	learn: 0.2326626	total: 1.1s	remaining: 1.19s
48:	learn: 0.2291309	total: 1.12s	remaining: 1.16s
49:	learn: 0.2256471	total: 1.14s	remaining: 1.14s
50:	learn: 0.2226108	total: 1.16s	remaining: 1.11s
51:	learn: 0.2194067	total: 1.18s	remaining: 1.09s
52:	learn: 0.2143792	total: 1.21

99:	learn: 0.1011877	total: 2.32s	remaining: 0us
0:	learn: 0.6726406	total: 26.9ms	remaining: 5.34s
1:	learn: 0.6579079	total: 49.8ms	remaining: 4.93s
2:	learn: 0.6401551	total: 74ms	remaining: 4.86s
3:	learn: 0.6218689	total: 96.9ms	remaining: 4.75s
4:	learn: 0.6091719	total: 121ms	remaining: 4.72s
5:	learn: 0.5919561	total: 143ms	remaining: 4.62s
6:	learn: 0.5800906	total: 165ms	remaining: 4.54s
7:	learn: 0.5623245	total: 187ms	remaining: 4.48s
8:	learn: 0.5464373	total: 209ms	remaining: 4.42s
9:	learn: 0.5350705	total: 233ms	remaining: 4.42s
10:	learn: 0.5233801	total: 255ms	remaining: 4.39s
11:	learn: 0.5084946	total: 277ms	remaining: 4.34s
12:	learn: 0.4989954	total: 300ms	remaining: 4.32s
13:	learn: 0.4891586	total: 322ms	remaining: 4.28s
14:	learn: 0.4742057	total: 343ms	remaining: 4.24s
15:	learn: 0.4615273	total: 369ms	remaining: 4.25s
16:	learn: 0.4483782	total: 395ms	remaining: 4.25s
17:	learn: 0.4392750	total: 419ms	remaining: 4.24s
18:	learn: 0.4267576	total: 444ms	remaini

KeyboardInterrupt: 

In [None]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier(max_depth=5, n_estimators=300, learning_rate=0.05)
cat_model.fit(X_train, y_train)
pred3 = cat_model.predict(X_test)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )

random_search.fit(X, y)

In [None]:
random_search.best_params_

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

xgb_model = xgb.XGBClassifier(subsample= 0.6, min_child_weight = 1, max_depth = 5, gamma = 1.5, colsample_bytree = 0.8)

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    X_train_s, X_valid = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_s, y_valid = y_train.iloc[train_index], y_train.iloc[test_index]
    xgb_model.fit(X_train_s, y_train_s, eval_set=[(X_train_s, y_train_s), (X_valid, y_valid)], eval_metric='auc', early_stopping_rounds=100)
    vtest = xgb_model.predict(X_test)
    pred2['class'] += vtest/kfold

In [None]:
test_id = test_df.Id.values
test_df = test_df.drop("Id", axis=1)

In [None]:
test_df = test_df.drop(to_drop, axis=1)

In [None]:
test_pred_lgm = lgb_model.predict(test_df)
test_pred_xgb = xgb_model.predict(test_df)
test_pred_cat = cat_model.predict(test_df)

In [None]:
stack_valid = np.column_stack((pred1, pred2['class'], pred3))
test_pred = np.column_stack((test_pred_lgm, test_pred_xgb, test_pred_cat))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
                              random_state=0)

distributions = dict(C=uniform(loc=0, scale=4),
                     penalty=['l2', 'l1'])

clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(X, y)
search.best_params_

In [None]:
from sklearn.linear_model import LogisticRegression

stack_model = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, C = 2.195254015709299, penalty = 'l1')
stack_model.fit(stack_valid, y_test)
test_stack_model = stack_model.predict(test_pred)

In [None]:
submit = pd.DataFrame({'Id': test_id, 'class':test_stack_model})
submit.to_csv('test.csv', index=False)