In [24]:
import pandas as pd
import numpy as np
from datetime import datetime
import requests
import warnings
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

from IPython.display import Image
warnings.filterwarnings('ignore')

In [25]:
train_df = pd.read_csv("dev.csv")
test_df = pd.read_csv("compete.csv")

In [26]:
train_df = train_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);
test_df = test_df.drop(['is_host_login', 'num_outbound_cmds'], axis=1);

In [27]:
train_df = pd.get_dummies(train_df, columns=['protocol_type'])
test_df = pd.get_dummies(test_df, columns=['protocol_type'])

In [28]:
from sklearn import preprocessing

cat_cols = ['service', 'flag']
for col in cat_cols:
    if col in train_df.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train_df[col].astype(str).values) + list(test_df[col].astype(str).values))
        train_df[col] = le.transform(list(train_df[col].astype(str).values))
        test_df[col] = le.transform(list(test_df[col].astype(str).values))   

In [29]:
numerical_features = list(train_df.columns[train_df.dtypes != object].values[:-1])
categorical_features = list(train_df.columns[train_df.dtypes == object].values)

corr_table = train_df.corr()
triu = corr_table.where(np.triu(np.ones(corr_table.shape) ,k=1).astype(np.bool))
to_drop = [feat for feat in triu.columns if any(triu[feat] > 0.95)]

train_df = train_df.drop(to_drop, axis=1)

for feat in to_drop:
    if feat in categorical_features:
        categorical_features.remove(feat)
    else:
        numerical_features.remove(feat)

print(f'\nFeatures dropped: {to_drop}')
# plt.figure(figsize=(50, 30))
# _ = sns.heatmap(corr_table, annot=True, fmt='.2f')


Features dropped: ['num_root', 'srv_serror_rate', 'srv_rerror_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp']


In [30]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
y = train_df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [48]:
import xgboost as xgb
xgb_regressor = xgb.XGBRegressor(random_state=42)

In [51]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
# Classifier
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'min_child_weight': (0, 5),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = 10,
    verbose = 0,
    refit = True,
    random_state = 42
)

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))


TypeError: __init__() got an unexpected keyword argument 'iid'

In [38]:
# Fit the model
result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)

In [41]:
xgb_bo

TypeError: list indices must be integers or slices, not str

In [39]:
params = xgb_bo.res['max']['max_params']
params['max_depth'] = int(params['max_depth'])

TypeError: list indices must be integers or slices, not str

In [None]:
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_opt = xgb.XGBRegressor(learning_rate=0.01,
                           n_estimators=6000,
                           max_depth=4,
                           min_child_weight=0,
                           gamma=0.6,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           objective='reg:squarederror',
                           nthread=-1,
                           scale_pos_weight=1,
                           seed=27,
                           reg_alpha=0.00006,
                           random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

In [None]:
from lightgbm import LGBMRegressor

lightgbm = LGBMRegressor(objective='regression', 
                         num_leaves=6,
                         learning_rate=0.01, 
                         n_estimators=7000,
                         max_bin=200, 
                         bagging_fraction=0.8,
                         bagging_freq=4, 
                         bagging_seed=8,
                         feature_fraction=0.2,
                         feature_fraction_seed=8,
                         min_sum_hessian_in_leaf = 11,
                         verbose=-1,
                         random_state=42)

In [None]:
rf_regressor = RandomForestRegressor(random_state=42)
cv_sets = ShuffleSplit(random_state = 4) # shuffling our data for cross-validation
parameters = {'n_estimators':range(5, 950, 5), 
              'min_samples_leaf':range(20, 40, 5), 
              'max_depth':range(3, 5, 1)}
scorer = make_scorer(mean_squared_error)
n_iter_search = 10
grid_obj = RandomizedSearchCV(rf_regressor, 
                              parameters, 
                              n_iter = n_iter_search, 
                              scoring = scorer, 
                              cv = cv_sets,
                              random_state= 99)
grid_fit = grid_obj.fit(X_train, y_train)
rf_opt = grid_fit.best_estimator_

print("best params: " + str(grid_fit.best_estimator_))
print("best params: " + str(grid_fit.best_params_))
print('best score:', grid_fit.best_score_)

In [None]:
rf_reg = RandomForestRegressor(n_estimators=100, 
                               random_state=7)
rf_opt = RandomForestRegressor(n_estimators=1200,
                               max_depth=15,
                               min_samples_split=5,
                               min_samples_leaf=5,
                               max_features=None,
                               oob_score=True,
                               random_state=42)

In [None]:
rf_imp = RandomForestRegressor(n_estimators=1200,
                               max_depth=15,
                               min_samples_split=5,
                               min_samples_leaf=5,
                               max_features=None,
                               oob_score=True,
                               random_state=42)
rf_imp.fit(X_train, y_train)
importances = rf_imp.feature_importances_
df_param_coeff = pd.DataFrame(columns=['Feature', 'Coefficient'])
for i in range(len(X_train.columns)-1):
    feat = X_train.columns[i]
    coeff = importances[i]
    df_param_coeff.loc[i] = (feat, coeff)
df_param_coeff.sort_values(by='Coefficient', ascending=False, inplace=True)
df_param_coeff = df_param_coeff.reset_index(drop=True)
print("Top 10 features:\n{}".format(df_param_coeff.head(10)))

importances = rf_imp.feature_importances_
indices = np.argsort(importances)[::-1] # Sort feature importances in descending order
names = [X_train.columns[i] for i in indices] # Rearrange feature names so they match the sorted feature importances
plt.figure(figsize=(15, 7)) # Create plot
plt.title("Top 10 Most Important Features") # Create plot title
plt.bar(range(10), importances[indices][:10]) # Add bars
plt.xticks(range(10), names[:10], rotation=90) # Add feature names as x-axis labels
#plt.bar(range(X_train.shape[1]), importances[indices]) # Add bars
#plt.xticks(range(X_train.shape[1]), names, rotation=90) # Add feature names as x-axis labels
plt.show()

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV

kfolds = KFold(n_splits=5, shuffle=True, random_state=7)
rcv_alphas = np.arange(14, 16, 0.1)
ridge = RidgeCV(alphas=rcv_alphas, 
                cv=kfolds)

In [None]:
from sklearn.svm import SVR

svr = SVR(C= 20, 
          epsilon= 0.008, 
          gamma=0.0003)

In [None]:
stack_gen = StackingCVRegressor(regressors=(gbr,
                                            xgb_opt,
                                            lightgbm,
                                            rf_opt,
                                            ridge, 
                                            svr),
                                meta_regressor=xgb_opt,
                                use_features_in_secondary=False)

In [None]:
print('Fitting models to the training data:')

print('xgboost....')
xgb_model_full_data = xgb_opt.fit(X, y)
print('GradientBoosting....')
gbr_model_full_data = gbr.fit(X, y)
print('lightgbm....')
lgb_model_full_data = lightgbm.fit(X, y)
print('RandomForest....')
rf_model_full_data = rf_opt.fit(X, y)
print('Ridge....')
ridge_model_full_data = ridge.fit(X, y)
print('SVR....')
svr_model_full_data = svr.fit(X, y)
print('Stacking Regression....')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))


In [None]:
def blend_models_predict(X):
    return ((0.25 * stack_gen_model.predict(np.array(X))) + \
            (0.25 * gbr_model_full_data.predict(X)) + \
            (0.15 * svr_model_full_data.predict(X)) + \
            (0.15 * lgb_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X))+ \
            (0.05 * xgb_model_full_data.predict(X)) + \
            (0.05 * rf_model_full_data.predict(X)) 
           )

In [None]:
test_df = test_df.drop(to_drop, axis=1)
test_id = test_df.Id.values
test_df = test_df.drop("Id", axis=1)

In [None]:
# Generate predictions from the blend
y_pred_final = np.floor(np.expm1(blend_models_predict(test_df)))

In [None]:
submit = pd.DataFrame({'Id': test_id, 'class':preds})
submit.to_csv('kmeans_lgbm.csv', index=False)