In [18]:
import numpy as np
import pandas as pd
import os
os.chdir(r'/home/xavient/wns_hack')

#machine learning libraries
from bayes_opt import BayesianOptimization
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, recall_score, roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

iter_no = 5
gp_params = {'alpha': 1e-5}
cv_splits = 8



In [7]:

def treesCV(eta, gamma,max_depth,min_child_weight,subsample,colsample_bytree,n_estimators):
    #function for cross validation gradient boosted trees
    return cross_val_score(xgb.XGBRegressor(objective='binary:logistic',
    											tree_method = 'hist',
                                                learning_rate=max(eta,0),
                                                gamma=max(gamma,0),
                                                max_depth=int(max_depth),
                                                min_child_weight=int(min_child_weight),
                                                silent=True,
                                                subsample=max(min(subsample,1),0.0001),
                                                colsample_bytree=max(min(colsample_bytree,1),0.0001),
                                                n_estimators=int(n_estimators),
                                                seed=42,nthread=-1), X=X_train, y=y_train, scoring=None, cv=cv_splits, n_jobs=-1).mean()




def data_prep(data_df):

    #how to handle types
    data_df_num = data_df.select_dtypes(exclude=object)
    data_df_obj = data_df.select_dtypes(include=object)

    #how to handle nan
    data_df_num = data_df_num.fillna(data_df_num.mean())

    #get dummy variables
    data_df_obj = data_df_obj.fillna("UNKNOWN")
    data_df_obj = pd.get_dummies(data_df_obj, dummy_na=True)

    data_concat = pd.concat([data_df_num, data_df_obj],axis=1)

    return data_concat.drop(['department_nan','region_region_18','region_nan','education_nan',
                             'gender_nan','recruitment_channel_nan','region_region_10','region_region_12',
                             'region_region_24', 'region_region_31', 'region_region_33'],axis=1) 


In [8]:

# reading data
data_train = pd.read_csv('train.csv', sep=',',encoding="ISO-8859-1")
data_train = data_prep(data_train)

data_pred = pd.read_csv('test.csv', sep=',',encoding="ISO-8859-1")
data_pred = data_prep(data_pred)




In [45]:
#train test split doesnt actually split
X_train, X_test, y_train, y_test = train_test_split(np.array(data_train.drop(['is_promoted','employee_id'],axis=1)), np.array(data_train['is_promoted']), test_size=0, random_state=42)
X_test1 = data_pred.drop(['employee_id'],axis=1)




In [47]:

#Bayesian Hyper parameter optimization of gradient boosted trees
treesBO = BayesianOptimization(treesCV,{'eta':(0.10,0.16),
                                        'gamma':(10,13),
                                        'max_depth':(550,560),
                                        'min_child_weight':(0.4,0.7),
                                        'subsample':(0.4,0.6),
                                        'colsample_bytree':(0.50,0.55),
                                        'n_estimators':(1600,1800)})
treesBO.maximize(n_iter=iter_no, **gp_params)
tree_best = treesBO.res['max']




[31mInitialization[0m
[94m--------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |       eta |     gamma |   max_depth |   min_child_weight |   n_estimators |   subsample | 
    1 | 01m08s | [35m   0.29041[0m | [32m            0.5117[0m | [32m   0.1253[0m | [32m  11.3192[0m | [32m   555.6915[0m | [32m            0.4675[0m | [32m     1726.6769[0m | [32m     0.4378[0m | 
    2 | 01m18s | [35m   0.29305[0m | [32m            0.5400[0m | [32m   0.1161[0m | [32m  10.3380[0m | [32m   558.0828[0m | [32m            0.6239[0m | [32m     1625.5646[0m | [32m     0.5675[0m | 
    3 | 01m22s |    0.29296 |             0.5327 |    0.1099 |   10.4878 |    555.4355 |             0.4020 |      1771.1170 |      0.5459 | 
    4 | 01m25s |    0.29268 |             0.5115 |    0.1242 |   10.7298 |    556.6395 |             0.4478 |     

In [48]:
tree_best

{'max_params': {'colsample_bytree': 0.5400425501321214,
  'eta': 0.1161050737052063,
  'gamma': 10.337991942759661,
  'max_depth': 558.0827955992903,
  'min_child_weight': 0.6239178646164122,
  'n_estimators': 1625.5645743297848,
  'subsample': 0.5674805908320731},
 'max_val': 0.2930468350310664}

In [49]:
#train tree with best paras
trees_model = xgb.XGBRegressor(objective='binary:logistic',
                                tree_method = 'hist',
                                seed=42,
                                learning_rate=max(tree_best['max_params']['eta'],0),
                                gamma=max(tree_best['max_params']['gamma'],0),
                                max_depth=int(tree_best['max_params']['max_depth']),
                                min_child_weight=int(tree_best['max_params']['min_child_weight']),
                                silent=True,
                                subsample=max(min(tree_best['max_params']['subsample'],1),0.0001),
                                colsample_bytree=max(min(tree_best['max_params']['colsample_bytree'],1),0.0001),
                                n_estimators=int(tree_best['max_params']['n_estimators']),nthread=-1)
trees_model.fit(X_train, y_train)
y_hat1 = trees_model.predict(np.array(X_test1))



In [51]:
yhat=np.where(y_hat1>=.5,1,0)

In [31]:
#train test split doesnt actually split
X_train, X_test, y_train, y_test = train_test_split(np.array(data_train.drop(['is_promoted','employee_id'],axis=1)), np.array(data_train['is_promoted']), test_size=0.2, random_state=42)
X_test1 = data_pred.drop(['employee_id'],axis=1)

In [32]:

def run_lgb(train_X, train_y, val_X, val_y):
    params = {
    "objective" : "binary",
    "metric" : "auc",
    "num_leaves" : 40,
    "learning_rate" : 0.005,
    "bagging_fraction" : 0.6,
    "feature_fraction" : 0.6,
    "bagging_frequency" : 6,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "seed": 42
    }

    lgtrain = lgb.Dataset(train_X, label = train_y)
    lgval = lgb.Dataset(val_X, label = val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, valid_sets = [lgtrain, lgval], early_stopping_rounds = 100, 
              verbose_eval = 150, evals_result = evals_result)
    return model, evals_result


In [33]:

print("LGBM performance")
model_lgbm, evals_result = run_lgb(X_train, y_train, X_test, y_test)

print("LightGBM Training Completed...")
print("################################################")
y_hat2 = model_lgbm.predict(np.array(X_test1))


LGBM performance
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[39]	training's auc: 0.910793	valid_1's auc: 0.92234
LightGBM Training Completed...
################################################


In [40]:
yhats = (y_hat1+y_hat2)

In [41]:
yhat=np.where(yhats>=.5,1,0)

In [52]:

submission = pd.DataFrame({'employee_id':np.array(data_pred['employee_id']),'is_promoted':yhat})

#write to file for submission
submission.to_csv('submission.csv',sep=',', index=False)

In [17]:
submission.head()

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [13]:
trees_model.feature_importances_

array([0.02543787, 0.08590492, 0.06422018, 0.06338616, 0.03753128,
       0.01834862, 0.24812344, 0.02418682, 0.02293578, 0.01709758,
       0.01084237, 0.02919099, 0.03294412, 0.01793161, 0.03211009,
       0.02251877, 0.00333611, 0.00959133, 0.00375313, 0.00542118,
       0.0058382 , 0.00458716, 0.00750626, 0.00333611, 0.00792327,
       0.00375313, 0.0058382 , 0.01959967, 0.00708924, 0.00667223,
       0.00750626, 0.00542118, 0.0087573 , 0.00750626, 0.00333611,
       0.00333611, 0.00625521, 0.00625521, 0.01209341, 0.0029191 ,
       0.00417014, 0.01334445, 0.00333611, 0.0058382 , 0.00959133,
       0.00250209, 0.00708924, 0.00792327, 0.01292744, 0.00625521,
       0.00333611, 0.00959133, 0.00375313], dtype=float32)

In [14]:
data_train.drop(['is_promoted','employee_id'],axis=1).columns

Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'region_region_1', 'region_region_11',
       'region_region_13', 'region_region_14', 'region_region_15',
       'region_region_16', 'region_region_17', 'region_region_19',
       'region_region_2', 'region_region_20', 'region_region_21',
       'region_region_22', 'region_region_23', 'region_region_25',
       'region_region_26', 'region_region_27', 'region_region_28',
       'region_region_29', 'region_region_3', 'region_region_30',
       'region_region_32', 'region_region_34', 'region_region_4',
       'region_region_5', 'region_region_6', 'region_region_7',
       'region_region_8', 'region_region_9', 'education