# Modeling

The models we tried include random forest, AdaBoost, XGBoost, and LightGBM. We skip the hyperparameter tuning and model selection steps here and present only the final model we chose: a soft-voting ensemble that averages the predicted probabilities of LightGBM and XGBoost with weights of 6 and 4, respectively.

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.ensemble import VotingClassifier
from mlxtend.classifier import StackingCVClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score

In [None]:
# Read the two CSV files (named *_clean.csv) into DataFrames; the first path
# is bound to `train`, the second to `test`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_clean.csv', '../data/test_clean.csv')
)

In [None]:
# Keyword arguments forwarded to LGBMClassifier(**lgbm_params).
#
# Two keys were previously misspelled ('min_child_weigh', 'learing_rate').
# LGBMClassifier accepts arbitrary **kwargs, so the misspelled keys were not
# rejected and the intended values never took effect. Correcting them changes
# the fitted model, so any previously reported scores should be re-validated.
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_boost_round': 800,
    # NOTE(review): LightGBM documents 'colsample_bytree' as an alias of
    # 'feature_fraction' and 'subsample' as an alias of 'bagging_fraction',
    # yet each pair is given two different values here (0.321 vs 0.9 and
    # 0.50 vs 0.8) - confirm which value is intended and drop the other.
    'feature_fraction': .321,
    # NOTE(review): no bagging frequency is set in this dict; per the LightGBM
    # docs bagging is only active when bagging_freq > 0 - confirm.
    'bagging_fraction': 0.50,
    'min_child_samples': 100,
    'min_child_weight': 35,  # fixed spelling (was 'min_child_weigh')
    # NOTE(review): with num_leaves=2 every tree has a single split, so the
    # max_depth=3 limit looks unreachable - confirm this is intentional.
    'max_depth': 3,
    'num_leaves': 2,
    'learning_rate': 0.15,  # fixed spelling (was 'learing_rate')
    'reg_alpha': 5,
    'reg_lambda': 1.1,
    'metric': 'auc',
    'max_bin': 52,
    'colsample_bytree': 0.9,
    'subsample': 0.8,
    'is_unbalance': 'true',
}

# Keyword arguments forwarded to XGBClassifier(**xgb_params).
# NOTE(review): recent XGBoost releases document `verbosity` and
# `random_state` in place of `silent` and `seed` - confirm against the
# installed version before renaming anything.
xgb_params = dict(
    max_depth=3,
    learning_rate=0.05,
    n_estimators=160,
    silent=True,
    objective='binary:logistic',
    gamma=0.3,
    min_child_weight=5,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.785,
    colsample_bylevel=1,
    reg_alpha=0.01,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=1440,
    missing=None,
)

# Instantiate the two base learners from their hyperparameter dicts.
lgbm, xgb = LGBMClassifier(**lgbm_params), XGBClassifier(**xgb_params)

# Soft-voting ensemble of the two learners, with weight 6 on the LightGBM
# model and weight 4 on the XGBoost model.
avg_mod = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgb', xgb)],
    weights=[6, 4],
    voting='soft',
)

# NOTE(review): X_train, y_train and X_test are not defined in the cells
# shown here (only `train` and `test` are loaded) - presumably they are
# derived from those frames elsewhere; confirm before running top to bottom.
avg_mod.fit(X_train.values, y_train.values)
y_preds = avg_mod.predict_proba(X_test.values)