# Generic code for machine learning competitions in Python
---
## Load libraries
### Additional things
1. Remove warnings
2. Pandas maximum columns display = 1000
3. Matplotlib inline

In [None]:
import pandas as pd
import math
import numpy as np
import warnings
import matplotlib as mpl
# Silence every warning for the rest of the session. This keeps the notebook
# output clean, but it also hides deprecation notices from the libraries used
# further down.
warnings.filterwarnings('ignore')
# Render matplotlib figures at 500 dpi.
mpl.rcParams['figure.dpi'] = 500
# Let pandas display up to 1000 columns.
pd.set_option('display.max_columns', 1000)
# IPython magic (not plain Python): draw matplotlib figures inline.
%matplotlib inline

***
## Data Preprocessing

In [None]:
def get_missing(data):
    """Count the missing values in every column of ``data``.

    Args:
        data: Object exposing ``isnull()`` (a pandas DataFrame in this
            notebook).

    Returns:
        The per-column sums of the null mask, i.e. one count per column.
    """
    null_mask = data.isnull()
    # Summing the boolean mask down the rows (axis 0) counts the nulls.
    return null_mask.sum(axis=0)

In [None]:
def get_incident_rate(data, target = 'target'):
    """Return the relative frequency of each value in the ``target`` column.

    Args:
        data: DataFrame to inspect.
        target: Name of the column holding the class labels.

    Returns:
        The normalised ``value_counts`` of ``data[target]``, or ``None``
        (after printing a message) when the column is absent.
    """
    if target not in data.columns:
        print('No "' + target + '" column in your data')
        return None
    class_column = data[target]
    return class_column.value_counts(normalize=True)

In [None]:
def combine_train_test(train, test, target = 'target'):
    """Stack ``train`` and ``test`` into one frame for joint pre-processing.

    The ``target`` column is removed from the training rows and a
    ``train_flag`` column (1 for training rows, 0 for test rows) is added so
    the two parts can be separated again later.

    Neither input frame is modified: the work is done on copies, so the
    function can be called repeatedly on the same frames.

    Args:
        train: Training DataFrame; must contain the ``target`` column.
        test: Test DataFrame.
        target: Name of the target column to remove from ``train``.

    Returns:
        A new DataFrame with the training rows followed by the test rows and
        a fresh 0..n-1 index, or ``None`` (after printing a message) when
        ``train`` has no ``target`` column.
    """
    if target not in train.columns:
        print('No "' + target + '" column in your data')
        return None

    # Honour the `target` argument rather than a hard-coded column name.
    train_part = train.drop(columns=[target]).assign(train_flag=1)
    test_part = test.assign(train_flag=0)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    return pd.concat([train_part, test_part], ignore_index=True)

In [None]:
def divide_train_test(data, target):
    """Separate a combined frame back into its train and test parts.

    Args:
        data: DataFrame carrying a ``train_flag`` column (1 = train row,
            0 = test row).
        target: Values stored in a new ``'target'`` column on the train part.

    Returns:
        ``{'train': ..., 'test': ...}``; neither frame keeps the
        ``train_flag`` column.
    """
    flag = data['train_flag']
    train = data[flag == 1].drop(columns='train_flag')
    test = data[flag == 0].drop(columns='train_flag')
    # Re-attach the target values to the training rows only.
    train['target'] = target
    return {'train': train, 'test': test}

In [None]:
def get_ind_dep_cols(columns, target = 'target', drop_cols = None):
    """Split column names into model features and the target name.

    Args:
        columns: Iterable of column names.
        target: Name of the dependent column; it is left out of the features.
        drop_cols: Optional iterable of further column names to leave out.

    Returns:
        ``{'independent': [...], 'dependent': target}`` where the list keeps
        the order of ``columns``.
    """
    # Exclude the column named by `target`, not a hard-coded 'target'.
    # The default is None rather than a shared mutable list.
    excluded = [target]
    if drop_cols is not None:
        excluded.extend(drop_cols)
    feature_names = [col for col in columns if col not in excluded]
    return {'independent': feature_names, 'dependent': target}

***
## Train test Split

In [None]:
from sklearn.model_selection import train_test_split

# Keep 70% of the rows for training. Stratifying on the target and fixing the
# seed are meant to give a class-balanced, repeatable split.
# `train` and `target` must already exist in the session.
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    target,
    train_size=0.7,
    stratify=target,
    random_state=294056,
)

***
## Modelling
### Model 1 - XGBoost

In [None]:
def multi_f1(pred, dtrain):
    """Weighted F1 score packaged as a ``(name, value)`` evaluation result.

    Args:
        pred: Predictions, handed to ``f1_score`` unchanged.
            NOTE(review): this presumably requires class labels rather than
            probabilities - confirm against the objective in use.
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The tuple ``('fscore', <weighted F1>)``.
    """
    # No import of f1_score is visible anywhere in this file, so bring it
    # into scope here to avoid a NameError when the metric is called.
    from sklearn.metrics import f1_score

    label = dtrain.get_label()
    fs = f1_score(label, pred, average = 'weighted')
    return 'fscore', fs

In [None]:
import xgboost as xgb
# Parameter dictionary handed to xgb.train() further down.
# The entries that are commented out are optional settings left disabled.
params = {'objective':'binary:logistic',
          'learning_rate': 0.05,
          'reg_alpha' : 5.0,
          'gamma' : 5.0,
          'random_state': 294056,  # same seed as the train/validation split
          'eval_metric' : 'auc',
          # 'colsample_bytree': 0.7,
          # 'subsample': 0.8,
          # 'max_depth': 10,
          # 'min_child_weight': 11,
          # 'missing': -999
         }

In [None]:
# Wrap the feature columns (and labels, where available) in DMatrix objects.
# `feature_names` and `test` must already exist in the session.
dtrain = xgb.DMatrix(data=X_train[feature_names], label=y_train)
dvalid = xgb.DMatrix(data=X_valid[feature_names], label=y_valid)
dtest = xgb.DMatrix(data=test[feature_names])
# (matrix, name) pairs that are evaluated during training.
watchlist = [(dtrain, 'train'),(dvalid, 'eval')]

In [None]:
# Upper bound on boosting rounds; early stopping may end training sooner.
nrounds = 1000
early_stopping_rounds = 40
# Pass the round count and the watchlist by keyword. Current XGBoost accepts
# `evals` as a keyword only, and the keyword form works on older releases too.
clf1 = xgb.train(
    params,
    dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    maximize=True,  # the configured eval metric (AUC) is better when larger
    verbose_eval=20,
    early_stopping_rounds=early_stopping_rounds,
)

#### Plot and check importance of features

In [None]:
# Plot the feature importances of the trained XGBoost model.
xgb.plot_importance(clf1)

In [None]:
from numpy import array
# Per-feature scores reported by the trained booster.
imp_vals = clf1.get_fscore()
total = sum(imp_vals.values())
# Express every score as a percentage of the overall total.
imp_vals_fs = [(score / total) * 100 for score in imp_vals.values()]
imp_vals = pd.DataFrame({'cols' : list(imp_vals), 'fscore' : imp_vals_fs})
# Display the ten features with the largest share.
imp_vals.sort_values('fscore', ascending=False).head(10)

#### Predict on the validation set

In [None]:
# Score the validation matrix with the trained XGBoost model.
pred1 = clf1.predict(dvalid)

### Model 2 - LightGBM

In [None]:
import lightgbm as lgb

In [None]:
def multi_f1(pred, data):
    """Weighted F1 score as a ``(name, value, is_higher_better)`` triple.

    Args:
        pred: Per-class scores. Either a flat array holding all scores of
            class 0 first, then class 1, and so on, or an array that is
            already shaped ``(n_rows, n_classes)``.
        data: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        ``('fscore', <weighted F1>, True)``.

    Raises:
        ValueError: If a flat ``pred`` cannot be split into at least two
            equally sized per-class groups matching the number of labels.
    """
    # No import of f1_score is visible anywhere in this file, so bring it
    # into scope here to avoid a NameError when the metric is called.
    from sklearn.metrics import f1_score

    label = data.get_label()
    n_rows = len(label)
    scores = np.asarray(pred)

    if scores.ndim == 1:
        # Infer the class count from the sizes instead of hard-coding 4.
        if n_rows == 0 or scores.size % n_rows != 0 or scores.size // n_rows < 2:
            raise ValueError(
                'cannot arrange %d scores into per-class columns for %d labels'
                % (scores.size, n_rows)
            )
        # The original passed the integer 1 as `order`. Older NumPy releases
        # appear to have read a truthy non-string as Fortran order; current
        # releases reject it, so the order is spelled out as 'F' (class-major).
        scores = scores.reshape((n_rows, -1), order='F')

    # Predicted class = column with the highest score in each row.
    pred = np.argmax(scores, axis = 1)
    fs = f1_score(label, pred, average = 'weighted')
    return 'fscore', fs, True

In [None]:
# Create the LightGBM datasets from the same train/validation split.
lgb_train = lgb.Dataset(X_train[feature_names], y_train)
# The validation set is tied to the training set through `reference`.
lgb_valid = lgb.Dataset(X_valid[feature_names], y_valid, reference = lgb_train)

In [None]:
# LightGBM configuration as a dict.
# NOTE: this rebinds `params`, replacing the XGBoost dictionary defined
# earlier in the file.
# The trailing comments presumably record each setting's reference value.
params = {
    'boosting_type': 'gbdt', # gbdt
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 50, # 50
    'learning_rate': 0.05, # 0.05
    'feature_fraction': 0.7, # 0.7
    'bagging_fraction': 0.8, # 0.8
    'bagging_freq': 5, # 5
    'verbose': 0, # 0
    'max_depth' : -1, # -1
    # 'num_class' : 2
}

In [None]:
# Train for at most 3000 rounds, evaluating on both the training and the
# validation dataset.
# NOTE(review): recent LightGBM releases (4.x) are understood to have dropped
# the `early_stopping_rounds` and `verbose_eval` keywords in favour of
# callbacks (lgb.early_stopping / lgb.log_evaluation). Confirm against the
# installed version before relying on this call.
clf2 = lgb.train(params,
                lgb_train,
                num_boost_round=3000,
                valid_sets=[lgb_train, lgb_valid],
                early_stopping_rounds = 40,
                verbose_eval=20)

#### Plot and check importance of features

In [None]:
# Plot the feature importances of the trained LightGBM model.
lgb.plot_importance(clf2)

#### Predict on the validation set

In [None]:
# Score the validation features with the trained LightGBM model.
pred2 = clf2.predict(X_valid[feature_names])

***
### Model 3 - Sklearn Ensembles

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Two scikit-learn ensembles with 500 estimators each.
# Only clf4 is used in the cells that follow; clf3 is not referenced again
# in the visible part of this file.
# NOTE(review): no random_state is passed here, unlike the seeded split and
# XGBoost settings - confirm whether run-to-run variation is acceptable.
clf3 = RandomForestClassifier(n_estimators = 500)
clf4 = GradientBoostingClassifier(n_estimators = 500)

In [None]:
# cross_val_score is not imported anywhere in the visible file, so import it
# here to avoid a NameError when the cell runs.
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated ROC AUC of the gradient boosting model on the
# training split.
cv_pred = cross_val_score(clf4, X_train[feature_names], y_train, cv = 10, scoring = 'roc_auc')
print(cv_pred)
print("Std AUC: " + str(np.std(cv_pred)))
print("MEAN AUC: " + str(np.mean(cv_pred)))

In [None]:
# Fit the gradient boosting model on the training split.
clf4.fit(X_train[feature_names], y_train)

In [None]:
# Predict on the test features with the fitted gradient boosting model.
# NOTE(review): predict() presumably returns class labels here, whereas the
# models above are tuned on AUC; use predict_proba() if scores are needed.
pred4 = clf4.predict(test[feature_names])