In [None]:
import os
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def plot_roc_auc(fpr, tpr, title):
    '''Plot a ROC curve, put its AUC in the legend and show the figure.

    fpr, tpr -- false / true positive rates; they are passed to
                ``metrics.auc`` and drawn as the x and y values of the curve.
    title    -- text used as the figure title.
    '''
    auc_label = 'AUC = %0.2f' % metrics.auc(fpr, tpr)

    plt.title(title)
    plt.plot(fpr, tpr, 'b', label=auc_label)
    plt.legend(loc='lower right')
    # Red dashed reference diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')

    # Both axes are limited to the unit interval.
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
  


def train_model(X, target_name='y', split='in_sample', model_type='lgb', params='default',
                objective='binary', metric='auc', save_path=''):
    '''Train a LightGBM booster on ``X``, plot its ROC curve and return the model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns plus the ``target_name`` column. When ``split`` is not
        ``'in_sample'`` it must also contain a ``'train'`` column: rows where
        it equals 1 are used for training, all other rows for validation.
    target_name : str
        Name of the label column in ``X``.
    split : str
        ``'in_sample'`` holds out 10% of the rows with ``train_test_split``
        (fixed ``random_state``); any other value uses the ``'train'`` column.
    model_type : str
        Only ``'lgb'`` is implemented.
    params : dict or ``'default'``
        Parameters handed to ``lgb.train``. The string ``'default'`` selects
        the built-in parameter set below.
    objective, metric : str
        Only used to fill the built-in parameter set.
    save_path : str
        If non-empty, the model is pickled to ``<save_path>/_model.pickle``.

    Returns
    -------
    The booster returned by ``lgb.train``.

    Raises
    ------
    ValueError
        If ``model_type`` is not ``'lgb'``.
    '''
    # Fail early: without this check an unsupported model_type only surfaced
    # later as a NameError on the never-assigned ``model``.
    if model_type != 'lgb':
        raise ValueError(f"unsupported model_type {model_type!r}; only 'lgb' is implemented")

    if split == 'in_sample':
        X_tr, X_val, y_train, y_val = train_test_split(X.drop(columns=target_name),
                                                       X[target_name],
                                                       test_size=0.1, random_state=1926)
    else:
        is_train = X['train'] == 1
        is_val = X['train'] != 1
        non_features = [target_name, 'train']
        X_tr = X[is_train].drop(columns=non_features)
        X_val = X[is_val].drop(columns=non_features)
        y_train = X[is_train][target_name]
        y_val = X[is_val][target_name]

    if isinstance(params, str) and params == 'default':
        # NOTE(review): per the LightGBM parameter docs 'reg_alpha' is an alias
        # of 'lambda_l1' and 'subsample' an alias of 'bagging_fraction', yet
        # they carry different values here -- confirm which ones are intended.
        params = {'num_leaves': 54, 'min_data_in_leaf': 79, 'objective': objective,
                  'max_depth': 3, 'learning_rate': 0.01, 'boosting': 'gbdt', 'feature_fraction': 1,
                  'bagging_freq': 5, 'bagging_fraction': 0.9, 'bagging_seed': 11, 'metric': metric, 'lambda_l1': 0.1,
                  'verbosity': -1, 'min_child_weight': 5, 'reg_alpha': 3, 'reg_lambda': 2, 'subsample': 0.8, 'seed': 1926}

    train_set = lgb.Dataset(X_tr, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)

    print('#'*20 + ' '*5 + 'training with ', X_tr.shape[0], ' '*5 + '#'*20)
    print('#'*20 + ' '*5 + 'validating with ', X_val.shape[0], ' '*5 + '#'*20)

    # Early stopping goes through the callback API: the
    # ``early_stopping_rounds`` keyword is no longer accepted by ``lgb.train``
    # in LightGBM 4.x, while the callback is available in 3.x and 4.x alike.
    model = lgb.train(params,
                      train_set,
                      num_boost_round=1000,
                      valid_sets=[train_set, val_set],
                      callbacks=[lgb.early_stopping(stopping_rounds=20)])

    if save_path != '':
        with open(save_path + '/_model.pickle', 'wb') as pfile:
            pickle.dump(model, pfile, protocol=pickle.HIGHEST_PROTOCOL)

    # Score with exactly the columns the training set was built from; dropping
    # only the target would leave the 'train' flag in for the non-in-sample split.
    all_features = X[X_tr.columns]
    fpr, tpr, _ = metrics.roc_curve(X[target_name], model.predict(all_features))
    plot_roc_auc(fpr, tpr, 'ROC AUC curve for the train and validation sets')

    return model


  



def model_report(model, X, target_name='y', model_type='lgb', save_path=''):
    '''Build a feature-importance table and a score-decile table for ``model``.

    Parameters
    ----------
    model
        Trained LightGBM booster; ``feature_name``, ``feature_importance`` and
        ``predict`` are called on it.
    X : pandas.DataFrame
        Must contain the ``target_name`` column and every column listed by
        ``model.feature_name()``. Other columns are ignored for scoring.
    target_name : str
        Name of the label column in ``X``.
    model_type : str
        Only ``'lgb'`` is implemented.
    save_path : str
        If non-empty, both tables are written there as ``;``-separated CSV
        files with ``,`` as decimal mark.

    Returns
    -------
    (feat_imp, decile) : tuple of pandas.DataFrame
        ``feat_imp`` has one row per feature, sorted by gain, with split-count
        (column ``'cover'``), gain, summed SHAP and summed absolute SHAP.
        ``decile`` has one row per score decile with its size, mean target,
        mean score and the mean of every feature.

    Raises
    ------
    ValueError
        If ``model_type`` is not ``'lgb'``.
    '''
    # Fail early: without this check an unsupported model_type only surfaced
    # later as a NameError on the never-assigned ``feat_imp``.
    if model_type != 'lgb':
        raise ValueError(f"unsupported model_type {model_type!r}; only 'lgb' is implemented")

    # Use the model's own feature list so helper columns in X (for example a
    # 'train' flag) are never passed to predict / SHAP.
    feature_names = list(model.feature_name())
    features = X[feature_names]

    feat_imp = pd.DataFrame({'feature': feature_names,
                             'cover': model.feature_importance(importance_type='split'),
                             'gain': model.feature_importance(importance_type='gain')}
                            ).sort_values('gain', ascending=False)

    explainer = shap.TreeExplainer(model)
    raw_shap = explainer.shap_values(features)
    # NOTE(review): the original code always took element 0, which presumes a
    # per-class list; the index is now applied only when a list is returned so
    # a single array is used as-is. Confirm against the installed shap version
    # which class element 0 refers to.
    shap_values = raw_shap[0] if isinstance(raw_shap, list) else raw_shap
    shap.summary_plot(shap_values, features)

    shap_dict = dict(zip(feature_names, shap_values.sum(axis=0)))
    abs_shap_dict = dict(zip(feature_names, np.abs(shap_values).sum(axis=0)))
    feat_imp['sum_shap'] = feat_imp['feature'].map(shap_dict)
    feat_imp['abs_shap'] = feat_imp['feature'].map(abs_shap_dict)

    scored = X.copy()
    # A tiny random jitter is added to the scores before binning, presumably
    # to break ties so that qcut can form ten distinct bins.
    scored['score'] = model.predict(features) + np.random.rand(scored.shape[0])/1e5
    scored['decile'] = pd.qcut(scored['score'], 10, labels=['Decile ' + str(i) for i in range(1, 11)])

    # Group once and reuse; observed=False keeps every decile label as a row.
    by_decile = scored.groupby('decile', observed=False)
    decile = by_decile.size().reset_index()
    decile.columns = ['Decile', '# observations']
    # All aggregates come from the same groupby, so they share the row order
    # of ``decile`` and can be assigned positionally as plain numeric arrays.
    decile['Average target'] = by_decile[target_name].mean().to_numpy()
    decile['Average score'] = by_decile['score'].mean().to_numpy()
    # Feature means follow the gain-sorted order of feat_imp.
    feature_means = by_decile[list(feat_imp['feature'])].mean().reset_index(drop=True)
    decile = pd.concat([decile, feature_means], axis=1)

    if save_path != '':
        feat_imp.to_csv(os.path.join(save_path, 'FeatureImportances.csv'), index=False, sep=';', decimal=',')
        decile.to_csv(os.path.join(save_path, 'DecileAnalysis.csv'), index=False, sep=';', decimal=',')

    return feat_imp, decile