In [None]:
# LIBRARIES

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import scipy

from scipy import ndimage
from sklearn import preprocessing
import math


import matplotlib
import seaborn as sns
%matplotlib inline 

import gc
from datetime import datetime 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Let displayed DataFrames show up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100


In [None]:
# PARAMETERS

# Seed shared by every stochastic step so that runs are repeatable.
RANDOM_STATE = 2018

# --- RandomForestClassifier settings ---
NO_JOBS = 4             # number of parallel jobs
NUM_ESTIMATORS = 500    # number of estimators (trees)
RFC_METRIC = 'gini'     # split-quality criterion

# --- Validation ---
# Fraction for a simple train_test_split hold-out
# (not referenced by the cells visible in this part of the notebook).
VALID_SIZE = 0.2

# --- LightGBM ("lgb") settings, per the original notes ---
# (not referenced by the cells visible in this part of the notebook)
MAX_ROUNDS = 1000       # boosting iterations
OPT_ROUNDS = 1000       # to be adjusted based on the best validation round
EARLY_STOP = 50         # early-stopping patience
VERBOSE_EVAL = 50       # how often the metric is printed

In [None]:
# DATASET

# Locations of the feature (X) and label (Y) CSV files for the train and test splits.
path_X_train = "./Data/X_train.csv"
path_Y_train = "./Data/Y_train.csv"
path_X_test = "./Data/X_test.csv"
path_Y_test = "./Data/Y_test.csv"

# Load the four files in order, treating the first row of each as the header.
X_train, Y_train, X_test, Y_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (path_X_train, path_Y_train, path_X_test, path_Y_test)
)

In [None]:
# INITIALIZE RFC

# Gather the hyper-parameters defined in the PARAMETERS cell, then build the
# (still unfitted) random forest from them.
rfc_params = dict(
    n_estimators=NUM_ESTIMATORS,
    criterion=RFC_METRIC,
    n_jobs=NO_JOBS,
    random_state=RANDOM_STATE,
    verbose=False,
)
clf = RandomForestClassifier(**rfc_params)

In [None]:
# TRAIN RFC

# Fit on every column except the 'ID' identifier.
# Y_train comes from read_csv and is therefore a DataFrame. When it holds a
# single column, squeeze(axis=1) turns it into a 1-D Series so scikit-learn
# receives the 1-D target it expects instead of a column vector (which raises
# a DataConversionWarning). A frame with several columns is passed unchanged.
clf.fit(X_train.drop(columns='ID'), Y_train.squeeze(axis=1))

In [None]:
# PREDICTIONS FOR RFC

# Predict class labels for the test set; the 'ID' column is excluded,
# matching the columns the classifier was fitted on.
X_test_features = X_test.drop(columns='ID')
preds = clf.predict(X_test_features)

In [None]:
# FEATURE IMPORTANCE COMPUTATION

# Display names paired positionally with clf.feature_importances_.
# NOTE(review): assumed to follow the column order of X_train without 'ID' —
# confirm against the CSV header, otherwise the bars are mislabelled.
predictors = [
    'LIMIT_BAL', 'GENDER', 'EDUCATION', 'MARITAL_STATUS', 'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

# Two-column frame: feature name and its importance in the fitted forest.
tmp = pd.DataFrame({'Feature': predictors})
tmp['Feature importance'] = clf.feature_importances_

# One bar per feature; x labels are rotated so the names stay readable.
plt.figure(figsize=(7, 4))
s = sns.barplot(data=tmp, x='Feature', y='Feature importance')
s.set_title('Features importance', fontsize=14)
s.set_xticklabels(s.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# PRINT FEATURE IMPORTANCE

# Bare last expression: the notebook displays the fitted classifier's
# feature_importances_ array as the cell output.
clf.feature_importances_

In [None]:
# AUC

# ROC AUC measures how well the model ranks samples, so it must be scored on
# continuous scores. Use the predicted probability of the positive class
# (second column of predict_proba, i.e. clf.classes_[1]) instead of the hard
# labels from clf.predict, which collapse the ROC curve to a single operating
# point and understate the AUC.
# Assumes a binary target, as the 2x2 confusion-matrix unpacking later implies.
pos_class_proba = clf.predict_proba(X_test.drop(columns='ID'))[:, 1]
roc_auc_score(Y_test, pos_class_proba)

In [None]:
# CONFUSION MATRIX

from sklearn.metrics import confusion_matrix

# Compute the matrix once and reuse it (previously it was computed twice and
# the first result was silently discarded).
cm = confusion_matrix(Y_test, preds)

# For a binary target the flattened matrix reads tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

# Show the matrix itself as the cell output.
cm

In [None]:
# PRECISION

# Share of predicted positives that are actually positive: tp / (tp + fp).
predicted_positives = tp + fp
tp / predicted_positives

In [None]:
# RECALL

# Share of actual positives that the model recovered: tp / (tp + fn).
actual_positives = tp + fn
tp / actual_positives

In [None]:
# ACCURACY

# Correct predictions divided by the number of evaluated samples. The total is
# taken from the confusion-matrix cells instead of a hard-coded 6000, so the
# figure stays correct if the test set size changes.
(tp + tn) / (tp + tn + fp + fn)

In [None]:
# PROBABILITY

# Positional index of the single training row to inspect.
# NOTE(review): the name predictions_TP suggests this row was picked as a
# true-positive example — confirm; note that it is drawn from X_train.
SAMPLE_ROW = 4669

# Select the row as a one-row DataFrame (double brackets keep it 2-D).
sample_row = X_train.iloc[[SAMPLE_ROW]]

# x stays a (1, n_columns) array including 'ID', as before. to_numpy() replaces
# DataFrame.as_matrix, which was removed in pandas 1.0, and the shape is no
# longer hard-coded to 24 columns.
x = sample_row.to_numpy()

# Class probabilities for that row. 'ID' is dropped by name, exactly as for
# fit/predict, instead of assuming it is the first column.
predictions_TP = clf.predict_proba(sample_row.drop(columns='ID'))