# Context:
    
The dataset contains 1000 entries with 20 categorical/symbolic attributes prepared by Prof. Hofmann. In this dataset, 
each entry represents a person who takes out credit from a bank. Each person is classified as a good or bad credit risk according to 
the set of attributes. The link to the dataset can be found below.

# Dataset:
https://www.kaggle.com/renaldydermawan25/credit-data/version/1

# Some Attribute information:
    
Age (numeric)

Sex (text: male, female)

Job (numeric: 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)

Housing (text: own, rent, or free)

Saving accounts (text - little, moderate, quite rich, rich)

Checking account (numeric, in DM - Deutsche Mark)

Credit amount (numeric, in DM)

Duration (numeric, in months)

Purpose (text: car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)

# Objective:
    
To Gauge Feature Importance

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import important library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Let pandas display up to 50 columns before truncating wide frames.
pd.options.display.max_columns = 50

In [None]:
# Load the credit data set; the "default" column is the dependent (target) variable.
df = pd.read_csv(filepath_or_buffer="credit_data.csv")
# Preview the first three rows.
df.head(n=3)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# # Randomly select 50% data for this use case
# from sklearn.model_selection import train_test_split
# out_data,df =train_test_split(df_pre, train_size = 0.5,random_state=5)
# df_pre.shape
# df_pre.columns
# df_pre.head(5)

In [None]:
# Before building the ensemble model, inspect the object-dtype (text) columns
# that will have to be encoded.
obj_df = df.select_dtypes(include="object")
obj_df.head(n=5)

In [None]:
# Dummy-encode the frame, dropping the first level of each encoded column.
dff = pd.get_dummies(data=df, drop_first=True)

In [None]:
dff.shape

In [None]:
# Look for strongly correlated column pairs (|r| > 0.8).
cor = dff.corr()
# Zero out the diagonal and the upper triangle so each pair is listed only once.
lower_triangle = np.tril(cor.to_numpy(), k=-1)
cor.loc[:, :] = lower_triangle
cor = cor.stack()
cor[cor.abs() > 0.8]

In [None]:
# Hold out 30% of the rows for testing (70:30 train/test split).
from sklearn.model_selection import train_test_split

# The target is the "default" column; every other column is a feature.
y = dff['default']
X = dff.drop(columns='default')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Fit a 500-tree random forest (entropy split criterion) on the training set.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=500,
    criterion="entropy",
    n_jobs=2,
    random_state=9999,
)
# fit() returns the estimator itself, so rfm refers to the same object as rf.
rfm = rf.fit(X_train, y_train)

In [None]:
# Hard class predictions for the test set.
y_pred = rf.predict(X_test)
# Column 1 of the predicted class probabilities.
test_probabilities = rf.predict_proba(X_test)
preds = test_probabilities[:, 1]

In [None]:
#calculate Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
def calculate_confusion_matrix(y_true, y_pred):
    """Print and return the confusion matrix for a set of predictions.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, in the same order as ``y_true``.

    Returns:
        The matrix produced by ``sklearn.metrics.confusion_matrix``.
    """
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    # Return the matrix as well so callers can reuse it instead of only seeing it printed.
    return cm

In [None]:
# Show the confusion matrix, then the overall accuracy, for the test-set predictions.
calculate_confusion_matrix(y_true=y_test, y_pred=y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# View a list of the features and their importance scores
importances = rf.feature_importances_
# Positions of the 15 most important features, highest importance first.
indices = np.argsort(importances)[::-1][:15]
# Take the names from the frame the model was fitted on so they line up with
# `importances`. The previous `Index.drop('default', 1)` passed 1 as the
# `errors` argument of Index.drop, not as an axis.
features = X_train.columns
#plot it
plt.figure(figsize=(10,10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
trainResult = rf.score(X_train, y_train)
testResult = rf.score(X_test, y_test)

In [None]:
"Train Accuracy:"
(trainResult*100.0)

In [None]:
"Test Accuracy:"
(testResult*100.0)

# K-fold cross-validation

k-fold cross-validation (without stratification)

In practical settings, k is usually set to 10-20, depending on the size of the data set

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
num_folds = 10
seed = 77

In [None]:
# shuffle=True is required for random_state to take effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

In [None]:
# Score the random forest on each of the k folds.
results = cross_val_score(estimator=rfm, X=X, y=y, cv=kfold)
results

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
num_folds = 10
seed = 77

In [None]:
# shuffle=True is required for random_state to take effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

In [None]:
# Score the random forest on each of the k folds.
results = cross_val_score(estimator=rfm, X=X, y=y, cv=kfold)
results

In [None]:
# Mean of the absolute fold scores.
np.abs(results).mean()

In [None]:
results.std()

# Leave-One-Out Cross-Validation

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

In [None]:
# Run leave-one-out on the training split only, to save time: it fits one model
# per sample, so it is slow and not recommended for larger data sets.
loo = LeaveOneOut()
scores = cross_val_score(estimator=rfm, X=X_train, y=y_train, cv=loo)
scores

Because the training set has 700 samples (70% of the 1000 entries), leave-one-out cross-validation yields scores for 700 trials; 
each score is either 1.0 (the held-out sample was classified correctly) or 0.0 (it was misclassified). Taking the mean of these gives an estimate of the accuracy (one minus the error rate):

In [None]:
scores.mean()

In [None]:
scores.std()

# Stratified cross-validation (Bonus)

k-fold cross validation with stratification

In [None]:
k = 10
from sklearn.model_selection  import StratifiedKFold, cross_val_score
# shuffle=True is required for random_state to take effect; recent scikit-learn
# versions raise a ValueError when random_state is set while shuffle is False.
stratified_kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=55)
# One score per stratified fold.
results = cross_val_score(rfm, X, y, cv=stratified_kfold)

In [None]:
# Summarise the stratified k-fold scores (label spelling corrected from "Avearge").
print('Accuracy Score')
print('Average: ', results.mean())
print('Standard deviation: ', results.std())

# Bootstrapping (Bonus)

Given a dataset of size n, a bootstrap sample is created by sampling n instances uniformly from the data 
with replacement (so some instances may appear more than once and others not at all)

Create a model with each bootstrap sample and validate it with the test set

Final result is calculated by averaging the accuracy of models

In [None]:
# Number of iterations for bootstrapping
bootstrap_iteration = 10
accuracy = []

In [None]:
from sklearn.base import clone
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

# Start from an empty list so re-running this cell does not keep appending to
# the results of a previous run.
accuracy = []

for i in range(bootstrap_iteration):
    # Seed each draw with the iteration index: the experiment is reproducible
    # while every bootstrap sample is still different.
    X_, y_ = resample(X_train, y_train, random_state=i)
    # Fit a fresh copy so the forest trained earlier (rf / rfm) is not overwritten.
    boot_model = clone(rfm).fit(X_, y_)
    boot_pred = boot_model.predict(X_test)

    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(y_test, boot_pred)
    accuracy.append(acc)

In [None]:
# Convert to an array under a new name: rebinding `accuracy` itself to an
# ndarray would break `accuracy.append(...)` if the bootstrap loop is re-run.
accuracy_scores = np.asarray(accuracy)
print('Accuracy Score')
# Label spelling corrected from "Avearge".
print('Average: ', accuracy_scores.mean())
print('Standard deviation: ', accuracy_scores.std())

Inference:

Here, k-fold cross-validation with stratification gives a better result than bootstrapping.