In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Supervised Learning: Classification

## Reading and investigating the data

In [None]:
# Load the bank dataset from a CSV file located in the current working directory.
data = pd.read_csv('bankdata.csv')

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Frequency of each class in `status`, the column later used as the target.
data['status'].value_counts()

### Investigating the numerical variables

In [None]:
# Summary statistics of the dataset's columns.
data.describe()

#### Investigating variable duration

In [None]:
# Frequency of each distinct value of `duration`.
data['duration'].value_counts()

In [None]:
# Treat `duration` as a categorical feature by turning every value into its string form.
data['duration'] = data['duration'].map(str)

In [None]:
# Checking for multicollinearity among the numeric features.
# Only numeric columns are correlated: `status` and the stringified `duration`
# are non-numeric, and DataFrame.corr() raises on such columns in pandas >= 2.0
# (older versions dropped them silently).
plt.figure(figsize=(10, 8))
ax = sns.heatmap(data.select_dtypes(include=np.number).corr(), annot=True)
plt.show()

`payments` and `amount_left` are highly correlated, so we will include only one of them in the model: we keep `payments` and drop `amount_left`.

In [None]:
# Distribution plots for the numeric variables. sns.distplot is deprecated;
# histplot with kde=True and stat="density" gives the equivalent
# density-normalised histogram with a KDE overlay.
#sns.histplot(data['amount_total'], kde=True, stat="density")
#plt.show()

#sns.histplot(data['balance'], kde=True, stat="density")
#plt.show()

sns.histplot(data['payments'], kde=True, stat="density")
plt.show()

### Preprocessing the numerical variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric features only; `amount_left` is left out of the model inputs.
data_num = data.select_dtypes(include=np.number).drop(columns='amount_left')
print(data_num.head())

# Standardize each numeric column with a fitted StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test-set statistics influence the scaling.
transformer = StandardScaler()
transformer.fit(data_num)
data_num_standardized = transformer.transform(data_num)

# Wrap the scaled array in a DataFrame again, restoring the column names.
x = pd.DataFrame(data_num_standardized, columns=data_num.columns)
print(x.head())

In [None]:
# Distribution of `balance` before standardization.
# histplot replaces the deprecated sns.distplot (density histogram + KDE overlay).
sns.histplot(data['balance'], kde=True, stat="density")
plt.show()

In [None]:
# Distribution of `balance` after standardization.
# histplot replaces the deprecated sns.distplot (density histogram + KDE overlay).
sns.histplot(x['balance'], kde=True, stat="density")
plt.show()

### Getting dummies for the categorical variables

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Every object-typed column except the target `status` is one-hot encoded.
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24.
cat = data.select_dtypes(include=object).drop(columns='status')
enc = OneHotEncoder()
encoded = enc.fit_transform(cat).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older releases. Passing the
# original column names yields readable dummy names (<column>_<value>).
if hasattr(enc, 'get_feature_names_out'):
    feature_names = enc.get_feature_names_out(cat.columns)
else:
    feature_names = enc.get_feature_names(cat.columns)

categorical = pd.DataFrame(encoded, columns=feature_names)
categorical.head()

In [None]:
# Row-wise sum of the dummy columns, as a sanity check on the encoding:
# each encoded source column should contribute exactly one 1 per row.
categorical.sum(axis = 1)

In [None]:
# Target: the loan status label.
y = data['status']

# Features: standardized numeric columns side by side with the one-hot encoded categoricals.
X = pd.concat([x, categorical], axis='columns')
X.head()

#### Splitting into train and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows as the test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the training set, allowing the solver up to 500 iterations.
logisticr = LogisticRegression(max_iter = 500).fit(X_train, y_train)

In [None]:
# Predict the test-set labels, then display the model's score on the test set.
predictions = logisticr.predict(X_test)
logisticr.score(X_test, y_test)

In [None]:
# Actual class distribution in the test set.
y_test.value_counts()

In [None]:
# Predicted class distribution, to compare with the actual distribution of y_test.
pd.Series(predictions).value_counts()

What happened in terms of the classes?

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the training set. random_state is fixed (same seed
# value as the train/test split) so the fitted tree, its score and its feature
# importances are reproducible across runs.
decisiontree = DecisionTreeClassifier(random_state=100).fit(X_train, y_train)

In [None]:
# Predict the test-set labels with the decision tree, then display its score on the test set.
predictions = decisiontree.predict(X_test)
decisiontree.score(X_test, y_test)

In [None]:
# Class distribution of the decision tree's predictions.
pd.Series(predictions).value_counts()

In [None]:
# Importance the fitted tree assigns to each input feature.
decisiontree.feature_importances_

In [None]:
# Pair each feature name with its importance for easier reading.
print(dict(zip(X.columns, decisiontree.feature_importances_)))

In [None]:
# Bar chart of the feature importances, one bar per feature.
# The wide size is set on this figure only: assigning to plt.rcParams would
# change the default size of every figure drawn afterwards in the notebook.
plt.figure(figsize=(25, 6))
plt.bar(height = decisiontree.feature_importances_, x = X.columns)
plt.show()

### SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, trained on the training set.
svm = SVC()
svm.fit(X_train, y_train)

# Test-set predictions, the model's test score, and the predicted class distribution.
predictions = svm.predict(X_test)
print(svm.score(X_test, y_test))
print(pd.Series(predictions).value_counts())

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier using the 3 closest training points.
nearest_neighbors = KNeighborsClassifier(n_neighbors=3)
nearest_neighbors.fit(X_train, y_train)

# Test-set predictions, the model's test score, and the predicted class distribution.
predictions = nearest_neighbors.predict(X_test)
print(nearest_neighbors.score(X_test, y_test))
print(pd.Series(predictions).value_counts())

In [None]:
# Print the documentation of KNeighborsClassifier to look up its parameters.
help(KNeighborsClassifier)

## Supervised Learning: Regression

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
# SVR
from sklearn.svm import SVR
# KNN
from sklearn.neighbors import KNeighborsRegressor

## Model Evaluation

### Creating the Confusion Matrix

In [None]:
from sklearn.linear_model import LogisticRegression
# Refit the logistic regression so that the evaluation below starts from a
# known model (`logisticr` is rebound to the new fit).
logisticr = LogisticRegression(max_iter = 500).fit(X_train, y_train)

In [None]:
# Test-set predictions of the logistic regression; they feed the confusion
# matrix and the metrics computed in the following cells.
predictions = logisticr.predict(X_test)
logisticr.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true test labels (first argument) against the predicted labels.
cf_matrix = confusion_matrix(y_test, predictions)
print(cf_matrix)

In [None]:
# Confusion matrix as a heatmap; every cell is shown as a percentage of all test observations.
sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='.2%', cmap='Blues')

#### Precision, Recall and the F1 Score

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# The metrics need to know which label is the 'positive' class. We are trying to
# predict whether a customer will run into problems repaying their loan, so
# status 'B' is the positive class (meaning credit problems = True).
# Printed in order: precision, recall, F1.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, predictions, pos_label='B'))

In [None]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy of the current `predictions` against the true test labels.
balanced_accuracy_score(y_test, predictions)

#### ROC and AUC analysis

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# Print the documentation of roc_curve to check its arguments and return values.
help(roc_curve)

In [None]:
# we need to calculate class probabilities!
# predict_proba returns one column per class, ordered like logisticr.classes_.
# Look up the column of the positive class 'B' instead of hard-coding index 1,
# so the scores always match the pos_label passed to roc_curve.
positive_col = list(logisticr.classes_).index('B')
y_pred_proba = logisticr.predict_proba(X_test)[:, positive_col]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label='B')

# ROC curve: true positive rate against false positive rate over all thresholds.
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

In [None]:
# Print the documentation of roc_auc_score to check its arguments.
help(roc_auc_score)

In [None]:
# Area under the ROC curve for the probabilities computed above.
# NOTE(review): no positive label is passed here; confirm that roc_auc_score
# treats 'B' as the positive class for these string labels.
roc_auc_score(y_test, y_pred_proba)

### Repeating all the steps for a Decision Tree for comparison:

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Refit the decision tree for the evaluation below. random_state is fixed (same
# seed value as the train/test split) so the reported metrics are reproducible.
decisiontree = DecisionTreeClassifier(random_state=100).fit(X_train, y_train)

In [None]:
# Test-set predictions of the decision tree; they feed the confusion matrix
# and the metrics computed in the following cells.
predictions = decisiontree.predict(X_test)
decisiontree.score(X_test, y_test)

In [None]:
# Confusion matrix of the true test labels against the decision tree's predictions.
cf_matrix = confusion_matrix(y_test, predictions)
print(cf_matrix)

In [None]:
# Decision-tree confusion matrix as a heatmap; every cell is a percentage of all test observations.
sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Precision, recall and F1 (printed in that order) with 'B' as the positive class.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, predictions, pos_label='B'))

In [None]:
# Balanced accuracy of the decision tree's predictions on the test set.
balanced_accuracy_score(y_test, predictions)

In [None]:
# predict_proba returns one column per class, ordered like decisiontree.classes_.
# Look up the column of the positive class 'B' instead of hard-coding index 1,
# so the scores always match the pos_label passed to roc_curve.
positive_col = list(decisiontree.classes_).index('B')
y_pred_proba = decisiontree.predict_proba(X_test)[:, positive_col]
# we need to assign which one is the "positive" (1) category
fpr, tpr, _ = roc_curve(y_test, y_pred_proba, pos_label='B')

# ROC curve: true positive rate against false positive rate over all thresholds.
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

In [None]:
# Area under the ROC curve for the decision tree's probabilities.
# NOTE(review): no positive label is passed here; confirm that roc_auc_score
# treats 'B' as the positive class for these string labels.
roc_auc_score(y_test, y_pred_proba)

### Hyperparameter Tuning

Can we do even better?

In [None]:
from sklearn.model_selection import GridSearchCV
# Print the documentation of GridSearchCV to check its parameters.
help(GridSearchCV)

In [None]:
# Print the documentation of DecisionTreeClassifier to see which hyperparameters can be tuned.
help(DecisionTreeClassifier)

In [None]:
# Hyperparameter grid for the decision tree: 2 criteria x 2 splitters x 3 depths.
# (The `_rf` suffix is kept because later cells reference these names; the
# estimator being tuned is a single decision tree.)
param_grid_rf = {"criterion": ["gini", "entropy"],
                 "splitter": ["best", "random"],
                 "max_depth": [3, 10, None]}

# Exhaustive search over the grid with 3-fold cross-validation on the training
# set. random_state is fixed on the estimator so the search, which includes
# the "random" splitter, selects the same parameters on every run.
tune_rf = GridSearchCV(DecisionTreeClassifier(random_state=100),
                       param_grid=param_grid_rf,
                       cv=3)

tune_rf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
tune_rf.best_params_

In [None]:
# Fit a decision tree with the parameters chosen by the grid search.
# random_state is fixed so the refit tree and its metrics are reproducible;
# it does not clash with best_params_, which holds only the grid's own keys.
decisiontree_opt = DecisionTreeClassifier(random_state=100, **tune_rf.best_params_).fit(X_train, y_train)

In [None]:
# Test-set predictions of the tuned decision tree, then its score on the test set.
predictions = decisiontree_opt.predict(X_test)
decisiontree_opt.score(X_test, y_test)

In [None]:
# Confusion matrix of the true test labels against the tuned tree's predictions.
cf_matrix = confusion_matrix(y_test, predictions)
print(cf_matrix)

In [None]:
# Tuned-tree confusion matrix as a heatmap; every cell is a percentage of all test observations.
sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='.2%', cmap='Blues')

In [None]:
# Precision, recall and F1 (printed in that order) with 'B' as the positive class.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, predictions, pos_label='B'))

In [None]:
# Balanced accuracy of the tuned tree's predictions on the test set.
balanced_accuracy_score(y_test, predictions)

### Cross Validation

Is our split into training and test set biased?

In [None]:
from sklearn.model_selection import cross_validate
# Print the documentation of cross_validate to check its arguments and return value.
help(cross_validate)

In [None]:
# Cross-validate a decision tree on the training set, scoring each fold by
# balanced accuracy. random_state is fixed so the fold scores are the same
# every time the cell is run.
cross_validate(DecisionTreeClassifier(random_state=100),
               X_train,
               y_train,
               scoring="balanced_accuracy")

In [None]:
# Mean balanced accuracy across the cross-validation folds. The tree is seeded
# so this figure is reproducible rather than changing on every re-run of the cell.
cross_validate(DecisionTreeClassifier(random_state=100),
               X_train,
               y_train,
               scoring="balanced_accuracy")["test_score"].mean()