# Chapter 1_2: Build a loan-approval classifier

# 1. Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# let long cell contents be displayed in full (up to 5000 characters per column);
# the full option name is used because abbreviated names can become ambiguous
# when pandas adds new options
pd.set_option('display.max_colwidth', 5000)

In [None]:
# %pip installs into the environment of the running kernel
# (a plain !pip can end up targeting a different Python installation)
%pip install sweetviz

In [None]:
import sweetviz

# 2. Load and inspect the data
First upload the file `loan_approval.csv` so that the notebook can read it from its working directory.

(original source of data: https://www.kaggle.com/sethirishabh/finance-company-loan-data)

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# load the CSV data to a Pandas dataframe
df = pd.____('loan_approval.csv')
# show the first two rows of the dataframe
display(df.____(2))
# show detailed information about column names, data types and missing values
# (this method prints its report itself, so it is not wrapped in print())
df.____()
# 'Loan_Status' is the label: show a bar-chart of the class frequencies
df['Loan_Status'].____().plot(kind='____')

In [None]:
#@title Solution
# load the CSV data to a Pandas dataframe
df = pd.read_csv('loan_approval.csv')
# show the first two rows of the dataframe
display(df.head(2))
# show detailed information about column names, data types and missing values
# (info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output)
df.info()
# 'Loan_Status' is the label: show a bar-chart of the class frequencies
df['Loan_Status'].value_counts().plot(kind='bar')

## Optional: Exploratory Data Analysis

In [None]:
# per-feature settings for the report: 'Loan_ID' is skipped and
# 'Loan_Amount_Term' is forced to be handled as a numeric feature
feature_config = sweetviz.FeatureConfig(skip='Loan_ID', force_num=['Loan_Amount_Term'])
# analyze the dataframe (passed together with the display name 'Dataset'),
# using the label column 'Loan_Status' as the target feature
my_report = sweetviz.analyze(
    [df,'Dataset'],
    target_feat='Loan_Status',
    feat_cfg=feature_config)
# render the report inside the notebook
my_report.show_notebook()

# 3. Build the model

## Extra: Rudimentary Data Preprocessing
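The model used below can only be trained on features that are numeric and contain no missing values, so we keep only the numeric columns (plus the label) and drop every row that has a missing value.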

In [None]:
# columns to keep: the feature columns used for modelling plus the 'Loan_Status' label
selected_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Loan_Status',
]

# restrict the dataframe to those columns and discard every row
# that has a missing value in any of them
df = df.loc[:, selected_columns].dropna(how='any')

# report how many rows survived the clean-up
print(f'num rows after dropna: {len(df)}')

## 3.1 Training/Test split

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# 80%/20% stratified split (use class label for stratification)
# X_* receive the feature columns, Y_* the label column; these four names
# are reused by the training, prediction and evaluation cells further down
X_train, X_test, Y_train, Y_test = ____(df.____(columns=['____']),
                                                    df['____'],
                                                    test_size=____,
                                                    random_state=42,
                                                    ____=df['Loan_Status'])

# show the number of rows in training and test sets
print(f'Number of rows in Training Data: {len(____)}')
print(f'Number of rows in Test Data: {len(____)}')

In [None]:
#@title Solution
# 80%/20% stratified split (use class label for stratification)
# - features: every column except 'Loan_Status'; labels: the 'Loan_Status' column
# - test_size=0.2 puts 20% of the rows into the test set
# - random_state=42 fixes the shuffling so the split is the same on every run
# - stratify=... keeps the class proportions comparable in both sets
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns=['Loan_Status']),
                                                    df['Loan_Status'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df['Loan_Status'])

# show the number of rows in training and test sets
print(f'Number of rows in Training Data: {len(X_train)}')
print(f'Number of rows in Test Data: {len(X_test)}')

### Optional: Verify that the training/test split is stratified

The cell below counts the number of cases per label value in the training and test sets and divides each count by the number of rows of the corresponding set, in order to verify that the proportions of the label values in the two sets are comparable.

In [None]:
# relative class frequencies, first for the training labels, then for the test labels
for split_labels in (Y_train, Y_test):
    class_counts = split_labels.value_counts()
    print(class_counts / len(split_labels))

## 3.2 Generate features
Not required (features are already available)

## 3.3 Train the model

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# define model type and hyper-parameter values
# (both classes get the same weight of 0.5 here)
model = LogisticRegression(C=1,
                           max_iter=10**5,
                           class_weight={'Y':0.5, 'N':0.5},
                           random_state=42)

# fit the model to the training data
model.____(____, ____)

In [None]:
#@title Solution
# define model type and hyper-parameter values
# - C=1 and the equal class weights (0.5 / 0.5) are starting values to experiment with
# - max_iter=10**5 raises the iteration limit of the solver
# - random_state=42 is fixed so that repeated runs are comparable
model = LogisticRegression(C=1,
                           max_iter=10**5,
                           class_weight={'Y':0.5, 'N':0.5},
                           random_state=42)

# fit the model to the training data
# (features of the training set first, then the matching labels)
model.fit(X_train, Y_train)

## 3.4 Train a baseline
Use "most_frequent" as strategy (see documentation of [DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html))

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# create the baseline classifier with the strategy named in the text above
baseline = ____(strategy="____")
# train the baseline on the same training data as the model
baseline.____(____, ____)

In [None]:
#@title Solution
# baseline that always predicts the most frequent class of the training labels
baseline = DummyClassifier(strategy="most_frequent")
# fit on the same training data as the model, so both are evaluated on equal terms
baseline.fit(X_train, Y_train)

## 3.5 Generate predictions for the test set

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# generate predictions with the model
Y_pred = ____.____(____)
# generate predictions with the baseline
# (use the same input data as for the model so the results are comparable)
Y_pred_baseline = ____.____(____)

In [None]:
#@title Solution
# generate predictions with the model
# (one predicted class label per row of the test features)
Y_pred = model.predict(X_test)
# generate predictions with the baseline
# (same test features, so both prediction sets can be compared against Y_test)
Y_pred_baseline = baseline.predict(X_test)

# 4. Evaluate the predictions

## Extra: Prediction Accuracy
Measure how frequently the predicted label is equal to the ground truth by using the function [accuracy_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)

In [None]:
%%script echo skip

# Exercise: replace every ____ placeholder below.
# scikit-learn metrics take the ground-truth labels first and the predictions second
# accuracy of model
model_accuracy = ____(Y_test, ____)
print(f'model accuracy: {np.round(____*100, 2)}')

# accuracy of baseline
baseline_accuracy = ____(Y_test, ____)
print(f'baseline accuracy: {np.round(baseline_accuracy*100, 2)}')

In [None]:
#@title Solution
# accuracy of model: fraction of test rows whose predicted label equals the true label
# (scikit-learn metrics expect the ground truth first: accuracy_score(y_true, y_pred);
# accuracy itself is symmetric, but other metrics are not, so the order is kept consistent)
model_accuracy = accuracy_score(Y_test, Y_pred)
print(f'model accuracy: {np.round(model_accuracy*100, 2)}')

# accuracy of baseline
baseline_accuracy = accuracy_score(Y_test, Y_pred_baseline)
print(f'baseline accuracy: {np.round(baseline_accuracy*100, 2)}')

## 4.1 Evaluation with Confusion Matrix

In [None]:
def plot_confusion_matrix(confusion_matrix, class_labels):
  """Draw a confusion matrix as an annotated heatmap.

  Args:
    confusion_matrix: 2-D array of counts; rows are labelled 'Actual' and
      columns 'Predicted'. Inside this function the name refers to this
      argument, not to the scikit-learn function of the same name.
    class_labels: tick labels used for both axes, expected in the same
      order as the rows/columns of the matrix.
  """
  # create a dedicated figure/axes pair so that repeated calls never draw
  # on top of a plot left over from an earlier call
  _, ax = plt.subplots()

  # bind the heatmap to ax explicitly instead of relying on the current axes
  sns.heatmap(confusion_matrix, annot=True, fmt='', cmap='Blues', ax=ax)
  ax.set_xlabel('Predicted')
  ax.set_ylabel('Actual')
  ax.xaxis.set_ticklabels(class_labels)
  ax.yaxis.set_ticklabels(class_labels)

In [None]:
# pin the row/column order to model.classes_ so the matrix always lines up
# with the tick labels, even if a class is missing from Y_test or Y_pred
cf_matrix = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
plot_confusion_matrix(cf_matrix, list(model.classes_))

In [None]:
# pin the row/column order to baseline.classes_ so the matrix always lines up
# with the tick labels, even if a class is missing from Y_test or the predictions
cf_matrix_baseline = confusion_matrix(Y_test, Y_pred_baseline, labels=baseline.classes_)
plot_confusion_matrix(cf_matrix_baseline, list(baseline.classes_))

## Open questions
*   Have we lost too much training data with dropna()? What can we do about it?
*   We were forced to remove all non-numeric columns; how can we avoid this?
*   Does the model present acceptable predictive accuracy for both classes?
*   Try the "uniform" strategy of DummyClassifier and interpret the result
*   How can we find the best values for the hyper-parameters of the model?
> Try experimenting with C = 0.5 and class_weight = {'Y':0.22, 'N':0.78} (in 3.3 Train the model).



