# Supervised Learning
```{admonition} Revised
31 May 2023
```

---

## Programming Environment

In [1]:
import numpy             as np
np.set_printoptions(suppress=True, formatter={'float_kind' : '{:0.8f}'.format})
import pandas            as pd
pd.set_option('display.float_format', lambda x: f'{x:0.8f}')
import matplotlib        as mpl
import matplotlib.pyplot as plt

import sklearn
from   sklearn.linear_model    import LogisticRegression
from   sklearn.metrics         import balanced_accuracy_score, confusion_matrix
from   sklearn.model_selection import train_test_split
import imblearn
from   imblearn.metrics        import classification_report_imbalanced
from   imblearn.over_sampling  import RandomOverSampler

from   pathlib  import Path

import datetime
from   importlib.metadata import version
import os
import platform as p
import sys

from importlib.metadata import PackageNotFoundError  # companion to `version`, imported above

pad = 20  # width of the left-aligned label column in the report below


def package_version(name):
    """Return the installed version string of distribution `name`.

    Returns the literal 'not installed' when the distribution cannot be found,
    so one missing package does not abort the whole environment report.
    """
    try:
        return version(name)
    except PackageNotFoundError:
        return 'not installed'


# CONDA_DEFAULT_ENV is absent from the environment unless something has set it;
# `.get` returns None in that case so the `or` fallback (third-from-last component
# of the interpreter path) is actually reached instead of raising KeyError.
conda_env = os.environ.get('CONDA_DEFAULT_ENV') or sys.executable.split('/')[-3]

# Execution timestamp (local time zone), platform string, environment name and interpreter.
print(  f"\n{'Executed' : <{pad}} : {datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %z %Z')}"
        f"\n{'Platform' : <{pad}} : {p.platform(aliased = False, terse = False)}"
        f"\n{'Conda'    : <{pad}} : {conda_env}"
        f"\n{'Python'   : <{pad}} : {p.python_implementation()} {p.python_version()} {sys.executable}")
# One line per package, in the same label/value layout as above.
print(*[f'{name : <{pad}} : {package_version(name)}'
        for name in ['hvPlot', 'Imbalanced-Learn', 'Matplotlib', 'NumPy', 'Pandas', 'Scikit-Learn']], sep = '\n')


Executed             : 2023-09-04 17:36:14 -0400 EDT
Platform             : macOS-13.5.1-arm64-arm-64bit
Conda                : ml
Python               : CPython 3.11.5 /Users/df/anaconda3/envs/ml/bin/python
hvPlot               : 0.8.4
Imbalanced-Learn     : 0.11.0
Matplotlib           : 3.7.2
NumPy                : 1.23.5
Pandas               : 2.1.0
Scikit-Learn         : 1.3.0


---

In [3]:
# Read the lending dataset from the notebook-relative data directory, then
# preview its first five rows as the cell's displayed result.
lending_data_df = pd.read_csv('data/lending_data.csv')
lending_data_df.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.43181818,5,1,22800,0
1,8400.0,6.692,43600,0.31192661,3,0,13600,0
2,9000.0,6.963,46100,0.34924078,3,0,16100,0
3,10700.0,7.664,52700,0.43074004,5,1,22700,0
4,10800.0,7.698,53000,0.43396226,5,1,23000,0


In [23]:
def print_classification_metrics(y_true, y_pred):
    """Print the three evaluation summaries used for every model in this cell.

    Emits, each preceded by a blank line: the balanced accuracy score, the
    confusion matrix, and imbalanced-learn's classification report.

    Parameters
    ----------
    y_true : true labels of the evaluation set.
    y_pred : labels predicted by the model for the same rows.
    """
    print()
    print(balanced_accuracy_score(y_true=y_true, y_pred=y_pred))
    print()
    print(confusion_matrix(y_true=y_true, y_pred=y_pred))
    print()
    print(classification_report_imbalanced(y_true=y_true, y_pred=y_pred))


# Features are every column except the label; the label is `loan_status`.
X = lending_data_df.drop(columns=['loan_status'])
y = lending_data_df['loan_status']
print(y.value_counts())  # class counts before any resampling

# NOTE(review): the split is not stratified; given the class imbalance printed above,
# consider `stratify=y`. Left unchanged so the recorded outputs stay reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# --- Baseline: logistic regression fitted on the training split as-is ---
lr_model = LogisticRegression(random_state=1)
lr_model.fit(X=X_train, y=y_train)

y_pred = lr_model.predict(X=X_test)
print_classification_metrics(y_true=y_test, y_pred=y_pred)

# --- Oversampled: only the training split is resampled; the test split is untouched ---
ros_model = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros_model.fit_resample(X=X_train, y=y_train)
print(y_resampled.value_counts())  # class counts after oversampling

lro_model = LogisticRegression(random_state=1)
lro_model.fit(X=X_resampled, y=y_resampled)

# Evaluated on the same X_test / y_test as the baseline, so the two reports are comparable.
yo_pred = lro_model.predict(X=X_test)
print_classification_metrics(y_true=y_test, y_pred=yo_pred)

loan_status
0    75036
1     2500
Name: count, dtype: int64

0.9520479254722232

[[18663   102]
 [   56   563]]

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384

loan_status
0    56271
1    56271
Name: count, dtype: int64

0.9936781215845847

[[18649   116]
 [    4   615]]

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



---

## Terms

* [[W](https://en.wikipedia.org/wiki/Accuracy_and_precision)] Accuracy
* [[W](https://en.wikipedia.org/wiki/Statistical_classification)] Classification
* [[W](https://en.wikipedia.org/wiki/Confusion_matrix)] Confusion Matrix
* [[W](https://en.wikipedia.org/wiki/Cross-validation_(statistics))] Cross Validation
* [[W](https://en.wikipedia.org/wiki/Ground_truth)] Ground Truth
* [[W](https://en.wikipedia.org/wiki/Logistic_regression)] Logistic Regression
* [[W](https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis)] Oversampling
* [[W](https://en.wikipedia.org/wiki/Precision_and_recall)] Precision
* [[W](https://en.wikipedia.org/wiki/Precision_and_recall)] Recall
* [[W](https://en.wikipedia.org/wiki/Sensitivity_and_specificity)] Sensitivity
* [[W](https://en.wikipedia.org/wiki/Sensitivity_and_specificity)] Specificity
* [[W](https://en.wikipedia.org/wiki/Supervised_learning)] Supervised Learning
* [[W](https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets)] Test Set
* [[W](https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets)] Training Set
* [[W](https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis)] Undersampling
* [[W](https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets)] Validation Set

---