# Credit Risk Classification

## Imports and Dependencies

In [1]:
# Imports the required libraries and dependencies
import numpy as np
from pathlib import Path                                    # Enables us to specify a file's system path   
import pandas as pd                                         # Enables us to work with DataFrames          
from sklearn.model_selection import train_test_split        # Enables us to split the data into training and test sets
from sklearn.linear_model import LogisticRegression         # Enables the creation of a logistic regression model
from imblearn.over_sampling import RandomOverSampler        # Enables each of the classes to have an equal amount of data points
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report #Evaluation methods               

## Functions

In [2]:
def fit(X_train, y_train, model=None):
    """Fit a classifier on the given training features and labels.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``model.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``model.fit``.
    model : estimator, optional
        Any object exposing ``fit(X, y)``. When omitted, the notebook-level
        ``classifier`` is used, so existing two-argument calls keep working.

    Returns
    -------
    Whatever ``model.fit`` returns.
    """
    # The fallback is looked up at call time, so this function can be defined
    # before ``classifier`` is instantiated further down the notebook.
    estimator = classifier if model is None else model
    return estimator.fit(X_train, y_train)

def predict(X=None, model=None):
    """Return the classifier's predictions for a set of features.

    Parameters
    ----------
    X : array-like, optional
        Features to predict on. When omitted, the notebook-level ``X_test``
        is used, so existing zero-argument calls keep working.
    model : estimator, optional
        Any object exposing ``predict(X)``. When omitted, the notebook-level
        ``classifier`` is used.

    Returns
    -------
    Whatever ``model.predict`` returns for the chosen features.
    """
    # Compare against None explicitly: truth-testing a DataFrame raises.
    estimator = classifier if model is None else model
    features = X_test if X is None else X
    return estimator.predict(features)


def score(training, X_train, y_train):
    """Print the classifier's score on the given training data and on the test data.

    Parameters
    ----------
    training : object
        Accepted for the caller's benefit but not read by this function.
    X_train : array-like
        Training features to score the notebook-level ``classifier`` against.
    y_train : array-like
        Training labels matching ``X_train``.

    Returns
    -------
    None. The scores and a closing remark are written to standard output.
    """
    # Score the data the caller passed in. The previous body always scored the
    # globals X_train_original / y_train_original, whatever was passed.
    print(f"Training Data Score: {classifier.score(X_train, y_train)}")
    # The test split is always the notebook-level X_test / y_test.
    print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

    # NOTE: this remark is printed unconditionally; the two scores are not compared here.
    print("\nThe above helps show that the model is not overfitted as the training and testing scores are within close proximity.")

---

## Loads Lending Data

In [3]:
# Points at lending_data.csv inside the Resources folder and loads it into a Pandas DataFrame
lending_data = Path('Resources/lending_data.csv')
lending_df = pd.read_csv(filepath_or_buffer=lending_data)

# Displays the first five rows of the lending DataFrame as a quick look at the columns
lending_df.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


## Labels and Features Allocation and Exploration

In [4]:
# Separates the lending DataFrame into features (X) and labels (y):

# Features (X): every column of the lending DataFrame except loan_status
X = lending_df.drop(columns=["loan_status"])

# Labels (y): the loan_status column on its own
y = lending_df.loc[:, "loan_status"]

In [5]:
# Reviews the labels (y) Series, i.e. the loan_status column taken from the lending DataFrame
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64


In [6]:
# Reviews the first five records of the features (X) DataFrame, i.e. the lending DataFrame without loan_status
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [7]:
# Checks the balance of our target values by counting the rows under each loan_status label
print(y.value_counts())


# Narrates the counts for the reader. The figures are typed into the string by hand,
# so they must be updated manually if the underlying data changes.
print("\nThe above shows that within the labels, 75,036 are classed as a 'healthy loan' and 2,500 data points are classed as a 'high-risk loan'. This indicates a inbalance between the labels being used.")

0    75036
1     2500
Name: loan_status, dtype: int64

The above shows that within the labels, 75,036 are classed as a 'healthy loan' and 2,500 data points are classed as a 'high-risk loan'. This indicates a inbalance between the labels being used.


## Splits the Data into Training and Testing Sets

In [8]:
# Splits the features and labels into training and testing sets using train_test_split
#   random_state=1 -> makes the 'randomness' reproducible so runs can be compared
#   stratify=y     -> keeps the class (0, 1) proportions the same in both sets
X_train_original, X_test, y_train_original, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [9]:
# Outputs the shape of each training and testing set (features first, then labels)
print(f"X_train_original shape: {X_train_original.shape}, X_test shape: {X_test.shape}")
print(f"y_train_original shape: {y_train_original.shape}, y_test shape: {y_test.shape}")

# Narrates the shapes for the reader; this remark is a fixed string and is not derived from the shapes above
print("\nThe above shows that size of each training and test set; the training set contains most of the datapoints.")

X_train_original shape: (58152, 7), X_test shape: (19384, 7)
y_train_original shape: (58152,), y_test shape: (19384,)

The above shows that size of each training and test set; the training set contains most of the datapoints.


---

## Creates a Logistic Regression Model with the Original Data

### Model and Fit

In [10]:
# Instantiates the Logistic Regression model (lbfgs solver, up to 200 iterations, fixed seed)
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fits the model on the original (unbalanced) training features and labels via the fit() helper
fit(X_train=X_train_original, y_train=y_train_original)

### Training and Test Scores 

In [11]:
# Scores the model using the train and test data.
# Reuses the score() helper from the Functions section instead of repeating its
# three print statements inline; the printed output is the same. The first
# argument fills the helper's `training` parameter.
score("Training", X_train_original, y_train_original)

Training Data Score: 0.9914878250103177
Testing Data Score: 0.9924164259182832

The above helps show that the model is not overfitted as the training and testing scores are within close proximity.


### Predict

In [12]:
# Predicts the loan statuses of the testing data
original_predictions = predict()

# Tabulates the predictions next to the actual loan statuses in a results DataFrame
# (the index is reset so the rows are numbered from 0 rather than by their original row label)
results = pd.DataFrame(
    {"Original Prediction": original_predictions, "Actual": y_test}
).reset_index(drop=True)
results.head(n=10)

Unnamed: 0,Original Prediction,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


### Model Performance Evaluation

In [13]:
# Computes the balanced_accuracy score of the model's test predictions; as the cell's last expression it is displayed as the cell result
balanced_accuracy_score(y_test, original_predictions)

0.9442676901753825

In [14]:
# Generates a confusion matrix comparing the actual test labels with the original model's predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=original_predictions)
print(test_matrix)

[[18679    80]
 [   67   558]]


In [15]:
# Builds the classification report for the original model's test predictions, then prints it
testing_report = classification_report(y_true=y_test, y_pred=original_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.89      0.88       625

    accuracy                           0.99     19384
   macro avg       0.94      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



<br>
The model created can classify the status of an unseen loan into `0` (healthy loan) and `1` (high-risk loan) based on the model created from the labeled dataset it was provided with. However, how accurate is this model?
<br>

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 
<br>
<br>
<b>Balanced accuracy score:</b> 
<br>The balanced accuracy score obtained was 0.944 (3 s.f.), which is a positive indication for our model. Balanced accuracy is the average of the recall obtained on each class, so this score shows that the model was not only accurate at predicting the dominant class, `0` (healthy loan), but was also reasonably accurate at predicting the minority class, `1` (high-risk loan). The score falls short of 1 mainly because of the minority class: the confusion matrix shows that 67 of the 625 high-risk loans (about 11%) were missed, while 80 of the 18,759 healthy loans were wrongly flagged as high risk. Overall, 147 of the 19,384 test loans (under 1%) were misclassified. The balanced accuracy score does not by itself explore the impact of these false positives and false negatives, so further evaluation is required, but it is a promising outcome.


<br>
<b>Precision and Recall:</b> 
<br> 
Precision is the ratio of correctly predicted positive observations to the total predicted positive observations for that class. The classification report has returned a precision of 1.00 for healthy loans (rounded to two decimal places), which indicates that nearly all of the loans predicted as healthy were in fact healthy for the test data it was provided with. However, the report also returned a precision of 0.87 for high-risk loans; this is still fairly high, but it means that around 13% of the loans the model flagged as high risk (80 of 638) were actually healthy. This indicates that the model could be piloted but it would need to undergo further training to increase the precision of classifying high-risk loans.
<br>

<br> 
Recall is the ratio of correctly predicted positive observations to all actual positive observations for that class. The classification report has returned a recall of 1.00 for healthy loans (rounded to two decimal places), which indicates that the model classified nearly all of the actual healthy loans correctly. The report also indicates that the high-risk loans have a recall of 0.89: the model identified 558 of the 625 actual high-risk loans and missed the remaining 67 (false negatives), so it is fairly good, though not perfect, at keeping false negatives minimal.
<br>

<br>
Finally, precision and recall are combined in the f1-score (their harmonic mean); this score helps evaluate the model's ability to keep false positives and false negatives minimal. Across both classes, the weighted-average f1-score was 0.99, which suggests the model does an excellent job of predicting loan statuses overall. However, that average is dominated by the much larger healthy-loan class; the high-risk loans on their own obtained an f1-score of 0.88, which could be improved so that fewer high-risk loans are affected by false positives and false negatives.

---

## Creates a Logistic Regression Model with Resampled Training Data

### Resampling Training Data

In [16]:
# Displays the value counts of the original training labels (y_train_original)
y_train_original.value_counts()

0    56277
1     1875
Name: loan_status, dtype: int64

As identified above, the value counts of each class (0, 1) in the original training labels (`y_train_original`) are imbalanced (56277 : 1875). Therefore, the classes in the training data will be balanced to view the impact of this on the classifier model.

In [17]:
# Instantiates the random oversampler with a fixed seed
random_oversampler = RandomOverSampler(random_state=1)

# Resamples the original training data; the test data is left untouched
X_train_resampled, y_train_resampled = random_oversampler.fit_resample(
    X_train_original,
    y_train_original,
)

In [18]:
# Counts how many rows fall under each class in the resampled training labels
y_train_resampled.value_counts()

0    56277
1    56277
Name: loan_status, dtype: int64

### Fit

In [19]:
# Fits the classifier model using the resampled training data's features and labels.
# NOTE: fit() calls classifier.fit on the same `classifier` object that was fitted
# on the original data earlier, so from this cell onwards predict() and
# classifier.score() reflect the resampled fit.
fit(X_train_resampled, y_train_resampled)

### Training and Test Scores 

In [20]:
# Scores the refitted model on the resampled training data and on the (unchanged) test data
print(f"Resampled Training Data Score: {classifier.score(X_train_resampled, y_train_resampled)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

# Narrates the scores for the reader; this remark is a fixed string and is not derived from the scores above
print("\nThe above helps show that the model is not overfitted as the resampled training score and testing score are within close proximity.")

Resampled Training Data Score: 0.994180571103648
Testing Data Score: 0.9952022286421791

The above helps show that the model is not overfitted as the resampled training score and testing score are within close proximity.


### Predict

In [21]:
# Re-predicts the loan statuses of the testing data with the newly fitted classifier
resampled_predictions = predict()

# Tabulates the resampled predictions next to the actual loan statuses in a resampled results DataFrame
# (the index is reset so the rows are numbered from 0 rather than by their original row label)
resampled_results = pd.DataFrame(
    {"Resampled_Prediction": resampled_predictions, "Actual": y_test}
).reset_index(drop=True)
resampled_results.head(n=10)

Unnamed: 0,Resampled_Prediction,Actual
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


### Model Performance Evaluation

In [22]:
# Computes the balanced_accuracy score of the resampled model's test predictions; as the cell's last expression it is displayed as the cell result
balanced_accuracy_score(y_test, resampled_predictions)

0.9959744975744975

In [23]:
# Generates a confusion matrix comparing the actual test labels with the resampled model's predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=resampled_predictions)
print(test_matrix)

[[18668    91]
 [    2   623]]


In [24]:
# Builds the classification report for the resampled model's test predictions, then prints it
testing_report = classification_report(y_true=y_test, y_pred=resampled_predictions)
print(testing_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      1.00      0.93       625

    accuracy                           1.00     19384
   macro avg       0.94      1.00      0.96     19384
weighted avg       1.00      1.00      1.00     19384



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

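**Answer:** The logistic regression model fit with oversampled data predicts both labels very well and improves on the model fit with the original data. The balanced accuracy score rose from 0.944 to 0.996. For `1` (high-risk loan), recall increased from 0.89 to 1.00 (623 of the 625 high-risk loans in the test set were identified, with only 2 missed), while precision stayed at 0.87 because 91 healthy loans were flagged as high risk; the f1-score rose from 0.88 to 0.93. For `0` (healthy loan), precision and recall both remain at 1.00 when rounded to two decimal places. In short, oversampling almost eliminated false negatives at the cost of slightly more false positives (91 compared with 80).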