# Credit Risk Classification

## Imports and Dependencies

In [1]:
# Imports the required libraries and dependencies
from pathlib import Path                                    # Enables us to specify a file's system path   
import pandas as pd                                         # Enables us to work with DataFrames          
from sklearn.model_selection import train_test_split        # Enables us to split the data into training and testing sets
from sklearn.linear_model import LogisticRegression         # Enables the creation of a logistic regression model
from imblearn.over_sampling import RandomOverSampler        # Enables each of the classes to have an equal amount of data points
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report #Evaluation methods               

## Functions

In [2]:
# The below functions help to not repeat code and will make this script more maintainable:

#    [*1.] Scores the logistic regression model based on the input: features and target values
def score(x, y):
    """Score the shared logistic regression model on the given features and labels.

    Relies on the notebook-level ``classifier`` object, which must already be
    instantiated (and fitted) before this helper is called.

    Args:
        x: Feature values to evaluate the model on.
        y: Known labels corresponding to ``x``.

    Returns:
        The value returned by ``classifier.score(x, y)``; for scikit-learn
        classifiers this is the mean accuracy.
    """
    model_score = classifier.score(x, y)
    return model_score

#    [*2.] Fits the logistic regression model using the training set
def fit(x_train, y_train):
    """Fit the shared logistic regression model on a training set.

    Relies on the notebook-level ``classifier`` object and fits it in place,
    so every later ``score``/``predict`` call uses the most recent fit.

    Args:
        x_train: Training feature values.
        y_train: Training labels corresponding to ``x_train``.

    Returns:
        The value returned by ``classifier.fit(x_train, y_train)``.
    """
    fitted_model = classifier.fit(x_train, y_train)
    return fitted_model

#    [*3.] The logistic regression model produces loan status predictions for the input features
def predict(x=None):
    """Predict loan statuses with the shared logistic regression model.

    Relies on the notebook-level ``classifier`` object, which must already be
    fitted before this helper is called.

    Args:
        x: Feature values to predict on. When omitted (``None``), the
            notebook-level ``x_test`` is used, so existing zero-argument
            calls behave exactly as before.

    Returns:
        The value returned by ``classifier.predict`` for the selected features.
    """
    # x_test is looked up at call time (not as a default argument value) because
    # it is only created by a later cell than the one defining this helper.
    features = x_test if x is None else x
    return classifier.predict(features)

#    [*4.] Creates a confusion matrix DataFrame
def cm_df(cm):
    """Wrap a 2x2 confusion matrix in a labelled DataFrame.

    Args:
        cm: Two-by-two confusion matrix values, with rows as actual classes
            and columns as predicted classes.

    Returns:
        A DataFrame with rows labelled "Actual 0"/"Actual 1" and columns
        labelled "Predicted 0"/"Predicted 1".
    """
    row_labels = ["Actual 0", "Actual 1"]
    column_labels = ["Predicted 0", "Predicted 1"]
    return pd.DataFrame(cm, index=row_labels, columns=column_labels)

# The function reference contained in the square brackets will be mentioned upon any use of the function within the script.

---

## Loads Lending Data

In [3]:
# Builds the relative path to lending_data.csv in the Resources folder and reads the file into a Pandas DataFrame
lending_data = Path('Resources/lending_data.csv')
lending_df = pd.read_csv(lending_data)

# Displays the first five records of the lending DataFrame as a quick check that the file loaded as expected
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


## Labels and Features Allocation and Exploration

In [4]:
# Creates labels and features from the lending DataFrame:

# Sets the labels (y) to the values contained within the loan_status column in the lending DataFrame
y = lending_df["loan_status"]

# Sets the features (x) to every remaining column, i.e. the lending DataFrame with the loan_status column dropped
# (drop is not called in place, so lending_df itself keeps its loan_status column)
x = lending_df.drop(columns="loan_status")

In [5]:
# Prints the labels (y) Series, which contains the known loan status for every record
print(y)

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64


In [6]:
# Displays the first five records of the features (x) DataFrame to confirm loan_status is no longer a column
x.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [7]:
# Counts how many records fall into each loan status class to check whether the classes are balanced
print(y.value_counts())

0    75036
1     2500
Name: loan_status, dtype: int64


The above shows that 75,036 labels are classed as a 'healthy loan' and 2,500 labels are classed as a 'high-risk loan'. This indicates an imbalance between the classes: there are roughly 30 healthy loans for every high-risk loan.

## Splits the Data into Training and Testing Sets

In [8]:
# Splits the features and labels into training and testing sets using train_test_split
# (no test_size is passed, so scikit-learn's default split proportion is used)
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    # Fixes the random seed so the split is reproducible for comparison
                                                    random_state=1, 
                                                    # Stratify maintains the class (0, 1) proportions in both splits
                                                    stratify=y)    

In [9]:
# Outputs the shape of each training and testing set to confirm the features and labels have matching row counts
print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")

x_train shape: (58152, 7), y_train shape: (58152,)
x_test shape: (19384, 7), y_test shape: (19384,)


The above shows the size of each training and testing set; the training set contains most of the datapoints.

---

## Creating a Logistic Regression Model 

In [10]:
# Instantiates the Logistic Regression model that the score, fit and predict helper functions operate on
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                # Same fixed seed as the train/test split and the oversampler, for reproducibility
                                random_state=1)

---

## Logistic Regression Model with the Original Data

### Fit

In [11]:
# Fits the logistic regression model using the original (imbalanced) training data's features and labels
fit(x_train, y_train) #[*2.]

### Training and Testing Scores 

In [12]:
# Scores the logistic regression model on the training and testing data;
# comparing the two scores gives a quick indication of whether the model is overfitted
print(f"Training Data Score: {score(x_train, y_train)}") #[*1.]
print(f"Testing Data Score: {score(x_test, y_test)}")    #[*1.]

Training Data Score: 0.9914878250103177
Testing Data Score: 0.9924164259182832


The above helps show that the model is not overfitted as the training and testing scores are within close proximity.

### Predict

In [13]:
# Predicts the loan statuses of the testing data using the logistic regression model
original_predictions = predict() #[*3.]

# Places the predictions alongside the actual loan statuses in a results DataFrame;
# reset_index(drop=True) discards the existing index in favour of a fresh 0-based one
original_results = pd.DataFrame({"Original Prediction": original_predictions, "Actual": y_test}).reset_index(drop=True)
original_results.head(10)

Unnamed: 0,Original Prediction,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


### Model Performance Evaluation

In [14]:
# Displays the balanced accuracy score (the average of the per-class recalls) of the logistic regression model
balanced_accuracy_score(y_test, original_predictions)

0.9442676901753825

In [15]:
# Generates a confusion matrix (rows = actual class, columns = predicted class) for the logistic regression model
cm_original = confusion_matrix(y_test, original_predictions)

# Wraps the confusion matrix in a labelled DataFrame for display
cm_original_df = cm_df(cm_original) #[*4.]

cm_original_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18679,80
Actual 1,67,558


In [16]:
# Builds and prints the classification report (precision, recall, f1-score and support per class)
# for the logistic regression model
cr_original = classification_report(y_test, original_predictions)
print(cr_original)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      0.89      0.88       625

    accuracy                           0.99     19384
   macro avg       0.94      0.94      0.94     19384
weighted avg       0.99      0.99      0.99     19384



<br>
The fitted model can classify the status of an unseen loan as `0` (healthy loan) or `1` (high-risk loan), based on the patterns it learned from the labeled dataset it was provided with.
<br>

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 
The logistic regression model is quite accurate; however, it misclassifies 0.76% (2 s.f.) of all the loans in the unseen lending test data it was provided with. 11% (2 s.f.) of the actual high-risk loans were classified as healthy by the model, and 0.43% (2 s.f.) of the actual healthy loans were classified as high-risk. Overall, the model performed very well at predicting `0` (healthy loans) and was also quite good at predicting `1` (high-risk loans), but given the context of the classification (a missed high-risk loan is costly), further tweaking is required.

---

## Logistic Regression Model with Resampled Training Data

### Resampling Training Data

In [17]:
# Displays the value counts of the training labels (y_train) before resampling
y_train.value_counts()

0    56277
1     1875
Name: loan_status, dtype: int64

As identified above, the classes are imbalanced: the value counts of each class (0 - healthy, 1 - high-risk) in the training labels (y_train) are 56277 : 1875. Therefore, the classes in the training data will be balanced to view the impact of this on the classifier model.

In [18]:
# Instantiates the random oversampler with a fixed seed so the resampling is reproducible
random_oversampler = RandomOverSampler(random_state=1)

# Resamples the training data only; x_test and y_test are not passed in,
# so the model is still evaluated on the original, un-resampled testing data
x_train_resampled, y_train_resampled = random_oversampler.fit_resample(x_train, y_train)

In [19]:
# Counts the distinct values of the resampled training labels to confirm the classes are now balanced
y_train_resampled.value_counts()

0    56277
1    56277
Name: loan_status, dtype: int64

The classes are now balanced.

### Fit

In [20]:
# Fits the logistic regression model using the resampled training data's features and labels.
# NOTE: this re-fits the same shared `classifier` object that was fitted on the original data,
# so re-running the earlier scoring/prediction cells after this one would use the resampled model.
fit(x_train_resampled, y_train_resampled) #[*2.]

### Training and Test Scores 

In [21]:
# Scores the logistic regression model using the resampled training data and the (un-resampled) testing data;
# comparing the two scores gives a quick indication of whether the model is overfitted
print(f"Resampled Training Data Score: {score(x_train_resampled, y_train_resampled)}") #[*1.]
print(f"Testing Data Score: {score(x_test, y_test)}") #[*1.]

Resampled Training Data Score: 0.994180571103648
Testing Data Score: 0.9952022286421791


The above helps show that the model is not overfitted as the resampled training score and testing score are within close proximity.

### Predict

In [22]:
# Predicts the loan statuses of the testing data with the newly fitted logistic regression model
resampled_predictions = predict() #[*3.]

# Places the resampled predictions alongside the actual loan statuses in a resampled results DataFrame;
# reset_index(drop=True) discards the existing index in favour of a fresh 0-based one
resampled_results = pd.DataFrame({"Resampled_Prediction": resampled_predictions, "Actual": y_test}).reset_index(drop=True)
resampled_results.head(10)

Unnamed: 0,Resampled_Prediction,Actual
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


### Model Performance Evaluation

In [23]:
# Displays the balanced accuracy score (the average of the per-class recalls) of the newly fitted logistic regression model
balanced_accuracy_score(y_test, resampled_predictions)

0.9959744975744975

In [24]:
# Generates a confusion matrix (rows = actual class, columns = predicted class) for the newly fitted logistic regression model
cm_resampled = confusion_matrix(y_test, resampled_predictions)

# Wraps the confusion matrix in a labelled DataFrame for display
cm_resampled_df = cm_df(cm_resampled) #[*4.]

cm_resampled_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18668,91
Actual 1,2,623


In [25]:
# Builds and prints the classification report (precision, recall, f1-score and support per class)
# for the newly fitted logistic regression model
cr_resampled = classification_report(y_test, resampled_predictions)
print(cr_resampled)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18759
           1       0.87      1.00      0.93       625

    accuracy                           1.00     19384
   macro avg       0.94      1.00      0.96     19384
weighted avg       1.00      1.00      1.00     19384



**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 
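The logistic regression model fitted with oversampled data is very accurate; it misclassifies only 0.48% (2 s.f.) of all the loans in the unseen lending test data it was provided with. 0.49% (2 s.f.) of the actual healthy loans (91 of 18,759) were classified as high-risk by the model, and only 0.32% (2 s.f.) of the actual high-risk loans (2 of 625) were classified as healthy, compared with 11% for the model fitted on the original data. Overall, the model performed very well at predicting `0` (healthy loans), and it now identifies almost all `1` (high-risk loans), although its precision for that class remains 0.87, so some healthy loans are still flagged as high-risk. Due to the context of the classification, further tweaking may still be required.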