# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV into DataFrame

In [3]:
# Load the data
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


* We have two columns in this source DataFrame are not numeric. Homeowner and loan_status.
* We are definitely going to want to transform these values to numeric using Integer Encoding. 
* The positive event in this data is predicting high_risk for loans. low_risk should be 0, high_risk should be 1. 

# Split the Data into Training and Testing

In [4]:
# Create our features
# Our features columns are everything EXCEPT the loan_status column

X = df.copy()                                                                         # make a copy of df
X.drop(columns='loan_status', inplace=True)                                           # simply drop the loan_status column, since that is the y-target we're looking for in the model. 

# Create our target
# In this case our taget is the 'loan status' column 
y = df['loan_status'].values.reshape(-1, 1)                                            # We include .values.reshape (-1, 1) to put the data into 1 column for our model libraries to scale. 

In [5]:
X.describe()     # we should see every column except loan_status. The homeowner column is not included because it's not numeric. 

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


### JAK Label Encoding Features (Homeowner) & Target (Loan Status)

**STUDENT COMMENT:**

I am inserting this section for integer encoding. It is going to be necessary to use integer encoding to complete this assignment to be fit our datasets into our scalers and run reports and scoring. The source data has non-numeric columns, so we know we need to use encoding to convert string values into distinct classes that are binary and or numeric. 

The homework file provided did not include a section to do the encoding, 


Not a complaint, but this should be in here.

In [12]:
# X feature encoding using lambda fucntion - homeowner
# Check the homeowner values. We have 3 classes: mortgage, own, and rent
X['homeowner'].value_counts()

mortgage    38572
own         30930
rent         8034
Name: homeowner, dtype: int64

In [13]:
# Encode the Homeowners column.

# Create a dict for homeowner values. We will not make an inline for-loop since there's only 3 calss values.

cust_class = {
    'mortgage' : 1,
    'own' : 2,
    'rent' : 3
}

# Using lambda function encoding.
X['homeowner'] = X['homeowner'].apply(lambda x: cust_class[x])       # lamda should re-value the string for homeowner column as a category number.
X.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,2,52800,0.431818,5,1,22800
1,8400.0,6.692,2,43600,0.311927,3,0,13600
2,9000.0,6.963,3,46100,0.349241,3,0,16100
3,10700.0,7.664,2,52700,0.43074,5,1,22700
4,10800.0,7.698,1,53000,0.433962,5,1,23000


In [6]:
# y features are formatted to be 1 column in the array. But they're not in a numeric category to be useful for predicting. 
y

array([['low_risk'],
       ['low_risk'],
       ['low_risk'],
       ...,
       ['high_risk'],
       ['high_risk'],
       ['high_risk']], dtype=object)

In [14]:
# Import Label Encoder
from sklearn.preprocessing import LabelEncoder

In [10]:
# Categorize our loan_status values to be numeric 0, 1 using Integer Encoding.
# Create Label Encoder object
label_encoder = LabelEncoder()

# Fit the loan_status to label encoder, because we want to have numeric category values for 
label_encoder.fit(df['loan_status'])

# Transform the values of loan_status from words to integer numbers. Low_risk should be 0, high_risk should be 1
df['loan_status'] = label_encoder.transform(df['loan_status'])

In [9]:
# This should be displaying in 0's and 1's. It's still displaying in string values?
y

array([['low_risk'],
       ['low_risk'],
       ['low_risk'],
       ...,
       ['high_risk'],
       ['high_risk'],
       ['high_risk']], dtype=object)

In [8]:
# Check the balance of our target values
y['loan_status'].value_counts()                                 # Code fails??? Don't know what this error is about, but this should not influence on our models running.  
                                                                # Our target y in previous cell is already formatted and shaped correctly. 

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [16]:
# Binary encoding using Pandas (single column)

# loan status ls binary encoded.
ls_binary_encoded = pd.get_dummies(
    df, 
    columns=['loan_status'])
ls_binary_encoded.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status_0,loan_status_1
0,10700.0,7.672,own,52800,0.431818,5,1,22800,0,1
1,8400.0,6.692,own,43600,0.311927,3,0,13600,0,1
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,0,1
3,10700.0,7.664,own,52700,0.43074,5,1,22700,0,1
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,0,1


In [10]:
# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Standard Scaler helps to get standardized distribution, with a zero mean and standard deviation of one (unit variance). It works only on numeric values for obvious reasons, Convert the string/text columns to numerical values using one of the encoding techniques (i.e. hot/lget_dummies or label encoding) before using the standard scaler. Hope this helps.

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

In [13]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
scaler = StandardScaler()

In [14]:
type(scaler)

sklearn.preprocessing._data.StandardScaler

In [15]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE
X_scaler = scaler.fit(X_train)

ValueError: could not convert string to float: 'own'

In [27]:
# Scale the training and testing data
# YOUR CODE HERE
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

NameError: name 'X_scaler' is not defined

# Simple Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [None]:
# Resample the training data with the RandomOversampler
# YOUR CODE HERE

from imblearn.over_sampling import RandomOverSampler                     # import the RandomOverSampler from imblearn 

ros = RandomOverSampler(random_state=1)                                  # create random oversampling (ros) seed so we can resample our training data.

X_resampled, y_resampled = ros.fit_resample(X_train, y_train)            # resample the training data by fitting X_train and y_train into our oversampling seed.

# View the count of target classes with Counter
# YOUR CODE HERE
Counter(y_resampled)                                                     # this should display an equal number of datapoints for both classes, by increasing 

In [None]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model_ros = LogisticRegression(solver='lbfgs', random_state=1)            # define a new model variable for our random oversampling 
model_ros.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred_ros = model_ros.predict(X_test)                                    # make new predictions based on random oversampling 
balanced_accuracy_score(y_test, y_pred_ros)                               # generate out 

In [None]:
# Display the confusion matrix
# YOUR CODE HERE
confusion_matrix(y_test, y_pred_ros)                                       # making sure to pass y-predictions on the random oversampling model

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred_ros))                # report based on model_ros

### SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
# YOUR CODE HERE
from imblearn.over_sampling import SMOTE

X_resampled_smte, y_resampled_smte =SMOTE(
    random_state=1,                                # pass random_state
    sampling_strategy=1.0                          # sampling_strategy=1.0, is # you pass that corresponds to number of samples you'd like. pass 1.0 to balance evenly between gold and purple
).fit_resample(X_train, y_train)

# View the count of target classes with Counter
# YOUR CODE HERE
Counter(y_resampled_smte)

In [None]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model_smte = LogisticRegression(solver='lbfgs', random_state=1)
model_smte.fit(X_resampled_smte, y_resampled_smte)

In [None]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred_smte = model_smte.predict(X_test)
balanced_accuracy_score(y_test, y_pred_smte)

In [None]:
# Display the confusion matrix
# YOUR CODE HERE
confusion_matrix(y_test, y_pred_smte)

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred_smte))

# Undersampling

In this section, you will test an undersampling algorithm to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the data using the ClusterCentroids resampler
# YOUR CODE HERE
from imblearn.under_sampling import RandomUnderSampler                           # Import random under sampler

rus = RandomUnderSampler(random_state=1)

X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)

# View the count of target classes with Counter
# YOUR CODE HERE
Counter(y_resampled_rus)

In [None]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model_rus = LogisticRegression(solver='lbfgs', random_state=1)                    # define model based on random undersampling method
model_rus.fit(X_resampled_rus, y_resampled_rus)                                   # fit model with resampled training data from RandomUnderSampler

In [None]:
# Calculate the balanced accuracy score
# YOUR CODE HERE
y_pred_rus = model_smte.predict(X_test)                                            # make predictions based on random undersampling 
balanced_accuracy_score(y_test, y_pred_rus)

In [None]:
# Display the confusion matrix
# YOUR CODE HERE
confusion_matrix(y_test, y_pred_rus)

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred_rus))

# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Display the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the training data with SMOTEENN
# YOUR CODE HERE
from imblearn.combine import SMOTEENN                                                        # Import SMOTEENN

smote_enn = SMOTEENN(random_state=1)                                                         # Build the SMOTEENN object
X_resampled_smtn, y_resampled_smtn = smote_enn.fit_resample(X_train, y_train)                # resample the training data with the SMOTEENN objecy

# View the count of target classes with Counter
# YOUR CODE HERE
Counter(y_resampled_smtn)

In [None]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model_smtn = LogisticRegression(solver='lbfgs', random_state=1)
model_smtn.fit(X_resampled_smtn, y_resampled_smtn)

In [None]:
# Calculate the balanced accuracy score
# YOUR CODE HERE
y_pred_smtn = model_smte.predict(X_test)                                                     # predictions based on SMOTEENN model
balanced_accuracy_score(y_test, y_pred_smtn)                                                  

In [None]:
# Display the confusion matrix
# YOUR CODE HERE
confusion_matrix(y_test, y_pred_smtn)

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred_smtn))

# Final Questions

1. Which model had the best balanced accuracy score?

   YOUR ANSWER HERE.

2. Which model had the best recall score?

    YOUR ANSWER HERE.

3. Which model had the best geometric mean score?

    YOUR ANSWER HERE.
