In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Read Resources/loan.csv into a DataFrame.
# The path is built from segments so it resolves on every OS; a raw
# "Resources\loan.csv" string is only a valid relative path on Windows.
# NOTE(review): the original run echoed a warning for this read_csv line
# (presumably a mixed-dtype warning) - confirm whether explicit dtypes are needed.
df = pd.read_csv(Path("Resources", "loan.csv"))
df.head()

  df = pd.read_csv(Path(r"Resources\loan.csv"))


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,,,,,0.0,0.0,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,,,,,0.0,0.0,,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,,,,,0.0,0.0,,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,0.0,0.0,,,,
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,,,,,0.0,0.0,,,,


In [3]:
# Remove columns that are excluded from the analysis.
columns_to_drop = [
    'id', 'member_id', 'emp_title', 'issue_d', 'pymnt_plan', 'url', 'desc',
    'zip_code', 'addr_state', 'earliest_cr_line', 'mths_since_last_delinq',
    'mths_since_last_record', 'grade', 'sub_grade',
]
# Every column from "initial_list_status" through "total_il_high_credit_limit"
# (label slice, both ends inclusive) is removed as well.
trailing_columns = df.loc[:, "initial_list_status":"total_il_high_credit_limit"].columns
df = df.drop(columns=trailing_columns).drop(columns=columns_to_drop)

In [4]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna(axis=0, how="any")

In [5]:
# Display the column labels that remain in df after the drops above.
df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'purpose', 'title', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc'],
      dtype='object')

In [6]:
# Cleaning data by typecasting percentage columns to decimals
def percent_to_fraction(col):
    """Convert a Series of percent strings (e.g. "10.65%") to fractions (0.1065).

    A column that is already numeric is returned unchanged, so re-running this
    cell does not divide the values by 100 a second time.
    NOTE(review): assumes a numeric dtype means the conversion already ran;
    confirm the raw CSV never delivers these columns as plain numbers.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col
    return col.replace('%', '', regex=True).astype(float) / 100


df['int_rate'] = percent_to_fraction(df['int_rate'])
df['revol_util'] = percent_to_fraction(df['revol_util'])

In [7]:
# One-hot encode the listed categorical columns so the model gets numeric input.
# NOTE(review): the feature-importance output later in this notebook reports
# 19169 features, many of them zero-importance title_* dummies - presumably
# the 'title' column is near-unique text; confirm it belongs here.
categorical_columns = [
    'term',
    'emp_length',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'title',
]
df_dummies = pd.get_dummies(df, columns=categorical_columns)

In [8]:
# Features: every encoded column except the interest rate
X = df_dummies.drop(columns=["int_rate"])

# Target: the interest rate column
y = df_dummies["int_rate"]

In [9]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data in half: 50% training, 50% test.
# random_state=1 pins the shuffle so the split, and every metric computed
# from it, is the same on each run (the call previously omitted it).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [13]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Define the hyperparameters to run
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create a Decision Tree regressor.
# Seeded so that the cross-validated search itself is repeatable, not only
# the final model.
dt = DecisionTreeRegressor(random_state=42)

# Use GridSearchCV for hyperparameter tuning (5-fold CV, scored by negative MSE)
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Determine the optimum hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True), so reuse that estimator instead of training it a second time.
best_dt = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_dt.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Set:", mse)

Best Hyperparameters: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mean Squared Error on Test Set: 0.0006595402743428012


In [18]:
# Using Decision Tree Model
# Take the hyperparameters from the grid-search result (best_params) rather
# than re-typing them, so this model cannot drift from the tuned values, and
# seed the tree so repeated runs build the same model.
tree_model = DecisionTreeRegressor(**best_params, random_state=42)


In [19]:
# Train the tree on the training split, then predict the held-out test split.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred_tree = tree_model.fit(X_train, y_train).predict(X_test)

In [20]:
# Mean squared error of tree_model on the test set (an error measure: lower is better)
ms_tree = mean_squared_error(y_test, y_pred_tree)
# Print ms_tree itself; the cell previously printed `mse`, the value left
# over from the grid-search cell, so this model's score was never shown.
print("Mean Squared Error on Test Set:", ms_tree)

Mean Squared Error on Test Set: 0.0006595402743428012


In [21]:
# Label each importance score with its feature name and rank them, largest first.
feature_importances = (
    pd.Series(tree_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
print(feature_importances)

revol_util                                  0.364459
term_ 36 months                             0.352977
installment                                 0.118182
delinq_2yrs                                 0.045504
inq_last_6mths                              0.034708
                                              ...   
title_Funds for Moving                      0.000000
title_Funds Needed For Debt and Vacation    0.000000
title_Funds For Starting Up a Business      0.000000
title_Funds                                 0.000000
title_îîMY FIRST CAR îî             0.000000
Length: 19169, dtype: float64


In [22]:
# Turn the ranked Series into a two-column table (Feature, Importance)
# and show the top 20 rows.
feature_importances_df = (
    feature_importances
    .rename_axis('Feature')
    .reset_index(name='Importance')
)
feature_importances_df.head(20)

Unnamed: 0,Feature,Importance
0,revol_util,0.364459
1,term_ 36 months,0.352977
2,installment,0.118182
3,delinq_2yrs,0.045504
4,inq_last_6mths,0.034708
5,revol_bal,0.029818
6,open_acc,0.025754
7,funded_amnt,0.011591
8,loan_amnt,0.005977
9,total_acc,0.003675
