In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, mean_squared_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Reading in the loan.csv
df = pd.read_csv(Path(r"Resources\loan Data SQL.csv"))
df.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,...,purpose,title,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc
0,54734,25000,25000,19080.0572,36 months,0.1189,829.1,< 1 year,RENT,85000,...,debt_consolidation,Debt consolidation for on-time payer,19.48,0,0,10,0,28854,0.52,42
1,55742,7000,7000,672.803839,36 months,0.1071,228.22,< 1 year,RENT,65000,...,credit_card,Credit Card payoff,14.29,0,0,7,0,33623,0.77,7
2,57245,1200,1200,1200.0,36 months,0.1311,40.5,10+ years,OWN,54000,...,debt_consolidation,zxcvb,5.47,0,0,5,0,2584,0.4,31
3,57416,10800,10800,10691.55105,36 months,0.1357,366.86,6 years,RENT,32000,...,debt_consolidation,Nicolechr1978,11.63,0,1,14,0,3511,0.26,40
4,58915,7500,5025,557.087228,36 months,0.1008,162.34,3 years,RENT,85000,...,debt_consolidation,sdguy,8.1,0,1,3,0,33667,0.73,11


In [6]:
# Cleaning data by dropping unnecessary columns
columns_to_drop = ['id']
df = df.drop(columns= columns_to_drop)

In [7]:
df = df.dropna()

In [8]:
df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'loan_status', 'purpose', 'title', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc'],
      dtype='object')

In [9]:
# Use get_dummies to allow string columns to be analyzed
df_dummies = pd.get_dummies(df, columns=['term', 'emp_length', 'home_ownership','verification_status', 'loan_status', 'purpose', 'title'])

In [10]:
# Separate into target (interest rate) and features
y = df_dummies["int_rate"]

X = df_dummies.drop(columns="int_rate")

In [11]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5)

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Define the hyperparameters to run
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create a Decision Tree regressor
dt = DecisionTreeRegressor()

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# determine the optimum hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the model with the optimal hyperparameters
best_dt = DecisionTreeRegressor(**best_params, random_state=42)
best_dt.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_dt.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Set:", mse)

Best Hyperparameters: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2}
Mean Squared Error on Test Set: 0.0006433274724282495


In [14]:
# Using Descision Tree Model 
tree_model = DecisionTreeRegressor(max_depth=7, min_samples_leaf=1, min_samples_split=2)


In [15]:
# Fit and Predict the Decision Tree Model
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

In [16]:
# Determine Mean Squared Error of the model to show accuracy
ms_tree = mean_squared_error(y_test, y_pred_tree)
print("Mean Squared Error on Test Set:", mse)

Mean Squared Error on Test Set: 0.0006433274724282495


In [17]:
feature_importances = pd.Series(tree_model.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)
print(feature_importances)

term_ 60 months                    0.367418
revol_util                         0.334756
installment                        0.123857
delinq_2yrs                        0.053053
inq_last_6mths                     0.028110
                                     ...   
title_Fund My Loan                 0.000000
title_Fund Me FTW                  0.000000
title_Funches & Associates LLC     0.000000
title_Fun and Debt loan            0.000000
title_îîMY FIRST CAR îî    0.000000
Length: 19168, dtype: float64


In [18]:
# Convert to a DataFrame
feature_importances_df = pd.DataFrame({'Feature': feature_importances.index, 'Importance': feature_importances.values})
feature_importances_df.head(20)

Unnamed: 0,Feature,Importance
0,term_ 60 months,0.367418
1,revol_util,0.334756
2,installment,0.123857
3,delinq_2yrs,0.053053
4,inq_last_6mths,0.02811
5,revol_bal,0.025855
6,open_acc,0.023198
7,funded_amnt,0.01904
8,loan_amnt,0.008152
9,funded_amnt_inv,0.0065
