In [8]:
# import dependencies
import pandas as pd
from path import Path

# Location of loans_data_encoded.csv, relative to this notebook
file_load = Path("../Resources/loans_data_encoded.csv")

# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_load)
df.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [9]:
# Separate the feature matrix (X) from the target vector (y).
# DataFrame.drop returns a new frame, so df itself is left untouched and no
# explicit copy is required; `columns=` already implies axis=1.
X = df.drop(columns=["bad"])

# Series.ravel() is deprecated as of pandas 2.2; to_numpy() produces the same
# 1-D NumPy array of target labels.
y = df["bad"].to_numpy()

In [11]:
# Hold out a test set using train_test_split's default split size.
# random_state is fixed so the same rows land in each partition on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [12]:
# Standardize the features before they are handed to the classifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the scaling statistics from the training split only.
# fit() returns the scaler itself, so X_scalar is the fitted scaler object.
X_scalar = scaler.fit(X_train)

# Apply the training-derived scaling to both the training and testing splits
X_train_scaled, X_test_scaled = (
    X_scalar.transform(split) for split in (X_train, X_test)
)

In [13]:
# With the data scaled, set up the classifier: a random forest,
# i.e. an ensemble of decision trees (128 of them here).
# NOTE(review): `tree` is imported but not used in the cells visible here.
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=128,
)

In [14]:
# Train the forest on the scaled training features and their labels.
# fit() returns the estimator, so rf_model is rebound to the fitted model.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [15]:
# Predict class labels for the scaled test features with the fitted model
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

In [16]:
# Evaluate the model. The confusion matrix cross-tabulates actual labels
# against predicted labels; accuracy, precision, recall and F1 are reported
# separately by accuracy_score / classification_report.
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw counts in a labelled DataFrame for display.
# NOTE(review): the column labels are inconsistent ("Prediction 0" vs
# "Predictions 1"); they are left unchanged here.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Prediction 0", "Predictions 1"],
)
cm_df

Unnamed: 0,Prediction 0,Predictions 1
Actual 0,51,33
Actual 1,23,18


In [18]:
# Accuracy of the predictions measured against the true test labels
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.552

In [23]:
# Summarize the evaluation in one place: the labelled confusion matrix,
# the accuracy score, and the classification report for the test labels.
print("Confusion Matrix")
# display() renders the DataFrame with its rich notebook representation
display(cm_df)
print(f"Accuracy Score: {acc_score}.")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Prediction 0,Predictions 1
Actual 0,51,33
Actual 1,23,18


Accuracy Score: 0.552.
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.61      0.65        84
           1       0.35      0.44      0.39        41

    accuracy                           0.55       125
   macro avg       0.52      0.52      0.52       125
weighted avg       0.58      0.55      0.56       125



## Rank the importance of features


In [24]:
# Read the per-feature importances from the fitted random forest.
# They come from the model that was fitted on X_train_scaled (not from X_test),
# one value per feature column, and the values sum to 1.
importances = rf_model.feature_importances_
importances

array([0.05454782, 0.07997292, 0.43280448, 0.32973986, 0.01887172,
       0.02110219, 0.00271658, 0.02151063, 0.01887818, 0.01985562])

In [25]:
# Pair each importance with its feature name (the columns of X), then order
# the pairs from most to least important.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.43280447750315343, 'age'),
 (0.32973986443922343, 'month_num'),
 (0.07997292251445517, 'term'),
 (0.05454782107242418, 'amount'),
 (0.021510631303272416, 'education_college'),
 (0.021102188881175144, 'education_High School or Below'),
 (0.01985561654170213, 'gender_male'),
 (0.018878176828577283, 'gender_female'),
 (0.018871722006693077, 'education_Bachelor'),
 (0.002716578909323729, 'education_Master or Above')]