# Eli Musk Loans

## Group Members:
 1. Joseph Orta
 2. Erika Anglin
 3. Frank Ditz
 4. Tyler Vo
 5. Herbert Spektor

# Objective: Using the dataset can we produce models that accurately predict loan approvals?

Use linear regression and neural network models to predict loan acceptance and analyze the results

1. Does being a graduate make a difference in getting approved? 
2. Does being Self Employed make a difference?
3. Does the avg no of dependents make a difference in getting approved?
4. Does the total assets affect loan approval?
5. Does the income to loan amount ratio affect the likelihood of an application’s approval?
6. What will the majority of approved loans be in the future?

# Import Dependencies & Clean Data

In [None]:
# Import Dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Load in the Data
loan_df = pd.read_csv("loan_approval_dataset.csv")
loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,Income_to_Loan,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,loan_status1
0,1,2,Graduate,No,9600000,29900000,0.32107,12,778,2400000,17600000,22700000,8000000,Approved,1
1,2,0,Not Graduate,Yes,4100000,12200000,0.336066,8,417,2700000,2200000,8800000,3300000,Rejected,0
2,3,3,Graduate,No,9100000,29700000,0.306397,20,506,7100000,4500000,33300000,12800000,Rejected,0
3,4,3,Graduate,No,8200000,30700000,0.267101,8,467,18200000,3300000,23300000,7900000,Rejected,0
4,5,5,Not Graduate,Yes,9800000,24200000,0.404959,20,382,12400000,8200000,29400000,5000000,Rejected,0


In [3]:
# Analyze data and determine proper cleaning method
nan_count = loan_df.isna().sum().sum()
print(nan_count)

0


In [4]:
# Determine the number of rows and see if the dataset should be cut down
row_count = len(loan_df.index)
row_count

4269

In [5]:
# Initial look at datatypes of columns
loan_dtype = loan_df.dtypes
print(loan_dtype)

loan_id                       int64
no_of_dependents              int64
education                    object
self_employed                object
income_annum                  int64
loan_amount                   int64
Income_to_Loan              float64
loan_term                     int64
cibil_score                   int64
residential_assets_value      int64
commercial_assets_value       int64
luxury_assets_value           int64
bank_asset_value              int64
loan_status                  object
loan_status1                  int64
dtype: object


# Linear Regression Model

In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = ['loan_status', 'education', 'self_employed']
application_df_encoded = pd.get_dummies(loan_df, columns=categorical_columns) 
print(application_df_encoded.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         12          778                   2400000                 17600000   
1          8          417                   2700000                  2200000   
2         20          506                   7100000                  4500000   
3          8          467                  18200000                  3300000   
4         20          382                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  loan_status1  loan_status_

In [10]:
# Separate the y variable, the labels
y = application_df_encoded["loan_status_Approved"]

# Separate the X variable, the features
X = application_df_encoded.drop("loan_status_Approved", axis=1)


In [11]:
# Review the y variable Series
print(y.head())

0    1
1    0
2    0
3    0
4    0
Name: loan_status_Approved, dtype: uint8


In [12]:
# Review the X variable DataFrame
print(X.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         12          778                   2400000                 17600000   
1          8          417                   2700000                  2200000   
2         20          506                   7100000                  4500000   
3          8          467                  18200000                  3300000   
4         20          382                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  loan_status1  loan_status_

In [13]:
# Check the balance of our target values
label_counts = y.value_counts()
print(label_counts)

1    2656
0    1613
Name: loan_status_Approved, dtype: int64


In [14]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (3201, 17)
X_test shape: (1068, 17)
y_train shape: (3201,)
y_test shape: (1068,)


In [16]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

# Fit the model using training data
classifier.fit(X_train, y_train)

In [17]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Training Data Score: 0.6219931271477663
Testing Data Score: 0.6198501872659176


Unnamed: 0,Prediction,Actual
2381,1,1
1642,1,1
750,1,1
2726,1,0
3841,1,1
...,...,...
2107,1,0
57,1,1
3024,1,1
2277,1,1


In [22]:
# Print the balanced_accuracy score of the model
accuracy_score(y_test, predictions)

0.6198501872659176

In [23]:
# Generate a confusion matrix for the model
confusion_mat = confusion_matrix(y_test, predictions)
# Print
print("Confusion Matrix:")
print(confusion_mat)

Confusion Matrix:
[[  0 404]
 [  2 662]]


In [24]:
# Print the classification report for the model
class_report = classification_report(y_test, predictions)
# Print
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       404
           1       0.62      1.00      0.77       664

    accuracy                           0.62      1068
   macro avg       0.31      0.50      0.38      1068
weighted avg       0.39      0.62      0.48      1068



In [26]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
rds = RandomOverSampler(random_state=1)
print(rds)
# Fit the original training data to the random_oversampler model
X_train_resampled, y_train_resampled = rds.fit_resample(X_train, y_train)
print("Resampled X_train shape:", X_train_resampled.shape)
print("Resampled y_train shape:", y_train_resampled.shape)

RandomOverSampler(random_state=1)
Resampled X_train shape: (3984, 17)
Resampled y_train shape: (3984,)


In [27]:
# Count the distinct values of the resampled labels data
label_counts_resampled = y_train_resampled.value_counts()
print(label_counts_resampled)

1    1992
0    1992
Name: loan_status_Approved, dtype: int64


In [28]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logreg_model_resampled = LogisticRegression(random_state=1)
print(logreg_model_resampled)
# Fit the model using the resampled training data
logreg_model_resampled.fit(X_train_resampled, y_train_resampled)

# Make a prediction using the testing data
predictions_resampled = logreg_model_resampled.predict(X_test)
print(predictions_resampled)

LogisticRegression(random_state=1)
[1 0 0 ... 1 0 0]


In [29]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_resampled = balanced_accuracy_score(y_test, predictions_resampled)
print("Balanced Accuracy Score:", balanced_accuracy_resampled)

Balanced Accuracy Score: 0.5464779911726112


In [30]:
# Generate a confusion matrix for the model
confusion_mat_resampled = confusion_matrix(y_test, predictions_resampled)
print("Confusion Matrix for Resampled Model:")
print(confusion_mat_resampled)

Confusion Matrix for Resampled Model:
[[214 190]
 [290 374]]


In [31]:
# Print the classification report for the model
class_report_resampled = classification_report(y_test, predictions_resampled)
print("Classification Report for Resampled Model:")
print(class_report_resampled)

Classification Report for Resampled Model:
              precision    recall  f1-score   support

           0       0.42      0.53      0.47       404
           1       0.66      0.56      0.61       664

    accuracy                           0.55      1068
   macro avg       0.54      0.55      0.54      1068
weighted avg       0.57      0.55      0.56      1068



# Neural Network Model

In [6]:
# import and read loan data set
application_df = pd.read_csv("loan_approval_dataset.csv")
application_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,Income_to_Loan,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,loan_status1
0,1,2,Graduate,No,9600000,29900000,0.32107,12,778,2400000,17600000,22700000,8000000,Approved,1
1,2,0,Not Graduate,Yes,4100000,12200000,0.336066,8,417,2700000,2200000,8800000,3300000,Rejected,0
2,3,3,Graduate,No,9100000,29700000,0.306397,20,506,7100000,4500000,33300000,12800000,Rejected,0
3,4,3,Graduate,No,8200000,30700000,0.267101,8,467,18200000,3300000,23300000,7900000,Rejected,0
4,5,5,Not Graduate,Yes,9800000,24200000,0.404959,20,382,12400000,8200000,29400000,5000000,Rejected,0


In [7]:
# Look at education value counts for binning
education_counts = application_df['education'].value_counts()
print(education_counts)

Graduate        2144
Not Graduate    2125
Name: education, dtype: int64


In [8]:
# Choose a cutoff value and create a list of application types to be replaced
cutoff_value = 4
rare_education_score = education_counts[education_counts < cutoff_value].index.tolist()
education_score_to_replace = rare_education_score

# Replace in dataframe
for app in education_score_to_replace:
    application_df['education'] = application_df['education'].replace(app,"Other")
# Check to make sure binning was successful
application_df['education'].value_counts()

Graduate        2144
Not Graduate    2125
Name: education, dtype: int64

In [9]:
column_types = application_df.dtypes
print(column_types)

loan_id                       int64
no_of_dependents              int64
education                    object
self_employed                object
income_annum                  int64
loan_amount                   int64
Income_to_Loan              float64
loan_term                     int64
cibil_score                   int64
residential_assets_value      int64
commercial_assets_value       int64
luxury_assets_value           int64
bank_asset_value              int64
loan_status                  object
loan_status1                  int64
dtype: object


In [10]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = ['education', 'self_employed', 'cibil_score', 'loan_status']
application_df_encoded = pd.get_dummies(application_df, columns=categorical_columns) 
print(application_df_encoded.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  residential_assets_value  commercial_assets_value  \
0         12                   2400000                 17600000   
1          8                   2700000                  2200000   
2         20                   7100000                  4500000   
3          8                  18200000                  3300000   
4         20                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  ...  cibil_score_893  \
0             22700000           8000000  ...                0   
1             

In [11]:
# Split our preprocessed data into our features and target arrays
X = application_df_encoded.drop('loan_status_Approved', axis=1)  # Features
y = application_df_encoded['loan_status_Approved']  # Target

# Display the shapes of the arrays
print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Display the shapes of the datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of features (X): (4269, 617)
Shape of target (y): (4269,)
Shape of X_train: (3201, 617)
Shape of X_test: (1068, 617)
Shape of y_train: (3201,)
Shape of y_test: (1068,)


In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=128, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Build the model
nn.build(input_shape=(None, input_features))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               79104     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 87425 (341.50 KB)
Trainable params: 87425 (341.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

34/34 - 0s - loss: 0.0382 - accuracy: 0.9916 - 145ms/epoch - 4ms/step
Loss: 0.03820072486996651, Accuracy: 0.9915730357170105
