## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("loan_approval_dataset.csv")
application_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,Income_to_Loan,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,loan_status1
0,1,2,Graduate,No,9600000,29900000,0.32107,12,778,2400000,17600000,22700000,8000000,Approved,1
1,2,0,Not Graduate,Yes,4100000,12200000,0.336066,8,417,2700000,2200000,8800000,3300000,Rejected,0
2,3,3,Graduate,No,9100000,29700000,0.306397,20,506,7100000,4500000,33300000,12800000,Rejected,0
3,4,3,Graduate,No,8200000,30700000,0.267101,8,467,18200000,3300000,23300000,7900000,Rejected,0
4,5,5,Not Graduate,Yes,9800000,24200000,0.404959,20,382,12400000,8200000,29400000,5000000,Rejected,0


In [2]:
print(application_df.head())

   loan_id  no_of_dependents     education self_employed  income_annum  \
0        1                 2      Graduate            No       9600000   
1        2                 0  Not Graduate           Yes       4100000   
2        3                 3      Graduate            No       9100000   
3        4                 3      Graduate            No       8200000   
4        5                 5  Not Graduate           Yes       9800000   

   loan_amount  Income_to_Loan  loan_term  cibil_score  \
0     29900000        0.321070         12          778   
1     12200000        0.336066          8          417   
2     29700000        0.306397         20          506   
3     30700000        0.267101          8          467   
4     24200000        0.404959         20          382   

   residential_assets_value  commercial_assets_value  luxury_assets_value  \
0                   2400000                 17600000             22700000   
1                   2700000                  2200000

In [3]:
# Look at cibil_score value counts for binning
cibil_score_counts = application_df['cibil_score'].value_counts()
print(cibil_score_counts)

348    16
543    15
538    15
778    14
509    14
       ..
334     2
773     2
893     2
484     2
864     1
Name: cibil_score, Length: 601, dtype: int64


In [4]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
cutoff_value = 4
rare_cibil_score = cibil_score_counts[cibil_score_counts < cutoff_value].index.tolist()
cibil_score_to_replace = rare_cibil_score

# Replace in dataframe
for app in cibil_score_to_replace:
    application_df['cibil_score'] = application_df['cibil_score'].replace(app,"Other")
# Check to make sure binning was successful
application_df['cibil_score'].value_counts()

Other    105
348       16
543       15
538       15
778       14
        ... 
764        4
862        4
690        4
357        4
568        4
Name: cibil_score, Length: 565, dtype: int64

In [5]:
column_types = application_df.dtypes
print(column_types)

loan_id                       int64
no_of_dependents              int64
education                    object
self_employed                object
income_annum                  int64
loan_amount                   int64
Income_to_Loan              float64
loan_term                     int64
cibil_score                  object
residential_assets_value      int64
commercial_assets_value       int64
luxury_assets_value           int64
bank_asset_value              int64
loan_status                  object
loan_status1                  int64
dtype: object


In [6]:
# Convert categorical data to numeric with `pd.get_dummies`
categorical_columns = ['education', 'self_employed', 'cibil_score', 'loan_status']
application_df_encoded = pd.get_dummies(application_df, columns=categorical_columns) 
print(application_df_encoded.head())

   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  residential_assets_value  commercial_assets_value  \
0         12                   2400000                 17600000   
1          8                   2700000                  2200000   
2         20                   7100000                  4500000   
3          8                  18200000                  3300000   
4         20                  12400000                  8200000   

   luxury_assets_value  bank_asset_value  ...  cibil_score_894  \
0             22700000           8000000  ...                0   
1             

In [7]:
# Split our preprocessed data into our features and target arrays
X = application_df_encoded.drop('loan_status_Approved', axis=1)  # Features
y = application_df_encoded['loan_status_Approved']  # Target

# Display the shapes of the arrays
print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Display the shapes of the datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of features (X): (4269, 581)
Shape of target (y): (4269,)
Shape of X_train: (3201, 581)
Shape of X_test: (1068, 581)
Shape of y_train: (3201,)
Shape of y_test: (1068,)


In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
column_types = application_df_encoded.dtypes
print(column_types)
print(application_df_encoded.head())

loan_id                   int64
no_of_dependents          int64
income_annum              int64
loan_amount               int64
Income_to_Loan          float64
                         ...   
cibil_score_899           uint8
cibil_score_900           uint8
cibil_score_Other         uint8
loan_status_Approved      uint8
loan_status_Rejected      uint8
Length: 582, dtype: object
   loan_id  no_of_dependents  income_annum  loan_amount  Income_to_Loan  \
0        1                 2       9600000     29900000        0.321070   
1        2                 0       4100000     12200000        0.336066   
2        3                 3       9100000     29700000        0.306397   
3        4                 3       8200000     30700000        0.267101   
4        5                 5       9800000     24200000        0.404959   

   loan_term  residential_assets_value  commercial_assets_value  \
0         12                   2400000                 17600000   
1          8                   27000

## Compile, Train and Evaluate the Model

In [10]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=128, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Build the model
nn.build(input_shape=(None, input_features))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               74496     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 82817 (323.50 KB)
Trainable params: 82817 (323.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

34/34 - 0s - loss: 0.0083 - accuracy: 0.9972 - 203ms/epoch - 6ms/step
Loss: 0.008251693099737167, Accuracy: 0.9971910119056702


In [14]:
# Export our model to HDF5 file
model_filename = "loan_apps_layers.h5"
nn.save("loan_apps_layers.h5")

  saving_api.save_model(
