# Student Loan Risk with Deep Learning

In [2]:
pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m81.9/129.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [3]:
# Imports
import pandas as pd
import tensorflow as tf
import sklearn as skl
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from pathlib import Path

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student-loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [4]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m18/lms/datasets/student-loans.csv"
loans_df = pd.read_csv(file_path)

# Review the DataFrame
loans_df.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [5]:
loans_df.sample(15).sort_values(by='credit_ranking')

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
1353,7.6,0.645,0.03,1.9,0.086,14.0,57.0,0.9969,3.37,0.46,10.3,0
841,6.6,0.66,0.0,3.0,0.115,21.0,31.0,0.99629,3.45,0.63,10.3,0
687,9.1,0.64,0.23,3.1,0.095,13.0,38.0,0.9998,3.28,0.59,9.7,0
180,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,0
1009,9.6,0.5,0.36,2.8,0.116,26.0,55.0,0.99722,3.18,0.68,10.9,0
705,8.4,1.035,0.15,6.0,0.073,11.0,54.0,0.999,3.37,0.49,9.9,0
814,12.6,0.41,0.54,2.8,0.103,19.0,41.0,0.99939,3.21,0.76,11.3,1
948,8.9,0.12,0.45,1.8,0.075,10.0,21.0,0.99552,3.41,0.76,11.9,1
70,7.7,0.63,0.08,1.9,0.076,15.0,27.0,0.9967,3.32,0.54,9.5,1
776,6.9,0.765,0.18,2.4,0.243,5.5,48.0,0.99612,3.4,0.6,10.3,1


In [6]:
# Review the data types associated with the columns
loans_df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

In [7]:
# Check the credit_ranking value counts
loans_df["credit_ranking"].value_counts()

credit_ranking
1    855
0    744
Name: count, dtype: int64

### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [30]:
# Define the target set y using the credit_ranking column
y = loans_df.copy()['credit_ranking']

# Display a sample of y
y.sample(10)

1111    1
455     1
381     1
261     0
600     0
1239    0
1473    0
1076    1
785     0
997     1
Name: credit_ranking, dtype: int64

In [9]:
# Define features set X by selecting all columns but credit_ranking
x = loans_df.copy().drop(columns='credit_ranking')

# Review the features DataFrame
x.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Step 3: Split the features and target sets into training and testing datasets.


In [10]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
x_train, x_test, y_train, y_test = train_test_split(x, y)

### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [11]:
# Create a StandardScaler instance
sc = skl.preprocessing.StandardScaler()

# Fit the scaler to the features training dataset
# Fit the scaler to the features training dataset
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)


---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [12]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation='relu', input_dim=len(x.columns)))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):

        # Declare hyperparams for this loop
        learn_rate = hp.Float(f'learn_rate_{i}', min_value=.1, max_value=.3, step=.1)
        layer_type_str = hp.Choice(f'layer_type_{i}',['Dense', 'Dropout', 'BatchNormalization'])
        activation = hp.Choice(f'activation_{i}',['relu','tanh','sigmoid'])
        units = hp.Int('units_' + str(i), min_value=1, max_value=10, step=2)

        nn_model_toadd = None
        match layer_type_str:
            case 'Dense':
                # Allow kerastuner to decide which activation function to use in hidden layers

                nn_model_toadd = tf.keras.layers.Dense(units=units,
                    activation=activation)
            case 'Dropout':
                nn_model_toadd = tf.keras.layers.Dropout(learn_rate)

            # case 'Embedding':
            #     nn_model_toadd = tf.keras.layers.Embedding(input_dim=input_dim, output_dim=output_dim)
            # removing Embedding as it is for Natural Language Processing: , 'Embedding'
            # input_dim = hp.Int('input_dim', min_value=50, max_value=100, step=2)
            # output_dim = hp.Int('output_dim', min_value=50, max_value=100, step=2)

            case 'BatchNormalization':
                nn_model_toadd = tf.keras.layers.BatchNormalization()

        nn_model.add(nn_model_toadd)

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [13]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=50,
    overwrite=False,
    project_name='student_loans')

tuner.search(x_train, y_train, epochs=50, validation_data=(x_test, y_test))

Trial 1671 Complete [00h 00m 20s]
val_accuracy: 0.7325000166893005

Best val_accuracy So Far: 0.7649999856948853
Total elapsed time: 04h 05m 10s

Search: Running Trial #1672

Value             |Best Value So Far |Hyperparameter
9                 |9                 |first_units
2                 |1                 |num_layers
0.1               |0.1               |learn_rate_0
Dense             |Dense             |layer_type_0
sigmoid           |tanh              |activation_0
3                 |3                 |units_0
0.2               |0.2               |learn_rate_1
Dropout           |Dropout           |layer_type_1
tanh              |relu              |activation_1
7                 |7                 |units_1
0.1               |0.2               |learn_rate_2
Dropout           |Dense             |layer_type_2
sigmoid           |sigmoid           |activation_2
1                 |9                 |units_2
0.2               |0.2               |learn_rate_3
Dropout           |BatchN

KeyboardInterrupt: 

In [14]:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'first_units': 9,
 'num_layers': 1,
 'learn_rate_0': 0.1,
 'layer_type_0': 'Dense',
 'activation_0': 'tanh',
 'units_0': 3,
 'learn_rate_1': 0.2,
 'layer_type_1': 'Dropout',
 'activation_1': 'relu',
 'units_1': 7,
 'learn_rate_2': 0.2,
 'layer_type_2': 'Dense',
 'activation_2': 'sigmoid',
 'units_2': 9,
 'learn_rate_3': 0.2,
 'layer_type_3': 'BatchNormalization',
 'activation_3': 'sigmoid',
 'units_3': 3,
 'learn_rate_4': 0.1,
 'layer_type_4': 'Dense',
 'activation_4': 'tanh',
 'units_4': 3,
 'learn_rate_5': 0.2,
 'layer_type_5': 'Dense',
 'activation_5': 'sigmoid',
 'units_5': 5,
 'tuner/epochs': 50,
 'tuner/initial_epoch': 17,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0607'}

{'first_units': 9,
 'num_layers': 1,

 'learn_rate_0': 0.1,
 'layer_type_0': 'Dense',
 'activation_0': 'tanh',
 'units_0': 3,

 'learn_rate_1': 0.2,
 'layer_type_1': 'Dropout',
 'activation_1': 'relu',
 'units_1': 7,

 'learn_rate_2': 0.2,
 'layer_type_2': 'Dense',
 'activation_2': 'sigmoid',
 'units_2': 9,

 'learn_rate_3': 0.2,
 'layer_type_3': 'BatchNormalization',
 'activation_3': 'sigmoid',
 'units_3': 3,

 'learn_rate_4': 0.1,
 'layer_type_4': 'Dense',
 'activation_4': 'tanh',
 'units_4': 3,

 'learn_rate_5': 0.2,
 'layer_type_5': 'Dense',
 'activation_5': 'sigmoid',
 'units_5': 5,

 Based on these results we should have 3 layers. A starting input layer, a single hidden layer, and an output layer

 The hidden layers are what the tuning was for, which said to have just one as a Dense layer with tanh activation and 9 units

In [15]:
# Create the Sequential model instance
nn_model = tf.keras.models.Sequential()

# Review the number of features
input_nodes = len(x.columns)

In [16]:
# Define the number of hidden nodes for the first hidden layer
nn_model.add(tf.keras.layers.Dense(units=9, activation="relu", input_dim=input_nodes))
nn_model.add(tf.keras.layers.Dense(units=9, activation="tanh"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


In [17]:
# Display the Sequential model summary
nn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 9)                 108       
                                                                 
 dense_4 (Dense)             (None, 9)                 90        
                                                                 
 dense_5 (Dense)             (None, 1)                 10        
                                                                 
Total params: 208 (832.00 Byte)
Trainable params: 208 (832.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Step 2: Compile and fit the model using the `binary_crossentropy` loss function, the `adam` optimizer, and the `accuracy` evaluation metric.


In [18]:
# Compile the Sequential model
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [19]:
# Fit the model using 50 epochs and the training data

fit_model = nn_model.fit(x_train, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [20]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn_model.evaluate(x_test, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# given outputs:
# 13/13 - 0s - loss: 0.5049 - accuracy: 0.7350 - 175ms/epoch - 13ms/step
# Loss: 0.5049149394035339, Accuracy: 0.7350000143051147

13/13 - 0s - loss: 0.5340 - accuracy: 0.7275 - 156ms/epoch - 12ms/step
Loss: 0.5340055823326111, Accuracy: 0.7275000214576721


Based on the expected results, my accuracy is lower and my loss is higher. Perhaps I could have used the hyperparameter tuning to make this better but ran out of time to let it run, it ran for over 4 hours to get these results.

I also had to adjust the tuning multiple times and rerun it, each giving different results.

### Step 4: Save and export your model to a keras file, and name the file `student_loans.keras`.


In [24]:
from google.colab import drive
import pickle

# Set the model's file path
path = '/content/drive/' #dev/student loans/
filename = 'student_loans.keras'

# Export your model to a keras file
drive.mount(path)

# Export your model to a keras file
pickle.dump(nn_model, open(filename+'.pkl','wb'))

Mounted at /content/drive/


---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [25]:
# Set the model's file path
path
filename

# Load the model to a new object
model = pickle.load(open(filename+'.pkl','rb'))

### Step 2: Make predictions on the testing data and save the predictions to a DataFrame.

In [26]:
# Make predictions with the test data
pred = model.predict(x_test)

# Display a sample of the predictions
pred[:5]



array([[0.68860644],
       [0.22568597],
       [0.8684801 ],
       [0.35793403],
       [0.79996717]], dtype=float32)

In [27]:
# Save the predictions to a DataFrame and round the predictions to binary results
pred_df = pd.DataFrame(pred.round())

# Display a sample of the DataFrame
pred_df.head()

Unnamed: 0,0
0,1.0
1,0.0
2,1.0
3,0.0
4,1.0


### Step 4: Display a classification report with the y test data and predictions

In [29]:
# Print the classification report with the y test data and predictions
print(classification_report(y_test, pred_df))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71       177
           1       0.78      0.71      0.74       223

    accuracy                           0.73       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400



---
## Discuss creating a recommendation system for student loans

Briefly answer the following questions in the space provided:

1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.

2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.

3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.

**1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.**
We need more financial data. This seems to be class scores, even if the classes are related to finance it may not be a great predictor of how they manage their finances.

**2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.**
Using context-based filtering makes the most sense, because based on the context of the students performance, we are determining their credit worthiness


**3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.**
1) Many students are young adults with not a lot of history / data about them to make a good prediction.
2) I believe student loans aren't paid until after they stop taking classes. I would expect re-payment is more based on the job they get, if they get one, after graduating. Which is information you won't have as they are going through school.
