In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, dataloader
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
import torch.onnx

import matplotlib.pyplot as plt

# Let pandas render up to 999 rows before truncating displayed frames.
pd.set_option('display.max_rows', 999)

In [None]:
# All three CSVs share one path stem; the suffix identifies the variant
# (full file, random sample, or head of the file).
_DATASET_STEM = 'data/landing_club/accepted_2007_to_2018Q4'
FULL_DATASET_NAME = _DATASET_STEM + '.csv'
SMALL_SAMPLE_DATASET_NAME = _DATASET_STEM + '_small_sample.csv'
SMALL_HEAD_DATASET_NAME = _DATASET_STEM + '_small_head.csv'

Sample a few records for faster work

In [None]:
# Make small dataset
# Number of rows written to each small CSV by the generation code below.
record_count = 1000

# Columns used by the notebook. Wider selections tried earlier are kept
# commented out for reference.
# col_names = ['loan_amnt', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'term', 'emp_length', 'int_rate']
# col_names = ['loan_amnt', 'home_ownership', 'annual_inc','term', 'emp_length', 'int_rate']
col_names = [
    'loan_amnt',
    'home_ownership',
    'annual_inc',
    'int_rate',
]

# One-off generation of the small CSVs from the full file (left disabled):
# acc_raw_df = pd.read_csv(FULL_DATASET_NAME)
# acc_raw_df.sample(record_count).to_csv(SMALL_SAMPLE_DATASET_NAME, columns=col_names)
# acc_raw_df.head(record_count).to_csv(SMALL_HEAD_DATASET_NAME, columns=col_names)

Define a Dataset class that reads and preprocesses the small dataset

In [None]:

class Landing_Club_Dataset(Dataset):
    """CSV-backed tabular dataset with cleaning, encoding and tensor views.

    On construction the CSV is read, rows containing NaN are dropped,
    object-typed columns are label-encoded to integers, and the frame is
    split into a feature tensor (``X_tsor``) and a label tensor (``y_tsor``).
    A standardized train/test split is produced lazily by
    ``get_train_dataset`` / ``get_test_dataset``.

    Args:
        file_path: Path of the CSV file to load.
        col_label: Name of the label column. A single-element list/tuple is
            also accepted and unwrapped.
        col_names: Columns to read from the CSV. ``None`` or an empty
            sequence reads every column.
    """

    def __init__(self, file_path: str, col_label: str, col_names=None):
        self.file_path = file_path
        # An empty selection would make pandas read no columns at all, so it
        # is treated like "no selection".
        self.df = pd.read_csv(file_path, usecols=col_names if col_names else None)
        self.col_label = self._normalize_label(col_label)
        self.np = self.to_numpy()
        self.clean()
        self.encode_categorical_columns()
        self.to_tensor()

    @staticmethod
    def _normalize_label(col_label):
        """Return the label column name, unwrapping a one-element sequence.

        Raises:
            ValueError: If a sequence with zero or several names is given.
        """
        if isinstance(col_label, str):
            return col_label
        labels = list(col_label)
        if len(labels) != 1:
            raise ValueError(
                f'col_label must name exactly one column, got {labels!r}')
        return labels[0]

    def __getitem__(self, index: int):
        """Return the ``(features, label)`` tensor pair at ``index``."""
        return (self.X_tsor[index], self.y_tsor[index])

    # Kept so existing code calling the old, misspelled method keeps working.
    __get_item__ = __getitem__

    def __len__(self):
        """Return the number of rows currently held in ``self.df``."""
        return len(self.df)

    def to_numpy(self):
        """Return the current data frame as a NumPy array."""
        return self.df.to_numpy()

    def to_tensor(self, with_grad=False):
        """Split features/label and store them as ``X_tsor`` and ``y_tsor``.

        Args:
            with_grad: Forwarded to ``torch.tensor`` as ``requires_grad``.
        """
        self._split_X_y()
        self.X_tsor = torch.tensor(self.X_df.values, requires_grad=with_grad)
        self.y_tsor = torch.tensor(self.y_df.values, requires_grad=with_grad)

    def clean(self):
        """Drop rows containing NaN and refresh the NumPy view.

        The index is reset so that label-based and positional row lookups
        agree after rows have been removed.
        """
        self.df = self.df.dropna().reset_index(drop=True)
        self.np = self.to_numpy()

    def encode_categorical_columns(self):
        """Replace every object-typed column with integer label codes."""
        categorical_cols = list(self.df.select_dtypes(include=['object']))
        print(categorical_cols)
        label_enc = LabelEncoder()
        for col in categorical_cols:
            # The encoder is refit per column, so codes are column-local.
            self.df[col] = label_enc.fit_transform(self.df[col])
        self.np = self.to_numpy()
        print(self.np)

    def _split_X_y(self):
        """Store features as ``X_df`` and the label column as ``y_df``."""
        self.X_df = self.df.drop(columns=[self.col_label])
        self.y_df = self.df[self.col_label]

    def _split_train_test_and_standardize(self, test_size, random_state=None):
        """Split into train/test sets and standardize features and label.

        Both scalers are fit on the training portion only and then applied
        to the test portion. The fitted scalers are kept as ``sc_x`` and
        ``sc_y`` so predictions can be inverse-transformed later.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Optional seed forwarded to ``train_test_split``
                for a reproducible split.
        """
        X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
            self.X_df.values, self.y_df.values,
            test_size=test_size, random_state=random_state)

        self.sc_x = StandardScaler()
        self.sc_y = StandardScaler()

        # StandardScaler expects 2-D input, so the label becomes a column.
        y_train_orig = y_train_orig.reshape(-1, 1)
        y_test_orig = y_test_orig.reshape(-1, 1)
        self.X_train = self.sc_x.fit_transform(X_train_orig)
        self.X_test = self.sc_x.transform(X_test_orig)
        self.y_train = self.sc_y.fit_transform(y_train_orig)
        self.y_test = self.sc_y.transform(y_test_orig)

    def _ensure_split(self):
        """Create the standardized 80/20 split on first use."""
        if not hasattr(self, 'X_train'):
            self._split_train_test_and_standardize(test_size=.2)

    def get_train_dataset(self):
        """Return the standardized training ``(X, y)`` as float32 tensors."""
        self._ensure_split()
        X_train_tsor = torch.from_numpy(self.X_train.astype(np.float32))
        y_train_tsor = torch.from_numpy(self.y_train.astype(np.float32))
        return (X_train_tsor, y_train_tsor)

    def get_test_dataset(self):
        """Return the standardized test ``(X, y)`` as float32 tensors."""
        self._ensure_split()
        X_test_tsor = torch.from_numpy(self.X_test.astype(np.float32))
        y_test_tsor = torch.from_numpy(self.y_test.astype(np.float32))
        return (X_test_tsor, y_test_tsor)



    

Remove NaN rows

In [None]:
# Remove records with nan
# acc_df = acc_raw_df.loc[:, col_names]
# acc_df_len = len(acc_df)

# acc_df.dropna(inplace=True)
# print(f'Records dropped: {acc_df_len - len(acc_df)}')

# print(acc_df.head(10))
# acc_df.describe()


Encoding functions for categorical features

In [None]:
# def encode_one_hot(df):
#    categorical_cols = list(acc_df.select_dtypes(include=['object']))
#    print(categorical_cols)
#    one_hot_enc = OneHotEncoder(sparse_output=False)
#    for col in categorical_cols:
#       label_encoded = one_hot_enc.fit_transform(df[col].to_numpy().reshape(-1, 1))
#       print(col)
#       df[col] = label_encoded.tolist()
#    return df

# def encode_label(df):
#    categorical_cols = list(acc_df.select_dtypes(include=['object']))
#    print(categorical_cols)
#    label_enc = LabelEncoder()
#    for col in categorical_cols:
#       label_encoded = label_enc.fit_transform(df[col])
#       df[col] = label_encoded
#    return df

Encode categorical features

In [None]:
# Build the dataset from the "small head" CSV. The label column is passed as
# a plain string, matching the constructor's `col_label: str` annotation.
dataset = Landing_Club_Dataset(SMALL_HEAD_DATASET_NAME, 'int_rate', col_names)

dataset.df.head(9)
# Positional lookup: rows with NaN are dropped during construction, so the
# index label 0 is not guaranteed to exist.
type(dataset.df.int_rate.iloc[0])


Split features and label

Define Model explicitly as a Class

In [None]:
# Number of feature columns, i.e. the input width of the models below.
feature_count = dataset.X_df.shape[1]

class ClassModel(nn.Module):
   """Small fully connected network: two hidden ReLU layers of width 5.

   Args:
      input_dim: Number of input features.
      output_dim: Number of output values produced per sample.
   """

   def __init__(self, input_dim, output_dim):
      super().__init__()
      self.fc1 = nn.Linear(input_dim, 5)
      self.relu1 = nn.ReLU()
      self.fc2 = nn.Linear(5, 5)
      self.relu2 = nn.ReLU()
      # Honor the requested output width instead of a fixed width of 1.
      self.fc3 = nn.Linear(5, output_dim)

   def forward(self, X):
      """Run ``X`` through the three linear layers and return the output."""
      x = self.fc1(X)
      x = self.relu1(x)
      x = self.fc2(x)
      x = self.relu2(x)
      x = self.fc3(x)
      return x
   
# Two models over the same inputs: the explicit ClassModel, and an
# nn.Sequential network with two hidden sigmoid layers of width 15.
class_model = ClassModel(input_dim=feature_count, output_dim=1)

seq_layers = [
   nn.Linear(feature_count, 15),
   nn.Sigmoid(),
   nn.Linear(15, 15),
   nn.Sigmoid(),
   nn.Linear(15, 1),
]
seq_model = nn.Sequential(*seq_layers)

Define training objects

In [None]:
# Mean-squared-error loss, minimized with plain SGD over the sequential
# model's parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(seq_model.parameters(), lr=0.3)

Create Tensors

In [None]:
# Standardized train/test tensors; the split is created by the dataset on
# the first of these two calls.
X_train_tsor, y_train_tsor = dataset.get_train_dataset()
X_test_tsor, y_test_tsor = dataset.get_test_dataset()

# Peek at the first nine training rows.
print(X_train_tsor[:9])
print(y_train_tsor[:9])
# print(f'Encode {y_train_orig[5][0]} = {y_train[5][0]}, Decode = {sc_y.inverse_transform(y_train[5][0].reshape(-1, 1)).reshape(1)[0]}')


Train Model

In [None]:
# Full-batch training of seq_model. One progress line is printed every
# `log_every` epochs, showing the loss and one de-standardized prediction
# next to its target.
losses = []
n_epochs = 20000
# Integer interval (about ten progress lines); never zero for small n_epochs.
log_every = max(1, n_epochs // 10)
# Index of the training sample whose prediction is shown in the progress line.
peek_output_element = 3

seq_model.train()
for n in range(n_epochs):
   outputs = seq_model(X_train_tsor)

   loss = criterion(outputs, y_train_tsor)
   # Store a detached copy: appending `loss` itself would keep every epoch's
   # autograd graph alive for the whole run.
   losses.append(loss.detach())
   if n % log_every == 0:
      with torch.no_grad():
         print(f'Loss on epoch ' 
               f'{n:04}: {loss.item():.4f} - '
               f'interest rate: '
               f'{dataset.sc_y.inverse_transform(outputs[peek_output_element].reshape(-1,1)).reshape(1).item():.4f}'
               f'/ '
               f'{dataset.sc_y.inverse_transform(y_train_tsor[peek_output_element].reshape(-1, 1)).reshape(1).item():.4f}')
   loss.backward()
   optimizer.step()
   optimizer.zero_grad()


Validate Predictions

In [None]:
# Evaluate on the held-out test set and compare de-standardized predictions
# with the de-standardized targets.
seq_model.eval()
with torch.no_grad():
   preds = seq_model(X_test_tsor)

# Convert to NumPy explicitly before handing the data to scikit-learn.
preds_decoded = dataset.sc_y.inverse_transform(preds.numpy())
y_test_tsor_decoded = dataset.sc_y.inverse_transform(y_test_tsor.numpy())

# Show at most ten rows; a smaller test set must not raise IndexError.
for i in range(min(10, len(preds_decoded))):
   print(f'Prediction / Test [{i}] = '
         f'{preds_decoded[i].reshape(-1, 1)} / '
         f'{y_test_tsor_decoded[i].reshape(-1, 1)}')

plt.cla()
plt.plot(preds_decoded, label='Prediction')
plt.plot(y_test_tsor_decoded, label='Test')
plt.xlabel('Test sample index')
plt.ylabel('Interest rate')
plt.legend()
plt.show()
