In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

from sklearn import metrics




In [2]:
#LOADING DATA
df = pd.read_csv("D://Download//Loan_default.csv//Loan_default.csv") #Source: https://www.kaggle.com/datasets/nikhil1e9/loan-default
df.shape

(255347, 17)

In [3]:
df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [4]:
#CLEANING

In [5]:
#Dropping Unnecessary Features
df = df.drop(['InterestRate','DTIRatio', 'Default'], axis = 1)

In [6]:
#Checking for Missing Values
df.isnull().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
LoanTerm          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64

In [7]:
#Casting Strings to Int
def castToInt(df, feature):
    tempFeature = df[feature].unique()

    i = 0
    while i < len(tempFeature):
        df[feature] = df[feature].replace(tempFeature[i], i)
        i += 1

    df[feature] = df[feature].astype(float)

In [8]:
#Tracking the mapping of the feature values
df['Education'].unique()

array(["Bachelor's", "Master's", 'High School', 'PhD'], dtype=object)

In [9]:
df['Education'].value_counts()

Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

In [10]:
castToInt(df, 'Education')
df['Education'].unique()


  df[feature] = df[feature].replace(tempFeature[i], i)


array([0., 1., 2., 3.])

In [11]:
df['Education'].value_counts()

Education
0.0    64366
2.0    63903
1.0    63541
3.0    63537
Name: count, dtype: int64

In [12]:
#Casting the rest of the features
castToInt(df, 'EmploymentType')
castToInt(df, 'MaritalStatus')
castToInt(df, 'HasMortgage')
castToInt(df, 'HasDependents')
castToInt(df, 'LoanPurpose')
castToInt(df, 'HasCoSigner')

  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)


In [13]:
#Dropping Outliers
def drop_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   not_outliers = df[~((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   outliers_dropped = not_outliers.dropna()

   return outliers_dropped

df = drop_outliers_IQR(df)
df.shape

(255347, 14)

In [14]:
#MACHINE LEARNING

In [15]:
#Checking for Correlation
df.corr()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,LoanTerm,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
Age,1.0,-0.001244,-0.002213,-0.000548,-0.000341,-0.00089,0.000263,-0.001034,-0.002529,-0.002187,-3.5e-05,-0.00071,-0.004313,0.002918
Income,-0.001244,1.0,-0.000865,-0.00143,0.002675,-0.002016,-0.000998,-0.00406,-0.003805,0.000637,0.000945,0.00157,-0.002211,0.003524
LoanAmount,-0.002213,-0.000865,1.0,0.001261,0.002817,0.000794,0.002538,0.001551,-0.001685,-0.000771,0.000801,-0.000139,-0.000403,0.001848
CreditScore,-0.000548,-0.00143,0.001261,1.0,0.000613,1.6e-05,0.00113,0.00202,0.001308,-0.003218,-0.001728,0.003018,-0.001689,0.002755
MonthsEmployed,-0.000341,0.002675,0.002817,0.000613,1.0,0.001267,-0.001166,-0.002241,0.001947,-9.5e-05,-0.00021,-0.00145,-0.000814,-0.001045
NumCreditLines,-0.00089,-0.002016,0.000794,1.6e-05,0.001267,1.0,-0.000226,0.002652,0.000633,-0.000664,0.001744,0.001895,0.002571,-0.002105
LoanTerm,0.000263,-0.000998,0.002538,0.00113,-0.001166,-0.000226,1.0,-0.002654,-0.000626,-0.001042,-0.001775,-0.002417,-0.001102,0.001166
Education,-0.001034,-0.00406,0.001551,0.00202,-0.002241,0.002652,-0.002654,1.0,-0.000973,-0.005682,0.0008,-0.001614,-0.000908,0.000519
EmploymentType,-0.002529,-0.003805,-0.001685,0.001308,0.001947,0.000633,-0.000626,-0.000973,1.0,-0.000538,-0.001773,-0.001082,0.001589,0.002268
MaritalStatus,-0.002187,0.000637,-0.000771,-0.003218,-9.5e-05,-0.000664,-0.001042,-0.005682,-0.000538,1.0,0.000408,0.000437,-0.00219,0.000888


In [16]:
# Split data into dependent/independent variables
y= df['LoanAmount']
x= df.drop(['LoanAmount'], axis = 1)

In [17]:
# Split data into test/train set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = True)

In [18]:
"""# Scale dataset
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)"""

'# Scale dataset\nsc = StandardScaler()\nX_train = sc.fit_transform(X_train)\nX_test = sc.transform(X_test)'

In [19]:
#Test Models
def cross_val(model):
    pred = cross_val_score(model, x, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


In [20]:
#Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)


test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61361.01524416669
MSE: 5013570746.483598
RMSE: 70806.57276329365
R2 Square -0.00018184649638963712
__________________________________
Train set evaluation:
_____________________________________
MAE: 61384.021440726014
MSE: 5020008413.290615
RMSE: 70852.01770797085
R2 Square 5.301058478179854e-05
__________________________________


In [21]:
#Ridge Regression
model = Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61361.01028963356
MSE: 5013569773.860012
RMSE: 70806.5658951203
R2 Square -0.00018165246293455617
__________________________________
Train set evaluation:
_____________________________________
MAE: 61384.021782344964
MSE: 5020008413.42144
RMSE: 70852.01770889408
R2 Square 5.3010558722532686e-05
__________________________________


In [22]:
# Lasso Regression
model = Lasso(alpha=0.1, 
              precompute=True, 
#               warm_start=True, 
              positive=True, 
              selection='random',
              random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61356.339318087405
MSE: 5012771154.896991
RMSE: 70800.92622908961
R2 Square -2.233204451984605e-05
__________________________________
Train set evaluation:
_____________________________________
MAE: 61385.285826894906
MSE: 5020177190.111151
RMSE: 70853.20874957711
R2 Square 1.9391542803615103e-05
__________________________________


In [23]:
#ElasticNet
model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61360.9184837731
MSE: 5013551756.212322
RMSE: 70806.43866353061
R2 Square -0.00017805803392434427
__________________________________
Train set evaluation:
_____________________________________
MAE: 61384.028063917634
MSE: 5020008463.025231
RMSE: 70852.01805894615
R2 Square 5.3000678029491155e-05
__________________________________


In [24]:
#Stochastic Gradient Descent Regressor 
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001, max_iter=10000)
sgd_reg.fit(X_train, y_train)

test_pred = sgd_reg.predict(X_test)
train_pred = sgd_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 3252377198658172.0
MSE: 1.2923758150692354e+31
RMSE: 3594962885857426.0
R2 Square -2.578223973382308e+21
__________________________________
Train set evaluation:
_____________________________________
MAE: 3249557315788664.0
MSE: 1.2912483452648507e+31
RMSE: 3593394419298904.0
R2 Square -2.572067194183448e+21
__________________________________




In [27]:
#Saving the Model
import pickle
pickle.dump(lin_reg, open('lin_reg.pkl', 'wb'))

#model = pickle.load(open('model_dt.pkl', 'rb'))