In [64]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
from matplotlib.colors import ListedColormap
import hvplot.pandas


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor



from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score



In [65]:
#LOADING DATA
df = pd.read_csv("D://Download//Loan_default.csv//Loan_default.csv") #Source: https://www.kaggle.com/datasets/nikhil1e9/loan-default
#df = df.loc[:,['Sex','Age', 'Height', 'Weight', 'Sport', 'Team', 'Season']]
df.shape

(255347, 17)

In [66]:
df.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [67]:
#CLEANING

In [68]:
#Checking for Missing Values
df.isnull().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [69]:
#Casting Strings to Int
def castToInt(df, feature):
    tempFeature = df[feature].unique()

    i = 0
    while i < len(tempFeature):
        df[feature] = df[feature].replace(tempFeature[i], i)
        i += 1

    df[feature] = df[feature].astype(float)

In [70]:
#Tracking the mapping of the feature values
df['Education'].unique()

array(["Bachelor's", "Master's", 'High School', 'PhD'], dtype=object)

In [71]:
df['Education'].value_counts()

Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

In [72]:
castToInt(df, 'Education')
df['Education'].unique()


  df[feature] = df[feature].replace(tempFeature[i], i)


array([0., 1., 2., 3.])

In [73]:
df['Education'].value_counts()

Education
0.0    64366
2.0    63903
1.0    63541
3.0    63537
Name: count, dtype: int64

In [74]:
#Casting the rest of the features
castToInt(df, 'EmploymentType')
castToInt(df, 'MaritalStatus')
castToInt(df, 'HasMortgage')
castToInt(df, 'HasDependents')
castToInt(df, 'LoanPurpose')
castToInt(df, 'HasCoSigner')

  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)
  df[feature] = df[feature].replace(tempFeature[i], i)


In [75]:
#Dropping Outliers
def drop_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   not_outliers = df[~((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   outliers_dropped = not_outliers.dropna()

   return outliers_dropped

df = drop_outliers_IQR(df)
df.shape

(225694, 17)

In [76]:
#MACHINE LEARNING

In [77]:
#Checking for Correlation
df.corr()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
Age,1.0,-0.015318,0.010386,-0.006388,-0.016822,0.003101,0.02248,0.001558,-0.001588,-0.00287,0.001156,-0.002518,0.002755,0.005278,-0.00612,0.00997,
Income,-0.015318,1.0,0.025807,-0.003785,-0.004592,0.00021,0.008528,-0.00112,0.001422,-0.005177,-0.002606,0.000345,0.002697,0.004897,-0.003058,0.006001,
LoanAmount,0.010386,0.025807,1.0,0.003783,0.010413,-0.001463,-0.011789,0.003617,0.000615,0.002217,-0.002776,0.000909,-0.000539,-0.001892,0.001057,-0.001192,
CreditScore,-0.006388,-0.003785,0.003783,1.0,-0.002826,0.000262,0.004694,0.001799,-0.001401,0.001939,0.002179,-0.003384,-0.001102,0.003374,-0.001755,0.005436,
MonthsEmployed,-0.016822,-0.004592,0.010413,-0.002826,1.0,0.004243,0.01199,-0.00055,0.003574,-0.003893,0.004474,-0.003068,0.001773,0.000737,-0.001397,0.003597,
NumCreditLines,0.003101,0.00021,-0.001463,0.000262,0.004243,1.0,-0.003858,0.00049,-0.000911,0.002608,-0.000117,-0.000757,0.000732,0.0032,0.00186,-0.002564,
InterestRate,0.02248,0.008528,-0.011789,0.004694,0.01199,-0.003858,1.0,0.000399,-0.00227,0.003079,-0.00199,-0.002587,-0.00234,-0.003571,0.003017,-0.000804,
LoanTerm,0.001558,-0.00112,0.003617,0.001799,-0.00055,0.00049,0.000399,1.0,0.00162,-0.003096,-0.001041,-0.001241,-0.001211,-0.001506,-0.001494,0.001022,
DTIRatio,-0.001588,0.001422,0.000615,-0.001401,0.003574,-0.000911,-0.00227,0.00162,1.0,0.001002,-0.001001,0.004602,-0.001092,-0.002284,0.001668,-0.001151,
Education,-0.00287,-0.005177,0.002217,0.001939,-0.003893,0.002608,0.003079,-0.003096,0.001002,1.0,-0.000353,-0.006154,0.002345,-0.003009,-0.000493,0.001583,


In [78]:
# Split data into dependent/independent variables
y= df['LoanAmount']
x= df.drop(['LoanAmount'], axis = 1)

In [79]:
# Split data into test/train set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = True)

In [80]:
"""# Scale dataset
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)"""

In [81]:
#Test Models
def cross_val(model):
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


In [82]:
#Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)


test_pred = lin_reg.predict(X_test)
train_pred = lin_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61139.94007989999
MSE: 4979849969.415969
RMSE: 70568.05204493014
R2 Square 0.001020823570702989
__________________________________
Train set evaluation:
_____________________________________
MAE: 61191.43964941075
MSE: 4998968677.141724
RMSE: 70703.3851887003
R2 Square 0.0010990357561838637
__________________________________


In [83]:
#Ridge Regression
model = Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61139.95260424196
MSE: 4979849763.920548
RMSE: 70568.05058892125
R2 Square 0.0010208647939621285
__________________________________
Train set evaluation:
_____________________________________
MAE: 61191.45405608063
MSE: 4998968679.208812
RMSE: 70703.38520331832
R2 Square 0.0010990353431353794
__________________________________


In [84]:
# Lasso Regression
model = Lasso(alpha=0.1, 
              precompute=True, 
#               warm_start=True, 
              positive=True, 
              selection='random',
              random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61141.533501447826
MSE: 4979749438.862466
RMSE: 70567.33974624852
R2 Square 0.0010409904291749372
__________________________________
Train set evaluation:
_____________________________________
MAE: 61201.88828462595
MSE: 5000051537.396452
RMSE: 70711.04254214084
R2 Square 0.00088265709441393
__________________________________


In [85]:
#ElasticNet
model = ElasticNet(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61140.15451252358
MSE: 4979846935.168629
RMSE: 70568.03054619442
R2 Square 0.0010214322536808984
__________________________________
Train set evaluation:
_____________________________________
MAE: 61191.6848853728
MSE: 4998969269.3808
RMSE: 70703.3893768948
R2 Square 0.001098917414137146
__________________________________


In [88]:
#Stochastic Gradient Descent Regressor 
sgd_reg = SGDRegressor(n_iter_no_change=250, penalty=None, eta0=0.0001, max_iter=10000)
sgd_reg.fit(X_train, y_train)

test_pred = sgd_reg.predict(X_test)
train_pred = sgd_reg.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

Test set evaluation:
_____________________________________
MAE: 61139.92894224074
MSE: 4979850459.574488
RMSE: 70568.05551787924
R2 Square 0.001020725242810383
__________________________________
Train set evaluation:
_____________________________________
MAE: 61191.41452369919
MSE: 4998968687.213153
RMSE: 70703.3852599234
R2 Square 0.0010990337436965758
__________________________________


In [89]:
#Saving the Model
import pickle
pickle.dump(lin_reg, open('models/lin_reg.pkl', 'wb'))

#model = pickle.load(open('model_dt.pkl', 'rb'))