In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('/content/train_loan.csv')

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.drop('Loan_ID',axis = 1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
df = data.copy() # A new variable storing the existing table

In [7]:
df.drop('Loan_ID',axis = 1,inplace = True)

In [8]:
df.isna().sum() # Finding all missing values

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

In [10]:
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])

Before converting the Dependents we need to remove + signs from it (It may be used to denote a postive value)

In [11]:
df['Dependents'] = df['Dependents'].str.replace('+','')

In [12]:
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])

In [13]:
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [14]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [15]:
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())

In [16]:
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].median())

In [17]:
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
# Treated all missing values

In [18]:
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [19]:
x = df.drop('Loan_Amount_Term',axis = 1)

In [21]:
y = df['Loan_Amount_Term']

In [22]:
x.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
609,Female,No,0,Graduate,No,2900,0.0,71.0,1.0,Rural,Y
610,Male,Yes,3,Graduate,No,4106,0.0,40.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,1.0,Urban,Y
613,Female,No,0,Graduate,Yes,4583,0.0,133.0,0.0,Semiurban,N


In [23]:
y.head()

0    360.0
1    360.0
2    360.0
3    360.0
4    360.0
Name: Loan_Amount_Term, dtype: float64

In [25]:
x = pd.get_dummies(x,dtype =int) #Encoding

In [26]:
x.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,...,Dependents_3,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
0,5849,0.0,128.0,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1
1,4583,1508.0,128.0,1.0,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
2,3000,0.0,66.0,1.0,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,1
3,2583,2358.0,120.0,1.0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,1,0,1
4,6000,0.0,141.0,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1


In [None]:
# Label Encoding

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [28]:
y = le.fit_transform(y)

In [29]:
y = le.fit_transform(y)

In [30]:
y

array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 6, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8,
       5, 8, 2, 8, 8, 8, 7, 8, 8, 9, 8, 8, 7, 8, 8, 8, 8, 8, 6, 8, 8, 8,
       8, 8, 8, 5, 8, 8, 4, 8, 8, 8, 5, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 9,
       8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 9, 8,
       8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 9, 8, 8, 7, 5, 8, 8, 8,
       8, 8, 8, 9, 8, 8, 5, 8, 8, 8, 7, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 9, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5,
       2, 8, 8, 5, 8, 5, 9, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 1, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8,
       7, 8, 8, 8, 8, 8, 8, 9, 8, 8, 8, 8, 9, 8, 8,

In [None]:
# Scaling

In [31]:
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()

In [32]:
x.loc[:,['ApplicantIncome','CoapplicantIncome','LoanAmount']] = ms.fit_transform(x.loc[:,['ApplicantIncome','CoapplicantIncome','LoanAmount']])

In [33]:
x.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,...,Dependents_3,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
0,0.070489,0.0,0.172214,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1
1,0.05483,0.036192,0.172214,1.0,0,1,0,1,0,1,...,0,1,0,1,0,1,0,0,1,0
2,0.03525,0.0,0.082489,1.0,0,1,0,1,1,0,...,0,1,0,0,1,0,0,1,0,1
3,0.030093,0.056592,0.160637,1.0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,1,0,1
4,0.072356,0.0,0.191027,1.0,0,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,1


In [None]:
# Train Test Split

In [34]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 42)

In [None]:
# Importing Regressors,XGB,CatBoost

In [35]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

ModuleNotFoundError: No module named 'catboost'

In [36]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [37]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [38]:
models = {'Random Forest': RandomForestRegressor() , 'XGB Regressor': XGBRegressor() , 'CatBoost': CatBoostRegressor(verbose=0)}

In [39]:
for name,model in models.items():
  model.fit(x_train,y_train)
  print(f'{name} : {model.score(x_test,y_test)}')

Random Forest : -0.35797975671078897
XGB Regressor : -0.753019926882744
CatBoost : -0.36386990893101445


In [40]:
from sklearn.metrics import mean_squared_error

In [43]:
for name,model in models.items():
  y_pred = model.predict(x_test)
  mse = mean_squared_error(y_test,y_pred)
  print(name +' MSE:{:.4f} '.format(mse))

Random Forest MSE:1.0731 
XGB Regressor MSE:1.3853 
CatBoost MSE:1.0778 


In [44]:
# Conclusion That XGB Regressor provides the most accurate model