In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Prosper_Loan_Cleaned_data.csv")
df.shape

(113937, 64)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 64 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   ListingCreationDate                  113937 non-null  object 
 1   CreditGrade                          113937 non-null  object 
 2   LoanStatus                           113937 non-null  object 
 3   ClosedDate                           113937 non-null  object 
 4   ProsperRating (Alpha)                113937 non-null  object 
 5   BorrowerState                        113937 non-null  object 
 6   Occupation                           113937 non-null  object 
 7   EmploymentStatus                     113937 non-null  object 
 8   FirstRecordedCreditLine              113937 non-null  object 
 9   IncomeRange                          113937 non-null  object 
 10  LoanOriginationDate                  113937 non-null  object 
 11  LoanOriginati

In [4]:
df.isnull().sum().sum()

0

In [5]:
df.duplicated().sum()

0

##  Loan Tenure:

In [6]:
df["Term"].describe()

count    113937.000000
mean         40.830248
std          10.436212
min          12.000000
25%          36.000000
50%          36.000000
75%          36.000000
max          60.000000
Name: Term, dtype: float64

In [7]:
date_cols = ["ClosedDate", "LoanOriginationDate", "ListingCreationDate"]

df.dropna()

df.shape

(113937, 64)

In [8]:
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')
df.shape

(113937, 64)

In [9]:
print(df.shape)
df[date_cols].head()

(113937, 64)


Unnamed: 0,ClosedDate,LoanOriginationDate,ListingCreationDate
0,2009-08-14,2007-12-09,2023-12-23 09:29:18
1,2014-04-03,2014-03-03,NaT
2,2009-12-17,2007-01-17,2023-12-23 00:47:06
3,2014-04-03,2012-01-11,2023-12-23 02:35:00
4,2014-04-03,2013-09-20,NaT


**LoanTenure = (𝑀𝑎𝑡𝑢𝑟𝑖𝑡𝑦𝐷𝑎𝑡_𝑂𝑟𝑖𝑔𝑖𝑛𝑎𝑙𝑦𝑒𝑎𝑟 − 𝐿𝑜𝑎𝑛𝐷𝑎𝑡𝑒𝑦𝑒𝑎𝑟) 𝑥 12 −
(𝑀𝑎𝑡𝑢𝑟𝑖𝑡𝑦𝐷𝑎𝑡𝑒_𝑂𝑟𝑖𝑔𝑖𝑛𝑎𝑙𝑚𝑜𝑛𝑡ℎ − 𝐿𝑜𝑎𝑛𝐷𝑎𝑡𝑒𝑚𝑜𝑛𝑡ℎ)**

In [10]:
df["LoanTenure"] = ((df["ClosedDate"].dt.year - df["LoanOriginationDate"].dt.year) * 12) - (df["ClosedDate"].dt.month - df["LoanOriginationDate"].dt.month)

In [11]:
print(df["LoanTenure"].shape)
df.shape

(113937,)


(113937, 65)

In [12]:
df["LoanTenure"].describe()

count    113928.000000
mean         17.853855
std          12.778681
min         -27.000000
25%          11.000000
50%          17.000000
75%          27.000000
max         204.000000
Name: LoanTenure, dtype: float64

In [13]:
print(df["Term"].describe())
print(df["LoanTenure"].describe())

count    113937.000000
mean         40.830248
std          10.436212
min          12.000000
25%          36.000000
50%          36.000000
75%          36.000000
max          60.000000
Name: Term, dtype: float64
count    113928.000000
mean         17.853855
std          12.778681
min         -27.000000
25%          11.000000
50%          17.000000
75%          27.000000
max         204.000000
Name: LoanTenure, dtype: float64


In [14]:
df.drop(date_cols, axis=1, inplace=True)
df.drop("LoanTenure", axis=1, inplace=True)
df.rename(columns={"Term" : "LoanTenure"}, inplace=True)

In [15]:
df.shape

(113937, 61)

# Equated Monthly Installments (EMI):

-	Tenure ---> **LoanTenure**
-	Principle repayment ---> **LP_CustomerPrinciplePayments**
-	Interest ---> **BorrowerRate**


In [16]:
emi_cols = ["LP_CustomerPayments", "LP_CustomerPrincipalPayments"]
df[emi_cols].head()

Unnamed: 0,LP_CustomerPayments,LP_CustomerPrincipalPayments
0,11396.14,9425.0
1,0.0,0.0
2,4186.63,3001.0
3,5143.2,4091.09
4,2819.85,1563.22


**For each row in the dataset:**
1. Calculate result_1 = P * r * 〖(1+r)〗^n
2. Calculate result_2 = 〖(1+r)〗^n – 1
3. Calculate EMI = result_1 / result_2

In [17]:
def cal_EMI(P, r, n):
  P = P.values
  r = r.values
  n = n.values
  #print(P.shape[0])
  result_1 = np.empty(0)
  result_2 = np.empty(0)
  result = np.empty(0)
  for i in range(P.shape[0]):
    #print(P[i])
    #print(r[i])
    #print(n[i])
    # EMI = P × r × (1 + r) ^ n / ((1 + r) ^ n – 1)
    #print(P[i] * (1 + r[i]))
    result_1 = np.append(result_1, P[i] * r[i] * np.power((1 + r[i]),n[i]))
    result_2 = np.append(result_2, np.power((1 + r[i]),n[i]) - 1)
    result = np.append(result, (result_1[i] / result_2[i]))

  return result

In [18]:
df["BorrowerRate"].describe()

count    113937.000000
mean          0.192764
std           0.074818
min           0.000000
25%           0.134000
50%           0.184000
75%           0.250000
max           0.497500
Name: BorrowerRate, dtype: float64

In [19]:
df["EMI"] = cal_EMI(df["LP_CustomerPrincipalPayments"], df["BorrowerRate"], df["LoanTenure"])

df["EMI"].describe()

count    113929.000000
mean        558.230883
std         720.251611
min           0.000000
25%          90.728391
50%         287.662239
75%         739.577224
max        8780.054436
Name: EMI, dtype: float64

# Eligible Loan Amount (ELA):

**Components of ELA:**
-	A: “AppliedAmount” ---> **LoanOriginalAmount**
-	R: “Interest” ---> **BorrowerRate**
-	N: “LoanTenure” ---> **LoanTenure**
-	I: “IncomeTotal”  ---> **StatedMonthlyIncome**

In [20]:
ela_cols = ['DebtToIncomeRatio', 'IncomeRange', 'IncomeVerifiable',
       'StatedMonthlyIncome']

df[ela_cols].head()

Unnamed: 0,DebtToIncomeRatio,IncomeRange,IncomeVerifiable,StatedMonthlyIncome
0,0.17,"$25,000-49,999",True,3083.333333
1,0.18,"$50,000-74,999",True,6125.0
2,0.06,Not displayed,True,2083.333333
3,0.15,"$25,000-49,999",True,2875.0
4,0.26,"$100,000+",True,9583.333333


In [21]:
df_new = df[df['IncomeVerifiable'] == True]

print(df_new.shape)
print(df.shape[0] - df_new.shape[0])

(105268, 62)
8669


**Calculation Procedure:**
**For each row in the dataset:**
1.	Calculate: Total Payment Due = (A + (A*r)) * n
2.	Calculate: Max allowable amount = I * 12 * 30%
3.	If ( Total Payment Due <= Max allowable amount)
            Then ELA = AppliedAmount
            Else ELA = Max allowable amount


In [22]:
def eligible_loan_amnt(df):
  df['Ava_Inc'] = (df['StatedMonthlyIncome'] * 12 * 0.3) * df['LoanTenure']
  df['Total_Loan_Amnt'] = np.round(df['LoanOriginalAmount'] + (df['LoanOriginalAmount'] * df['BorrowerRate']) *df['LoanTenure'])

  ELA = np.empty(0)

  for i in range(len(df['Ava_Inc'].values)):
    if df['Ava_Inc'].iloc[i] <= 0:
      ELA = np.append(ELA, 0)
    elif df['Total_Loan_Amnt'].iloc[i] <= df['Ava_Inc'].iloc[i]:
      ELA = np.append(ELA, df['Total_Loan_Amnt'].iloc[i])
    else:
      ELA = np.append(ELA, df['Ava_Inc'].iloc[i])

  df.drop(["Ava_Inc", "Total_Loan_Amnt"], axis=1, inplace=True)

  return ELA

In [23]:
df['ELA'] = eligible_loan_amnt(df)

df['ELA'].describe()

count    113937.000000
mean      70997.027956
std       60375.953468
min           0.000000
25%       28057.000000
50%       49749.000000
75%       96184.000000
max      416990.000000
Name: ELA, dtype: float64

In [24]:
# Dropping extra cols
['CreditGrade', 'ClosedDate',
       'ProsperRating (Alpha)', 'ListingCategory', 'BorrowerState',
       'Occupation', 'EmploymentStatus', 'IsBorrowerHomeowner',
       'CurrentlyInGroup', 'GroupKey', 'FirstRecordedCreditLine',
       'IncomeRange', 'IncomeVerifiable', 'LoanOriginationDate']
df.drop(['LoanStatus'], axis=1, inplace=True)

In [25]:
df.shape

(113937, 62)

## ROI

In [26]:
df['InterestAmount'] = (df['LoanOriginalAmount']*(df['BorrowerRate']))
df['TotalAmount'] = (df['InterestAmount'] + df['LoanOriginalAmount'])
df['ROI'] = (df['InterestAmount'] / df['TotalAmount'])

In [27]:
 print(df['ROI'].describe())

count    113937.000000
mean          0.158343
std           0.052204
min           0.000000
25%           0.118166
50%           0.155405
75%           0.200000
max           0.332220
Name: ROI, dtype: float64


In [28]:
df.head()

Unnamed: 0,CreditGrade,ProsperRating (Alpha),BorrowerState,Occupation,EmploymentStatus,FirstRecordedCreditLine,IncomeRange,LoanOriginationQuarter,LoanTenure,ListingCategory (numeric),...,InvestmentFromFriendsAmount,CreditScore,IsBorrowerHomeowner,CurrentlyInGroup,IncomeVerifiable,EMI,ELA,InterestAmount,TotalAmount,ROI
0,C,not available,CO,Other,Self-employed,11-10-2001 00:00,"$25,000-49,999",Q3 2007,36,0,...,0.0,649.5,True,True,True,1496.76457,63034.0,1489.15,10914.15,0.136442
1,not available,A,CO,Professional,Employed,18-03-1996 00:00,"$50,000-74,999",Q1 2014,36,2,...,0.0,689.5,False,False,True,0.0,43120.0,920.0,10920.0,0.084249
2,HR,not available,GA,Other,Not available,27-07-2002 00:00,Not displayed,Q1 2007,36,0,...,0.0,489.5,False,True,True,825.406311,32711.0,825.275,3826.275,0.215686
3,not available,A,GA,Skilled Labor,Employed,28-02-1983 00:00,"$25,000-49,999",Q4 2012,36,16,...,0.0,809.5,True,False,True,413.021192,45064.0,974.0,10974.0,0.088755
4,not available,D,MN,Executive,Employed,20-02-2004 00:00,"$100,000+",Q3 2013,36,2,...,0.0,689.5,True,False,True,326.288414,127590.0,3127.5,18127.5,0.172528


In [29]:
df.isnull().sum()

CreditGrade              0
ProsperRating (Alpha)    0
BorrowerState            0
Occupation               0
EmploymentStatus         0
                        ..
EMI                      8
ELA                      0
InterestAmount           0
TotalAmount              0
ROI                      0
Length: 65, dtype: int64

In [30]:
df.dropna(inplace=True)

In [31]:
##taking ROI,EMI and ELA as target variable
## kepping ROI,EMI,ELA in y and rest in x

y = df[['ROI','EMI','ELA']]
df.drop(columns=['ROI','EMI','ELA'],axis=1,inplace=True)

In [32]:
x = df

In [33]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113929 entries, 0 to 113936
Data columns (total 62 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   CreditGrade                          113929 non-null  object 
 1   ProsperRating (Alpha)                113929 non-null  object 
 2   BorrowerState                        113929 non-null  object 
 3   Occupation                           113929 non-null  object 
 4   EmploymentStatus                     113929 non-null  object 
 5   FirstRecordedCreditLine              113929 non-null  object 
 6   IncomeRange                          113929 non-null  object 
 7   LoanOriginationQuarter               113929 non-null  object 
 8   LoanTenure                           113929 non-null  int64  
 9   ListingCategory (numeric)            113929 non-null  int64  
 10  OpenRevolvingAccounts                113929 non-null  int64  
 11  LoanCurrentDa

### Data Encoding

In [34]:
categorical_data = x.select_dtypes(include=['object','bool'])
categorical_data.head()

Unnamed: 0,CreditGrade,ProsperRating (Alpha),BorrowerState,Occupation,EmploymentStatus,FirstRecordedCreditLine,IncomeRange,LoanOriginationQuarter,IsBorrowerHomeowner,CurrentlyInGroup,IncomeVerifiable
0,C,not available,CO,Other,Self-employed,11-10-2001 00:00,"$25,000-49,999",Q3 2007,True,True,True
1,not available,A,CO,Professional,Employed,18-03-1996 00:00,"$50,000-74,999",Q1 2014,False,False,True
2,HR,not available,GA,Other,Not available,27-07-2002 00:00,Not displayed,Q1 2007,False,True,True
3,not available,A,GA,Skilled Labor,Employed,28-02-1983 00:00,"$25,000-49,999",Q4 2012,True,False,True
4,not available,D,MN,Executive,Employed,20-02-2004 00:00,"$100,000+",Q3 2013,True,False,True


In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
categorical_data = categorical_data.apply(LabelEncoder().fit_transform)

In [36]:
categorical_data.head()

Unnamed: 0,CreditGrade,ProsperRating (Alpha),BorrowerState,Occupation,EmploymentStatus,FirstRecordedCreditLine,IncomeRange,LoanOriginationQuarter,IsBorrowerHomeowner,CurrentlyInGroup,IncomeVerifiable
0,3,7,5,35,7,4232,3,17,1,1,1
1,8,0,5,41,0,6672,4,7,0,0,1
2,6,7,10,35,2,10091,6,1,0,1,1
3,8,0,10,50,0,10291,3,31,1,0,1
4,8,4,23,19,0,7393,2,23,1,0,1


In [37]:
numerical_data = x.select_dtypes(include=['int','float'])
numerical_data.head()

Unnamed: 0,LoanTenure,ListingCategory (numeric),OpenRevolvingAccounts,LoanCurrentDaysDelinquent,LoanOriginalAmount,Recommendations,InvestmentFromFriendsCount,Investors,Status,BorrowerAPR,...,ProsperPrincipalBorrowed,ProsperPrincipalOutstanding,MonthlyLoanPayment,LP_CustomerPayments,LP_CustomerPrincipalPayments,PercentFunded,InvestmentFromFriendsAmount,CreditScore,InterestAmount,TotalAmount
0,36,0,1,0,9425,0,0,258,0,0.16516,...,8472.311961,2930.313906,330.43,11396.14,9425.0,1.0,0.0,649.5,1489.15,10914.15
1,36,2,13,0,10000,0,0,1,0,0.12016,...,8472.311961,2930.313906,318.93,0.0,0.0,1.0,0.0,689.5,920.0,10920.0
2,36,0,0,0,3001,0,0,41,0,0.28269,...,8472.311961,2930.313906,123.32,4186.63,3001.0,1.0,0.0,489.5,825.275,3826.275
3,36,16,7,0,10000,0,0,158,0,0.12528,...,8472.311961,2930.313906,321.45,5143.2,4091.09,1.0,0.0,809.5,974.0,10974.0
4,36,2,6,0,15000,0,0,20,0,0.24614,...,11000.0,9947.9,563.97,2819.85,1563.22,1.0,0.0,689.5,3127.5,18127.5


In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# column names are (annoyingly) lost after Scaling
# (i.e. the dataframe is converted to a numpy ndarray)

numerical_data_rescaled = pd.DataFrame(scaler.fit_transform(numerical_data), 
                                    columns = numerical_data.columns, 
                                    index = numerical_data.index)

numerical_data_rescaled.head()

Unnamed: 0,LoanTenure,ListingCategory (numeric),OpenRevolvingAccounts,LoanCurrentDaysDelinquent,LoanOriginalAmount,Recommendations,InvestmentFromFriendsCount,Investors,Status,BorrowerAPR,...,ProsperPrincipalBorrowed,ProsperPrincipalOutstanding,MonthlyLoanPayment,LP_CustomerPayments,LP_CustomerPrincipalPayments,PercentFunded,InvestmentFromFriendsAmount,CreditScore,InterestAmount,TotalAmount
0,-0.462857,-0.694123,-1.289121,-0.327719,0.174164,-0.14451,-0.100827,1.71948,-0.39033,-0.668191,...,-3.54464e-12,2.100031e-11,0.300707,1.505678,1.553066,0.079053,-0.056028,-0.687575,0.034815,0.157519
1,-0.462857,-0.193729,1.302124,-0.327719,0.266229,-0.14451,-0.100827,-0.769862,-0.39033,-1.228315,...,-3.54464e-12,2.100031e-11,0.241026,-0.873168,-0.763162,0.079053,-0.056028,-0.084073,-0.520337,0.158339
2,-0.462857,-0.694123,-1.505059,-0.327719,-0.854399,-0.14451,-0.100827,-0.382416,-0.39033,0.794728,...,-3.54464e-12,2.100031e-11,-0.774107,0.000755,-0.025656,0.079053,-0.056028,-3.101585,-0.612733,-0.83627
3,-0.462857,3.309026,0.006501,-0.327719,0.266229,-0.14451,-0.100827,0.750864,-0.39033,-1.164585,...,-3.54464e-12,2.100031e-11,0.254104,0.200431,0.242238,0.079053,-0.056028,1.726434,-0.467665,0.165911
4,-0.462857,-0.193729,-0.209436,-0.327719,1.066791,-0.14451,-0.100827,-0.585825,-0.39033,0.339783,...,0.7763075,4.18721,1.51268,-0.284549,-0.378995,0.079053,-0.056028,-0.084073,1.632873,1.168902


In [39]:
scaled_data = pd.concat([categorical_data,numerical_data_rescaled],axis=1)
scaled_data.head()

Unnamed: 0,CreditGrade,ProsperRating (Alpha),BorrowerState,Occupation,EmploymentStatus,FirstRecordedCreditLine,IncomeRange,LoanOriginationQuarter,IsBorrowerHomeowner,CurrentlyInGroup,...,ProsperPrincipalBorrowed,ProsperPrincipalOutstanding,MonthlyLoanPayment,LP_CustomerPayments,LP_CustomerPrincipalPayments,PercentFunded,InvestmentFromFriendsAmount,CreditScore,InterestAmount,TotalAmount
0,3,7,5,35,7,4232,3,17,1,1,...,-3.54464e-12,2.100031e-11,0.300707,1.505678,1.553066,0.079053,-0.056028,-0.687575,0.034815,0.157519
1,8,0,5,41,0,6672,4,7,0,0,...,-3.54464e-12,2.100031e-11,0.241026,-0.873168,-0.763162,0.079053,-0.056028,-0.084073,-0.520337,0.158339
2,6,7,10,35,2,10091,6,1,0,1,...,-3.54464e-12,2.100031e-11,-0.774107,0.000755,-0.025656,0.079053,-0.056028,-3.101585,-0.612733,-0.83627
3,8,0,10,50,0,10291,3,31,1,0,...,-3.54464e-12,2.100031e-11,0.254104,0.200431,0.242238,0.079053,-0.056028,1.726434,-0.467665,0.165911
4,8,4,23,19,0,7393,2,23,1,0,...,0.7763075,4.18721,1.51268,-0.284549,-0.378995,0.079053,-0.056028,-0.084073,1.632873,1.168902


In [40]:
# split into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, train_size=0.7, random_state=100)

## Feature Importance

In [41]:
from sklearn.ensemble import RandomForestRegressor

# Assuming X_train and y_train are your training data
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Feature importances
importances = model.feature_importances_

In [42]:
importances

array([3.87418624e-07, 1.21733098e-06, 3.02835680e-06, 5.94179505e-06,
       1.62312191e-06, 4.33953656e-06, 2.09692454e-04, 1.78500515e-06,
       2.36917974e-07, 1.74360504e-07, 1.27339216e-06, 1.39785755e-01,
       9.47441950e-07, 2.87925499e-06, 1.50072720e-06, 1.10747384e-05,
       1.64022237e-07, 4.37196892e-08, 2.48872118e-06, 4.31918791e-07,
       4.37141228e-05, 1.24360908e-05, 9.20161517e-06, 7.11748435e-06,
       8.18099636e-06, 1.65168000e-05, 1.19322551e-06, 1.73240228e-06,
       1.37058773e-06, 1.42980308e-06, 3.92905161e-06, 1.74239864e-06,
       7.96721233e-06, 1.48670623e-06, 6.33397383e-07, 1.19175835e-06,
       7.74668086e-07, 5.68803621e-07, 2.31424818e-08, 1.70763780e-06,
       1.82403767e-06, 2.25504515e-06, 1.55110889e-06, 1.15607104e-06,
       5.88943074e-07, 6.85198147e-05, 2.32011979e-02, 3.17726723e-07,
       2.57336864e-06, 2.42309206e-06, 1.05571278e-06, 2.62257096e-07,
       8.75081036e-07, 4.43991064e-07, 9.27470632e-04, 4.62568246e-05,
      

In [43]:
improtance_info = pd.Series(importances)
improtance_info.index = X_train.columns
importance_columns = improtance_info.sort_values(ascending=False).head(20)

In [44]:
importance_columns.index

Index(['InterestAmount', 'LoanTenure', 'StatedMonthlyIncome', 'TotalAmount',
       'MonthlyLoanPayment', 'IncomeRange', 'DebtToIncomeRatio',
       'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'BorrowerAPR',
       'EstimatedReturn', 'BorrowerRate', 'LoanOriginalAmount', 'LenderYield',
       'EstimatedLoss', 'InquiriesLast6Months', 'EstimatedEffectiveYield',
       'Occupation', 'FirstRecordedCreditLine', 'TotalCreditLinespast7years'],
      dtype='object')

In [45]:
X_train = X_train[['InterestAmount', 'LoanTenure', 'StatedMonthlyIncome', 'TotalAmount',
       'MonthlyLoanPayment', 'IncomeRange', 'DebtToIncomeRatio',
       'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'BorrowerAPR',
       'EstimatedReturn', 'BorrowerRate', 'LenderYield', 'LoanOriginalAmount',
       'EstimatedLoss', 'InquiriesLast6Months', 'EstimatedEffectiveYield',
       'FirstRecordedCreditLine', 'Occupation', 'OnTimeProsperPayments']]

X_test = X_test[['InterestAmount', 'LoanTenure', 'StatedMonthlyIncome', 'TotalAmount',
       'MonthlyLoanPayment', 'IncomeRange', 'DebtToIncomeRatio',
       'LP_CustomerPrincipalPayments', 'LP_CustomerPayments', 'BorrowerAPR',
       'EstimatedReturn', 'BorrowerRate', 'LenderYield', 'LoanOriginalAmount',
       'EstimatedLoss', 'InquiriesLast6Months', 'EstimatedEffectiveYield',
       'FirstRecordedCreditLine', 'Occupation', 'OnTimeProsperPayments']]

In [46]:
X_train

Unnamed: 0,InterestAmount,LoanTenure,StatedMonthlyIncome,TotalAmount,MonthlyLoanPayment,IncomeRange,DebtToIncomeRatio,LP_CustomerPrincipalPayments,LP_CustomerPayments,BorrowerAPR,EstimatedReturn,BorrowerRate,LenderYield,LoanOriginalAmount,EstimatedLoss,InquiriesLast6Months,EstimatedEffectiveYield,FirstRecordedCreditLine,Occupation,OnTimeProsperPayments
101761,-0.926458,-2.762489,0.062437,-0.797381,0.263290,4,-0.233817,0.121548,-0.064241,-0.254073,-5.857251e-01,-0.706889,-0.708909,-0.758492,-8.252766e-01,0.646891,-6.932319e-01,240,0,-2.324576e+00
91234,-0.083355,1.836774,-0.057125,0.080944,-0.298015,4,-0.103126,-0.636597,-0.649491,-0.541727,-5.327487e-01,-0.545131,-0.546495,0.106116,-6.914715e-01,-0.586321,-5.783180e-01,388,41,7.374010e-11
93570,-0.403289,-0.462857,0.197264,0.175165,0.270192,5,-0.233817,1.694372,1.440206,-1.166701,-1.538539e+00,-1.186814,-1.190782,0.266229,-1.048285e+00,-0.586321,-1.268648e+00,135,41,7.374010e-11
56536,1.857705,-0.462857,0.085837,1.902269,2.275548,5,0.083575,4.151910,3.349670,-0.177024,-2.770134e-01,-0.332574,-0.333075,1.867354,-5.056311e-01,-0.586321,-3.378280e-01,4369,23,7.374010e-11
74829,1.391148,-0.462857,0.475834,0.292756,0.531435,2,-0.177806,1.448372,2.075775,1.558861,-3.182040e-09,1.700760,1.708511,0.105956,-6.769677e-09,-0.586321,1.214385e-09,4282,38,7.374010e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65618,-1.066419,-0.462857,-0.616158,-1.175035,-1.169760,1,1.166441,-0.505121,-0.589862,2.034593,3.107382e+00,2.008233,2.017232,-1.166779,1.479145e+00,-0.586321,1.512756e-01,3214,59,6.909156e-01
77659,-0.283801,-0.462857,-0.299703,-0.158184,-0.055299,3,0.251606,1.079982,1.077153,-0.523679,-3.182040e-09,-0.505026,-0.506227,-0.134053,-6.769677e-09,-0.175250,1.214385e-09,7243,35,7.374010e-11
79689,1.827467,1.836774,0.364406,1.196873,0.743845,2,-0.159136,2.923142,2.429479,0.348247,1.746764e+00,0.387983,0.390406,1.066791,-3.817375e-01,-0.175250,6.454563e-01,11268,2,7.374010e-11
56090,0.273644,-0.462857,0.475834,0.272470,0.444925,2,-0.271157,0.175272,0.323237,-0.106947,-1.855432e-01,-0.259047,-0.259250,0.266229,-4.436843e-01,1.469032,-2.549004e-01,2254,47,-1.962717e+00


In [47]:
y_train

Unnamed: 0,ROI,EMI,ELA
101761,0.122730,635.728895,9644.0
91234,0.131944,78.297612,91080.0
93570,0.094203,1070.383507,47440.0
56536,0.143762,3370.621363,140888.0
74829,0.242424,2879.808232,112667.0
...,...,...,...
65618,0.255398,360.158826,14015.0
77659,0.134199,1169.024735,49350.0
79689,0.181535,3327.020047,214620.0
56090,0.147776,664.245483,72424.0


In [48]:
y_train_array = y_train.values
y_train_array

array([[1.22730064e-01, 6.35728895e+02, 9.64400000e+03],
       [1.31944444e-01, 7.82976116e+01, 9.10800000e+04],
       [9.42028986e-02, 1.07038351e+03, 4.74400000e+04],
       ...,
       [1.81535440e-01, 3.32702005e+03, 2.14620000e+05],
       [1.47775695e-01, 6.64245483e+02, 7.24240000e+04],
       [6.14734866e-02, 7.70920482e+02, 3.54970000e+04]])

### Multi-Output Regressor

- Multi target regression.

This strategy consists of fitting one regressor per target. This is a simple strategy for extending regressors that do not natively support multi-target regression.
- Reference : https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html

## Multi-Output Regressor Using Linear Regression

In [49]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression


# Create a base regressor (Linear Regression)
base_regressor = LinearRegression()

# Create a MultiOutputRegressor with the base regressor
model = MultiOutputRegressor(base_regressor)

# Fit the model
model.fit(X_train, y_train_array)


In [50]:
y_train_pred = model.predict(X_train)

In [51]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_train, y_train_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_train, y_train_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))

print(metrics.r2_score(y_train, y_train_pred))

Mean Absolute Error:  2230.9796798037223
Mean Squared Error:  64016594.257473916
Root Mean Squared Error:  8001.037073871981
0.9523580936972014


In [52]:
y_test_pred = model.predict(X_test)

In [53]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

print(metrics.r2_score(y_test, y_test_pred))

Mean Absolute Error:  2278.1984988412137
Mean Squared Error:  67171608.64016724
Root Mean Squared Error:  8195.82873418956
0.9519137654487689


## Multi-Output Regressor Using Ridge  Regression

In [54]:
from sklearn.linear_model import Ridge


# Create a base regressor (Linear Regression)
base_regressor = Ridge(alpha=1)

# Create a MultiOutputRegressor with the base regressor
model = MultiOutputRegressor(base_regressor)

# Fit the model
model.fit(X_train, y_train_array)


In [55]:
y_train_pred = model.predict(X_train)

In [56]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_train, y_train_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_train, y_train_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))

print(metrics.r2_score(y_train, y_train_pred))

Mean Absolute Error:  2230.789819208892
Mean Squared Error:  64016599.442944795
Root Mean Squared Error:  8001.037397921897
0.9523580311139369


In [57]:
y_test_pred = model.predict(X_test)

In [58]:
from sklearn import metrics

print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred))

print('Mean Squared Error: ', metrics.mean_squared_error(y_test, y_test_pred))

print('Root Mean Squared Error: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))

print(metrics.r2_score(y_test, y_test_pred))

Mean Absolute Error:  2278.0284178728457
Mean Squared Error:  67173381.17668216
Root Mean Squared Error:  8195.936869979061
0.9519129724352856
