#### Importing Required Packages

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
from sklearn.ensemble import RandomForestClassifier
!pip install pycaret
from pycaret.classification import *
from pycaret.regression import *



#### Importing the data

In [15]:
data=pd.read_csv("data/FannieAcq2019Q1FMac30bin9.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,sellerName,origChannel,origIntRate,origUPB,origDate,firstPmtDate,origLTV,origCLTV,numBorrowers,...,loanPurp,propState,zipCode,pMIperct,mortInsType,fmacRate,fredRate,rateDiff,rateDiffPct,rank
0,0,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",1,4.625,160000,2019-02-01,4/1/2019,80,80,2,...,C,IN,462,0,0,4.46,2.7,0.165,0.036996,3
1,1,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",1,4.5,510000,2019-02-01,4/1/2019,64,64,2,...,C,CA,913,0,0,4.46,2.7,0.04,0.008969,2
2,2,QUICKEN LOANS INC.,0,4.75,143000,2019-02-01,4/1/2019,68,68,1,...,C,NC,286,0,0,4.46,2.7,0.29,0.065022,3
3,3,"UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITE...",1,4.5,160000,2019-02-01,4/1/2019,17,17,1,...,C,CA,920,0,0,4.46,2.7,0.04,0.008969,2
4,4,PENNYMAC CORP.,2,4.5,320000,2019-02-01,4/1/2019,74,74,2,...,R,LA,707,0,0,4.46,2.7,0.04,0.008969,2


In [16]:
data.rename(columns={'Unnamed: 0':'namedCol'}, inplace=True )

In [17]:
data.shape

(28939, 25)

In [18]:
#Checking for missing values
data.isnull().sum()

namedCol            0
sellerName          0
origChannel         0
origIntRate         0
origUPB             0
origDate            0
firstPmtDate        0
origLTV             0
origCLTV            0
numBorrowers        0
origDebtIncRatio    0
borrCreditScore     0
BestCreditScore     0
WorstCreditScore    0
AvgCreditScore      0
loanPurp            0
propState           0
zipCode             0
pMIperct            0
mortInsType         0
fmacRate            0
fredRate            0
rateDiff            0
rateDiffPct         0
rank                0
dtype: int64

In [19]:
#Checking datatypes of individual feature
data.dtypes

namedCol              int64
sellerName           object
origChannel           int64
origIntRate         float64
origUPB               int64
origDate             object
firstPmtDate         object
origLTV               int64
origCLTV              int64
numBorrowers          int64
origDebtIncRatio      int64
borrCreditScore       int64
BestCreditScore       int64
WorstCreditScore      int64
AvgCreditScore      float64
loanPurp             object
propState            object
zipCode               int64
pMIperct              int64
mortInsType           int64
fmacRate            float64
fredRate            float64
rateDiff            float64
rateDiffPct         float64
rank                  int64
dtype: object

In [56]:
#Dropping 'gameId' feature as it's not required in model building and prediction
data.drop(['namedCol', 'sellerName', 'origChannel', 'origDate', 'firstPmtDate', 'borrCreditScore', 'BestCreditScore', 'WorstCreditScore',
           'loanPurp', 'propState', 'zipCode', 'pMIperct', 'mortInsType', 'numBorrowers',
          'rank'],1,inplace=True)

KeyError: "['namedCol' 'sellerName' 'origChannel' 'origDate' 'firstPmtDate'\n 'borrCreditScore' 'BestCreditScore' 'WorstCreditScore' 'loanPurp'\n 'propState' 'zipCode' 'pMIperct' 'mortInsType' 'rank'] not found in axis"

In [21]:
data.head()

Unnamed: 0,origIntRate,origUPB,origLTV,origCLTV,numBorrowers,origDebtIncRatio,AvgCreditScore,fmacRate,fredRate,rateDiff,rateDiffPct
0,4.625,160000,80,80,2,34,755.0,4.46,2.7,0.165,0.036996
1,4.5,510000,64,64,2,40,769.5,4.46,2.7,0.04,0.008969
2,4.75,143000,68,68,1,16,674.0,4.46,2.7,0.29,0.065022
3,4.5,160000,17,17,1,43,738.0,4.46,2.7,0.04,0.008969
4,4.5,320000,74,74,2,26,729.0,4.46,2.7,0.04,0.008969


In [22]:
# #Dividing features into numerical and categorical features
# categorical=[]
# numerical=[]
# for col in list(data):
#     if(len(data[col].unique())<=30):
#         categorical.append(col)
#     else:
#         numerical.append(col)

### Chi-Square test for Feature Importance of Categorical Features

As majority of the features in the data are categorical, also the target feature is categorical we can use Chi-Square test for to get the feature importance.

In [23]:
def Chi_square(col_1,col_2):
    X=data[col_1].astype('str')
    Y=data[col_2].astype('str')
    observed_values=pd.crosstab(Y,X)
    chi2, p, dof, expected = ss.chi2_contingency(observed_values)
    if(p>0.05):
        print(col_1," is not required")
    else:
        print(col_1," is required")
        
for col in data:
    Chi_square(col,"origIntRate")

origIntRate  is required
origUPB  is required
origLTV  is required
origCLTV  is required
numBorrowers  is not required
origDebtIncRatio  is not required
AvgCreditScore  is required
fmacRate  is required
fredRate  is required
rateDiff  is required
rateDiffPct  is required


### Feature Selection using Backward Elimination for Numerical Features

Using Backward Elimination method for numerical features

In [24]:
X=data.drop(['origIntRate'],1)
y=le.fit_transform(data["origIntRate"])

import statsmodels.api as sm
cols = list(X.columns)
pmax = 1
while (pmax>0.05):
    p=[]
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        breakselected_features_BE = cols
print("Best features using Backward Elimination: ",cols)

Best features using Backward Elimination:  ['origUPB', 'origLTV', 'origCLTV', 'origDebtIncRatio', 'AvgCreditScore', 'fmacRate', 'rateDiff', 'rateDiffPct']


### Feature Importance using Random Forest Classifier

Random Forest is considered to be one of the most unbiased model. As it creates multiple Decision Trees taking into account Random Features for each Decision Tree.

Because of this randomness the Random Forest Classifier considerd to be giving most unbiased Feature Importance

In [38]:
X_rfc=data.drop(["origIntRate"],1)
y_rfc=data["origIntRate"]

In [39]:
# rfc=RandomForestClassifier(criterion='entropy',random_state=0,n_estimators=10)
# rfc.fit(X_rfc,y_rfc)

In [40]:
# plt.figure(figsize=(10,10))
# plt.barh(list(X_rfc),rfc.feature_importances_)
# plt.title("Feature Importance using Random Forest Classifier")
# plt.ylabel("Features")
# plt.xlabel('Feature Importance Value')

In [49]:
data.head()

Unnamed: 0,origIntRate,origUPB,origLTV,origCLTV,numBorrowers,origDebtIncRatio,AvgCreditScore,fmacRate,fredRate,rateDiff,rateDiffPct
0,4.625,160000,80,80,2,34,755.0,4.46,2.7,0.165,0.036996
1,4.5,510000,64,64,2,40,769.5,4.46,2.7,0.04,0.008969
2,4.75,143000,68,68,1,16,674.0,4.46,2.7,0.29,0.065022
3,4.5,160000,17,17,1,43,738.0,4.46,2.7,0.04,0.008969
4,4.5,320000,74,74,2,26,729.0,4.46,2.7,0.04,0.008969


### Model building using Pycaret Library

In [55]:
models=setup(data, target = 'origIntRate')

IntProgress(value=0, description='Processing: ', max=13)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
origIntRate,Label
origUPB,Numeric
origLTV,Numeric
origCLTV,Numeric
numBorrowers,Categorical
origDebtIncRatio,Numeric
AvgCreditScore,Numeric
fmacRate,Numeric
fredRate,Numeric
rateDiff,Numeric





ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [38]:
model_results=compare_models()
model_results

NameError: name 'y' is not defined

In [None]:
logreg_model=create_model('lr')

In [None]:
tunned_logreg_model=tune_model('lr')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='parameter')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='feature')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='pr')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='confusion_matrix')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='class_report')

In [None]:
plot_model(tunned_logreg_model)