#### Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
from sklearn.ensemble import RandomForestClassifier
# !pip install pycaret
from pycaret.regression import *

#### Importing the data

In [2]:
data=pd.read_csv("data/FannieAcq2000-19FMac30-sample25.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,loanIdentifier,origChannel,sellerName,origIntRate,origUPB,origDate,firstPmtDate,origLTV,origCLTV,...,pMIperct,mortInsType,bestCreditScore,worstCreditScore,avgCreditScore,fmacRate,fredRate,rateDiff,rateDiffPct,rank
0,248147,212381983889,1,OTHER,7.25,70000,2001-03-01,2001-04-01,67.0,67.0,...,0.0,0.0,625.0,620.0,622.5,7.12,4.87,0.13,0.018258,3.0
1,1251128,590266713253,3,"JPMORGAN CHASE BANK, NA",6.5,123000,2001-12-01,2002-02-01,86.0,86.0,...,25.0,1.0,762.0,762.0,762.0,7.02,4.78,-0.52,-0.074074,3.0
2,5622227,790109185814,3,"BANK OF AMERICA, N.A.",6.75,401000,2007-09-01,2007-11-01,78.0,78.0,...,0.0,0.0,686.0,630.0,658.0,6.45,4.54,0.3,0.046512,3.0
3,6014547,571967892461,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",7.125,215000,2008-06-01,2008-08-01,80.0,80.0,...,0.0,0.0,652.0,652.0,652.0,6.08,4.06,1.045,0.171875,4.0
4,8543620,192898085850,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",3.375,215000,2013-02-01,2013-04-01,74.0,74.0,...,0.0,0.0,799.0,799.0,799.0,3.53,2.04,-0.155,-0.043909,3.0


In [3]:
data.rename(columns={'Unnamed: 0':'namedCol'}, inplace=True )

In [4]:
data.shape

(2616152, 26)

In [5]:
data.columns

Index(['namedCol', 'loanIdentifier', 'origChannel', 'sellerName',
       'origIntRate', 'origUPB', 'origDate', 'firstPmtDate', 'origLTV',
       'origCLTV', 'numBorrowers', 'origDebtIncRatio', 'borrCreditScore',
       'loanPurp', 'propState', 'zipCode', 'pMIperct', 'mortInsType',
       'bestCreditScore', 'worstCreditScore', 'avgCreditScore', 'fmacRate',
       'fredRate', 'rateDiff', 'rateDiffPct', 'rank'],
      dtype='object')

In [6]:
data['sellerNameCat'] = data['sellerName'].astype('category')

In [7]:
data['sellerNameCat'] = data['sellerNameCat'].cat.codes

In [8]:
data.head()

Unnamed: 0,namedCol,loanIdentifier,origChannel,sellerName,origIntRate,origUPB,origDate,firstPmtDate,origLTV,origCLTV,...,mortInsType,bestCreditScore,worstCreditScore,avgCreditScore,fmacRate,fredRate,rateDiff,rateDiffPct,rank,sellerNameCat
0,248147,212381983889,1,OTHER,7.25,70000,2001-03-01,2001-04-01,67.0,67.0,...,0.0,625.0,620.0,622.5,7.12,4.87,0.13,0.018258,3.0,54
1,1251128,590266713253,3,"JPMORGAN CHASE BANK, NA",6.5,123000,2001-12-01,2002-02-01,86.0,86.0,...,1.0,762.0,762.0,762.0,7.02,4.78,-0.52,-0.074074,3.0,44
2,5622227,790109185814,3,"BANK OF AMERICA, N.A.",6.75,401000,2007-09-01,2007-11-01,78.0,78.0,...,0.0,686.0,630.0,658.0,6.45,4.54,0.3,0.046512,3.0,4
3,6014547,571967892461,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",7.125,215000,2008-06-01,2008-08-01,80.0,80.0,...,0.0,652.0,652.0,652.0,6.08,4.06,1.045,0.171875,4.0,45
4,8543620,192898085850,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",3.375,215000,2013-02-01,2013-04-01,74.0,74.0,...,0.0,799.0,799.0,799.0,3.53,2.04,-0.155,-0.043909,3.0,45


In [9]:
# Create categories out of loan purpose that can be fed into ML models: 
# data['loanPurpCat'] = data['loanPurp'].replace(['C','R'],[0,1])
# data['loanPurpCat'].unique()

In [10]:
# # Create categories out of Bank Name that can be fed into ML models: 
# data['bankNumber'] = data['sellerName'].replace(['UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITED WHOLESALE MORTGAGE',
#        'QUICKEN LOANS INC.', 'PENNYMAC CORP.',
#        'FAIRWAY INDEPENDENT MORTGAGE CORPORATION', 'OTHER',
#        'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION',
#        'WELLS FARGO BANK, N.A.', 'LAKEVIEW LOAN SERVICING, LLC',
#        'AMERIHOME MORTGAGE COMPANY, LLC', 'DITECH FINANCIAL LLC',
#        'FLAGSTAR BANK, FSB', 'FREEDOM MORTGAGE CORP.',
#        'TRUIST BANK (FORMERLY SUNTRUST BANK)', 'U.S. BANK N.A.',
#        'EAGLE HOME MORTGAGE, LLC', 'MOVEMENT MORTGAGE, LLC',
#        'NATIONSTAR MORTGAGE, LLC'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
# bankNumber = data['bankNumber'].unique()
# bankNumber

In [11]:
# Create categories out of State Name that can be fed into ML models: 
data['stateNumber'] = data['propState'].replace(['IN', 'CA', 'NC', 'LA', 'AZ', 'GA', 'CT', 'MN', 'MI', 'TX', 'TN',
       'MD', 'IL', 'KY', 'VA', 'WI', 'PA', 'OR', 'FL', 'MA', 'WA', 'MS',
       'SC', 'NJ', 'AL', 'RI', 'CO', 'NY', 'NE', 'NM', 'ID', 'AR', 'DC',
       'MO', 'IA', 'OH', 'DE', 'UT', 'MT', 'KS', 'ND', 'OK', 'WY', 'SD',
       'NV', 'ME', 'AK', 'NH', 'VT', 'HI', 'WV', 'PR', 'GU'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,4,43,44,45,46,47,48,49,50,51,52])
stateNumber = data['stateNumber'].unique()
stateNumber

array([2, 15, 5, 9, 19, 35, 49, 16, 12, 30, 22, 8, 1, 4, 23, 20, 14, 33,
       26, 47, 18, 44, 11, 17, 41, 0, 7, 27, 34, 31, 6, 28, 25, 13, 3, 10,
       43, 24, 45, 29, 38, 21, 46, 36, 51, 32, 37, 48, 40, 'VI', 39, 50,
       52], dtype=object)

In [12]:
data.head()

Unnamed: 0,namedCol,loanIdentifier,origChannel,sellerName,origIntRate,origUPB,origDate,firstPmtDate,origLTV,origCLTV,...,bestCreditScore,worstCreditScore,avgCreditScore,fmacRate,fredRate,rateDiff,rateDiffPct,rank,sellerNameCat,stateNumber
0,248147,212381983889,1,OTHER,7.25,70000,2001-03-01,2001-04-01,67.0,67.0,...,625.0,620.0,622.5,7.12,4.87,0.13,0.018258,3.0,54,2
1,1251128,590266713253,3,"JPMORGAN CHASE BANK, NA",6.5,123000,2001-12-01,2002-02-01,86.0,86.0,...,762.0,762.0,762.0,7.02,4.78,-0.52,-0.074074,3.0,44,15
2,5622227,790109185814,3,"BANK OF AMERICA, N.A.",6.75,401000,2007-09-01,2007-11-01,78.0,78.0,...,686.0,630.0,658.0,6.45,4.54,0.3,0.046512,3.0,4,5
3,6014547,571967892461,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",7.125,215000,2008-06-01,2008-08-01,80.0,80.0,...,652.0,652.0,652.0,6.08,4.06,1.045,0.171875,4.0,45,9
4,8543620,192898085850,3,"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION",3.375,215000,2013-02-01,2013-04-01,74.0,74.0,...,799.0,799.0,799.0,3.53,2.04,-0.155,-0.043909,3.0,45,19


In [13]:
#Checking for missing values
data.isnull().sum()

namedCol            0
loanIdentifier      0
origChannel         0
sellerName          0
origIntRate         0
origUPB             0
origDate            0
firstPmtDate        0
origLTV             0
origCLTV            0
numBorrowers        0
origDebtIncRatio    0
borrCreditScore     0
loanPurp            0
propState           0
zipCode             0
pMIperct            0
mortInsType         0
bestCreditScore     0
worstCreditScore    0
avgCreditScore      0
fmacRate            0
fredRate            0
rateDiff            0
rateDiffPct         0
rank                0
sellerNameCat       0
stateNumber         0
dtype: int64

In [14]:
#Checking datatypes of individual feature
data.dtypes

namedCol              int64
loanIdentifier        int64
origChannel           int64
sellerName           object
origIntRate         float64
origUPB               int64
origDate             object
firstPmtDate         object
origLTV             float64
origCLTV            float64
numBorrowers        float64
origDebtIncRatio    float64
borrCreditScore     float64
loanPurp              int64
propState            object
zipCode               int64
pMIperct            float64
mortInsType         float64
bestCreditScore     float64
worstCreditScore    float64
avgCreditScore      float64
fmacRate            float64
fredRate            float64
rateDiff            float64
rateDiffPct         float64
rank                float64
sellerNameCat          int8
stateNumber          object
dtype: object

In [15]:
#Dropping features  not required in model building and prediction
data.drop(['namedCol', 'loanIdentifier', 'sellerName', 'firstPmtDate', 'borrCreditScore', 'propState', 'bestCreditScore',
           'avgCreditScore', 'fredRate', 'rateDiff', 'rank'],1,inplace=True)

In [16]:
data.head()

Unnamed: 0,origChannel,origIntRate,origUPB,origDate,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,zipCode,pMIperct,mortInsType,worstCreditScore,fmacRate,rateDiffPct,sellerNameCat,stateNumber
0,1,7.25,70000,2001-03-01,67.0,67.0,2.0,16.0,2,281,0.0,0.0,620.0,7.12,0.018258,54,2
1,3,6.5,123000,2001-12-01,86.0,86.0,2.0,43.0,2,532,25.0,1.0,762.0,7.02,-0.074074,44,15
2,3,6.75,401000,2007-09-01,78.0,78.0,2.0,46.0,1,310,0.0,0.0,630.0,6.45,0.046512,4,5
3,3,7.125,215000,2008-06-01,80.0,80.0,2.0,32.0,1,779,0.0,0.0,652.0,6.08,0.171875,45,9
4,3,3.375,215000,2013-02-01,74.0,74.0,1.0,37.0,1,12,0.0,0.0,799.0,3.53,-0.043909,45,19


In [17]:
# #Dividing features into numerical and categorical features
# categorical=[]
# numerical=[]
# for col in list(data):
#     if(len(data[col].unique())<=30):
#         categorical.append(col)
#     else:
#         numerical.append(col)

### Chi-Square test for Feature Importance of Categorical Features

As majority of the features in the data are categorical, also the target feature is categorical we can use Chi-Square test for to get the feature importance.

In [18]:
def Chi_square(col_1,col_2):
    X=data[col_1].astype('str')
    Y=data[col_2].astype('str')
    observed_values=pd.crosstab(Y,X)
    chi2, p, dof, expected = ss.chi2_contingency(observed_values)
    if(p>0.05):
        print(col_1," is not required")
    else:
        print(col_1," is required")
        
for col in data:
    Chi_square(col,"origIntRate")

origChannel  is required
origIntRate  is required
origUPB  is required
origDate  is required
origLTV  is required
origCLTV  is required
numBorrowers  is required
origDebtIncRatio  is required
loanPurp  is required
zipCode  is required
pMIperct  is required
mortInsType  is required
worstCreditScore  is required
fmacRate  is required
rateDiffPct  is required
sellerNameCat  is required
stateNumber  is required


### Feature Selection using Backward Elimination for Numerical Features

Using Backward Elimination method for numerical features

In [19]:
# X=data.drop(['origIntRate'],1)
# y=le.fit_transform(data["origIntRate"])

# import statsmodels.api as sm
# cols = list(X.columns)
# pmax = 1
# while (pmax>0.05):
#     p=[]
#     X_1 = X[cols]
#     X_1 = sm.add_constant(X_1)
#     model = sm.OLS(y,X_1).fit()
#     p = pd.Series(model.pvalues.values[1:],index = cols)      
#     pmax = max(p)
#     feature_with_p_max = p.idxmax()
#     if(pmax>0.05):
#         cols.remove(feature_with_p_max)
#     else:
#         breakselected_features_BE = cols
# print("Best features using Backward Elimination: ",cols)

### Feature Importance using Random Forest Classifier

Random Forest is considered to be one of the most unbiased model. As it creates multiple Decision Trees taking into account Random Features for each Decision Tree.

Because of this randomness the Random Forest Classifier considerd to be giving most unbiased Feature Importance

In [20]:
# X_rfc=data.drop(["origIntRate"],1)
# y_rfc=data["origIntRate"]

In [21]:
# rfc=RandomForestClassifier(criterion='entropy',random_state=0,n_estimators=10)
# rfc.fit(X_rfc,y_rfc)

In [22]:
# plt.figure(figsize=(10,10))
# plt.barh(list(X_rfc),rfc.feature_importances_)
# plt.title("Feature Importance using Random Forest Classifier")
# plt.ylabel("Features")
# plt.xlabel('Feature Importance Value')

In [23]:
data.head()

Unnamed: 0,origChannel,origIntRate,origUPB,origDate,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,zipCode,pMIperct,mortInsType,worstCreditScore,fmacRate,rateDiffPct,sellerNameCat,stateNumber
0,1,7.25,70000,2001-03-01,67.0,67.0,2.0,16.0,2,281,0.0,0.0,620.0,7.12,0.018258,54,2
1,3,6.5,123000,2001-12-01,86.0,86.0,2.0,43.0,2,532,25.0,1.0,762.0,7.02,-0.074074,44,15
2,3,6.75,401000,2007-09-01,78.0,78.0,2.0,46.0,1,310,0.0,0.0,630.0,6.45,0.046512,4,5
3,3,7.125,215000,2008-06-01,80.0,80.0,2.0,32.0,1,779,0.0,0.0,652.0,6.08,0.171875,45,9
4,3,3.375,215000,2013-02-01,74.0,74.0,1.0,37.0,1,12,0.0,0.0,799.0,3.53,-0.043909,45,19


In [24]:
#Dropping 'gameId' feature as it's not required in model building and prediction
data.drop(['origIntRate'],1,inplace=True)

In [25]:
data.head()

Unnamed: 0,origChannel,origUPB,origDate,origLTV,origCLTV,numBorrowers,origDebtIncRatio,loanPurp,zipCode,pMIperct,mortInsType,worstCreditScore,fmacRate,rateDiffPct,sellerNameCat,stateNumber
0,1,70000,2001-03-01,67.0,67.0,2.0,16.0,2,281,0.0,0.0,620.0,7.12,0.018258,54,2
1,3,123000,2001-12-01,86.0,86.0,2.0,43.0,2,532,25.0,1.0,762.0,7.02,-0.074074,44,15
2,3,401000,2007-09-01,78.0,78.0,2.0,46.0,1,310,0.0,0.0,630.0,6.45,0.046512,4,5
3,3,215000,2008-06-01,80.0,80.0,2.0,32.0,1,779,0.0,0.0,652.0,6.08,0.171875,45,9
4,3,215000,2013-02-01,74.0,74.0,1.0,37.0,1,12,0.0,0.0,799.0,3.53,-0.043909,45,19


### Model building using Pycaret Library

In [None]:
models=setup(data, target = 'rateDiffPct', categorical_features = ['zipCode', 'stateNumber', 'sellerNameCat'])

In [None]:
model_results=compare_models(blacklist = ['tr'])
model_results

In [None]:
logreg_model=create_model('catboost')

In [None]:
interpret_model(estimator=logreg_model,plot='summary')

In [None]:
interpret_model(estimator=logreg_model, plot='correlation')

In [None]:
interpret_model(estimator=logreg_model, plot='reason')

In [None]:
logreg_model2=create_model('xgboost')

In [None]:
plot_model(estimator=logreg_model2, plot='feature')

In [None]:
plot_model(estimator=logreg_model2)

In [None]:
tunned_logreg_model=tune_model('catboost')

In [None]:
interpret_model(estimator=tunned_logreg_model,plot='summary')

In [None]:
plot_model(estimator=tunned_logreg_model,plot='feature')