#### Importing Required Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
from sklearn.ensemble import RandomForestClassifier
# !pip install pycaret
from pycaret.regression import *

#### Importing the data

In [None]:
# Read the sample CSV into `data` and preview its first rows.
acquisitions_csv = "data/FannieAcq2000-19FMac30-sample25.csv"
data = pd.read_csv(acquisitions_csv)
data.head()

In [None]:
# Give the unnamed first CSV column an explicit name so it can be referenced (and dropped) later.
column_renames = {'Unnamed: 0': 'namedCol'}
data.rename(columns=column_renames, inplace=True)

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

In [None]:
# List the column names available in the frame.
data.columns

In [None]:
# Copy the seller name into a new column stored with pandas' categorical dtype.
seller_as_category = data['sellerName'].astype('category')
data['sellerNameCat'] = seller_as_category

In [None]:
# Replace each seller category with its integer category code.
seller_codes = data['sellerNameCat'].cat.codes
data['sellerNameCat'] = seller_codes

In [None]:
# Preview the frame with the new sellerNameCat column.
data.head()

In [None]:
# Create categories out of loan purpose that can be fed into ML models: 
# data['loanPurpCat'] = data['loanPurp'].replace(['C','R'],[0,1])
# data['loanPurpCat'].unique()

In [None]:
# # Create categories out of Bank Name that can be fed into ML models: 
# data['bankNumber'] = data['sellerName'].replace(['UNITED SHORE FINANCIAL SERVICES, LLC DBA UNITED WHOLESALE MORTGAGE',
#        'QUICKEN LOANS INC.', 'PENNYMAC CORP.',
#        'FAIRWAY INDEPENDENT MORTGAGE CORPORATION', 'OTHER',
#        'JPMORGAN CHASE BANK, NATIONAL ASSOCIATION',
#        'WELLS FARGO BANK, N.A.', 'LAKEVIEW LOAN SERVICING, LLC',
#        'AMERIHOME MORTGAGE COMPANY, LLC', 'DITECH FINANCIAL LLC',
#        'FLAGSTAR BANK, FSB', 'FREEDOM MORTGAGE CORP.',
#        'TRUIST BANK (FORMERLY SUNTRUST BANK)', 'U.S. BANK N.A.',
#        'EAGLE HOME MORTGAGE, LLC', 'MOVEMENT MORTGAGE, LLC',
#        'NATIONSTAR MORTGAGE, LLC'],[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
# bankNumber = data['bankNumber'].unique()
# bankNumber

In [None]:
# Create categories out of State Name that can be fed into ML models.
# Each abbreviation's code is its position in this list, so every abbreviation is
# guaranteed a distinct integer. (The previous hand-typed code list contained 4 where
# 42 belonged, which gave 'WY' the same code as 'AZ'.)
state_abbreviations = [
    'IN', 'CA', 'NC', 'LA', 'AZ', 'GA', 'CT', 'MN', 'MI', 'TX', 'TN',
    'MD', 'IL', 'KY', 'VA', 'WI', 'PA', 'OR', 'FL', 'MA', 'WA', 'MS',
    'SC', 'NJ', 'AL', 'RI', 'CO', 'NY', 'NE', 'NM', 'ID', 'AR', 'DC',
    'MO', 'IA', 'OH', 'DE', 'UT', 'MT', 'KS', 'ND', 'OK', 'WY', 'SD',
    'NV', 'ME', 'AK', 'NH', 'VT', 'HI', 'WV', 'PR', 'GU',
]
state_codes = list(range(len(state_abbreviations)))
# replace() leaves any value that is not in the list above unchanged.
data['stateNumber'] = data['propState'].replace(state_abbreviations, state_codes)
# Display the distinct codes as a quick sanity check of the encoding.
stateNumber = data['stateNumber'].unique()
stateNumber

In [None]:
# Preview the frame with the new stateNumber column.
data.head()

In [None]:
# Check for missing values: count the nulls in each column.
data.isnull().sum()

In [None]:
# Check the datatype pandas assigned to each individual feature.
data.dtypes

In [None]:
# Drop features not required in model building and prediction.
# `columns=` is used instead of a positional axis argument: since pandas 2.0,
# DataFrame.drop() only accepts `axis` as a keyword, so `drop([...], 1, ...)` raises TypeError.
unused_features = ['namedCol', 'loanIdentifier', 'sellerName', 'firstPmtDate', 'borrCreditScore', 'propState',
                   'bestCreditScore', 'avgCreditScore', 'fredRate', 'rateDiff', 'rank']
data.drop(columns=unused_features, inplace=True)

In [None]:
# Preview the frame after the unused columns were dropped.
data.head()

In [None]:
# #Dividing features into numerical and categorical features
# categorical=[]
# numerical=[]
# for col in list(data):
#     if(len(data[col].unique())<=30):
#         categorical.append(col)
#     else:
#         numerical.append(col)

### Chi-Square test for Feature Importance of Categorical Features

As the majority of the features in the data are categorical, and the target feature is also treated as categorical, we can use the Chi-Square test to estimate feature importance.

In [None]:
def Chi_square(col_1, col_2, df=None):
    """Print whether ``col_1`` is statistically associated with ``col_2``.

    Both columns are cast to strings, cross-tabulated, and passed to a
    chi-square test of independence. If the p-value is above 0.05 the
    feature is reported as "not required"; otherwise as "required".

    Parameters
    ----------
    col_1 : str
        Name of the feature column being evaluated.
    col_2 : str
        Name of the column to test against (the target).
    df : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook-level ``data``
        frame, which keeps existing two-argument calls working unchanged.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    frame = data if df is None else df
    # Cast to str so every distinct value is treated as its own category.
    feature_values = frame[col_1].astype('str')
    target_values = frame[col_2].astype('str')
    observed_values = pd.crosstab(target_values, feature_values)
    chi2, p, dof, expected = ss.chi2_contingency(observed_values)
    # 0.05 is the significance threshold used to decide "required" vs "not required".
    if p > 0.05:
        print(col_1," is not required")
    else:
        print(col_1," is required")
        
# Test every feature against the target. The target itself is skipped: a column is
# trivially dependent on itself, so testing it would always print "is required".
for col in data:
    if col == "origIntRate":
        continue
    Chi_square(col,"origIntRate")

### Feature Selection using Backward Elimination for Numerical Features

Using Backward Elimination method for numerical features

In [None]:
# X=data.drop(['origIntRate'],1)
# y=le.fit_transform(data["origIntRate"])

# import statsmodels.api as sm
# cols = list(X.columns)
# pmax = 1
# while (pmax>0.05):
#     p=[]
#     X_1 = X[cols]
#     X_1 = sm.add_constant(X_1)
#     model = sm.OLS(y,X_1).fit()
#     p = pd.Series(model.pvalues.values[1:],index = cols)      
#     pmax = max(p)
#     feature_with_p_max = p.idxmax()
#     if(pmax>0.05):
#         cols.remove(feature_with_p_max)
#     else:
#         break
# selected_features_BE = cols
# print("Best features using Backward Elimination: ",cols)

### Feature Importance using Random Forest Classifier

Random Forest is considered to be one of the most unbiased models, as it builds multiple Decision Trees, each taking into account a random subset of features.

Because of this randomness, the Random Forest Classifier is considered to give the most unbiased Feature Importance.

In [None]:
# X_rfc=data.drop(["origIntRate"],1)
# y_rfc=data["origIntRate"]

In [None]:
# rfc=RandomForestClassifier(criterion='entropy',random_state=0,n_estimators=10)
# rfc.fit(X_rfc,y_rfc)

In [None]:
# plt.figure(figsize=(10,10))
# plt.barh(list(X_rfc),rfc.feature_importances_)
# plt.title("Feature Importance using Random Forest Classifier")
# plt.ylabel("Features")
# plt.xlabel('Feature Importance Value')

In [None]:
# Preview the frame before the 'origIntRate' column is removed.
data.head()

In [None]:
# Drop 'origIntRate' so it is not passed to the model as a feature.
# `columns=` is used instead of a positional axis argument: since pandas 2.0,
# DataFrame.drop() only accepts `axis` as a keyword.
data.drop(columns=['origIntRate'], inplace=True)

In [None]:
# Preview the final frame that is handed to PyCaret.
data.head()

### Model building using Pycaret Library

In [None]:
# Initialise the PyCaret regression experiment with 'rateDiffPct' as the target.
# The integer-encoded columns are declared categorical so they are not treated as numeric.
# session_id fixes the random seed so the split and model results are reproducible
# across kernel restarts (the value 42 itself is arbitrary).
models=setup(data, target = 'rateDiffPct', categorical_features = ['zipCode', 'stateNumber', 'sellerNameCat'],
             session_id = 42)

In [None]:
# Train and cross-validate the library's candidate regressors, leaving out the model with ID 'tr'
# (presumably PyCaret's Theil-Sen regressor; verify against the installed version's model list).
# NOTE(review): newer PyCaret releases renamed the `blacklist` argument to `exclude`;
# confirm which name the installed version expects.
model_results=compare_models(blacklist = ['tr'])
model_results

In [None]:
# Create a model from the 'catboost' ID.
# NOTE: despite the variable name, this is a CatBoost model, not a logistic regression.
logreg_model=create_model('catboost')

In [None]:
# Interpretation plot of type 'summary' for the CatBoost model.
interpret_model(estimator=logreg_model,plot='summary')

In [None]:
# Interpretation plot of type 'correlation' for the CatBoost model.
interpret_model(estimator=logreg_model, plot='correlation')

In [None]:
# Interpretation plot of type 'reason' for the CatBoost model.
interpret_model(estimator=logreg_model, plot='reason')

In [None]:
# Create a second model from the 'xgboost' ID.
# NOTE: despite the variable name, this is an XGBoost model, not a logistic regression.
logreg_model2=create_model('xgboost')

In [None]:
# Plot of type 'feature' for the XGBoost model.
plot_model(estimator=logreg_model2, plot='feature')

In [None]:
# Plot the XGBoost model using plot_model's default plot type (no `plot` argument given).
plot_model(estimator=logreg_model2)

In [None]:
# Tune hyperparameters for the 'catboost' model ID.
# NOTE(review): this passes the ID string rather than the fitted `logreg_model`;
# newer PyCaret versions expect a model object here, so confirm against the installed version.
tunned_logreg_model=tune_model('catboost')

In [None]:
# Interpretation plot of type 'summary' for the tuned CatBoost model.
interpret_model(estimator=tunned_logreg_model,plot='summary')

In [None]:
# Plot of type 'feature' for the tuned CatBoost model.
plot_model(estimator=tunned_logreg_model,plot='feature')