# Fannie Mae

Final class project to predict the net loss on a mortgage loan, using public Fannie Mae data from 1991 to 2021. Starter code was provided by the professor; finalizing it was a group effort.

## Import Packages

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random

from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict
from xgboost import XGBRegressor

from patsy import dmatrices, dmatrix, build_design_matrices

from xgboost.sklearn import XGBClassifier
import xgboost as xgb

# Set the number of CPU cores handed to the parallel algorithms (n_jobs).
import os

cpu_limit = os.environ.get("CPU_LIMIT")
if cpu_limit is not None:
    # If you are on JupyterHub, CPU_LIMIT holds the (possibly fractional) number of CPUs of
    # your virtual machine, e.g. "4.0". Keep the whole-core part, but never drop below one
    # worker: a limit such as "0.5" would otherwise produce num_cpus == 0.
    num_cpus = max(1, int(cpu_limit.split('.')[0] or 0))
else:
    # If you are not on JupyterHub, use every core of your computer.
    # os.cpu_count() returns None when the count cannot be determined.
    num_cpus = os.cpu_count() or 1

In [None]:
# This makes it so that the pandas dataframes don't get truncated horizontally.
pd.options.display.max_columns = 200

## Load the Data 

In [None]:
# Column name -> dtype used when reading the Fannie Mae CSV extracts.
# Text columns use the built-in `str`; the previous `np.character` is an abstract NumPy
# scalar class, and converting such classes to a dtype is deprecated in NumPy.
# Nullable pandas integer dtypes ("Int16"/"UInt16") are used where values may be missing.
col_classes = {"LOAN_IDENTIFIER": str,
               "CHANNEL": 'category',
               "SELLER_NAME": str,
               "ORIGINAL_INTEREST_RATE": np.float32,
               "ORIGINAL_UPB": np.float64,
               "ORIGINAL_LOAN_TERM": "Int16",
               "ORIGINATION_DATE": str,
               "FIRST_PAYMENT_DATE": str,
               "ORIGINAL_LTV": np.float32,
               "ORIGINAL_COMBINED_LTV": np.float32,
               "NUMBER_OF_BORROWERS": 'category',
               "DTI": np.float32,
               "BORROWER_CREDIT_SCORE_AT_ORIGINATION": "UInt16",
               "COBORROWER_CREDIT_SCORE_AT_ORIGINATION": 'UInt16',
               "FIRST_TIME_HOME_BUYER_INDICATOR": 'category',
               "LOAN_PURPOSE": 'category',
               "PROPERTY_TYPE": 'category',
               "NUMBER_OF_UNITS": "UInt16",
               "OCCUPANCY_STATUS": 'category',
               "PROPERTY_STATE": 'category',
               "MSA": 'category',
               "ZIP_CODE_SHORT": 'category',
               "MORTGAGE_INSURANCE_PERCENTAGE": np.float32,
               "AMORTIZATION_TYPE": str,
               "MORTGAGE_INSURANCE_TYPE": 'category',
               "RELOCATION_MORTGAGE_INDICATOR": 'category',
               "CREDIT_SCORE_MIN": "UInt16",
               "ORIGINAL_VALUE": float,
               "ZERO_BALANCE_CODE": 'category',
               "LOAN_AGE": "Int16",
               "NET_LOSS": float,
               "NET_SEVERITY": float,
               "LAST_STAT": 'category',
               "LOAN_MODIFICATION_COSTS": float,
               "TOTAL_LOSSES": float,
               "MSA_NAME": 'category',
               "CENSUS_2010_POP": float}

date_columns = ["ORIGINATION_DATE",
                "FIRST_PAYMENT_DATE"]

In [None]:
%%time

full_data_set = False

FILES_LOCATION = '/DataAnalyticsI/'

# Options shared by every read of the pipe-delimited extracts.
csv_options = dict(index_col="LOAN_IDENTIFIER",
                   dtype=col_classes,
                   parse_dates=date_columns,
                   sep='|')

if full_data_set:
    # This p is the proportion of the training data you load.
    # You can set it anywhere from 0 to 1.
    p = 1
    random.seed(201)
    # Row 0 (the header) is always kept; every other row is skipped when the
    # seeded random draw exceeds p.
    df_train = pd.read_csv(FILES_LOCATION + "FannieMaeTrain.csv",
                           skiprows=lambda i: i>0 and random.random() > p,
                           **csv_options)
    df_test = pd.read_csv(FILES_LOCATION + "FannieMaeTest.csv", **csv_options)
else:
    # Small samples for quick iteration.
    df_train = pd.read_csv(FILES_LOCATION + "FannieMaeSmallTrain.csv", **csv_options)
    df_test = pd.read_csv(FILES_LOCATION + "FannieMaeSmallTest.csv", **csv_options)

In [None]:
df_train.shape

In [None]:
df_test.shape

## Summarize the Data

In [None]:
# Columns removed from both frames before modelling (NET_LOSS, the target, is kept).
columns_to_drop = ['ZERO_BALANCE_CODE', 'LOAN_AGE', 'NET_SEVERITY', 'LAST_STAT', 'LOAN_MODIFICATION_COSTS', 'TOTAL_LOSSES']

for frame in (df_train, df_test):
    # ZERO_BALANCE_CODE doubles as the "not dropped yet" marker, so re-running this cell is a no-op.
    if 'ZERO_BALANCE_CODE' in frame:
        frame.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
def summarize_dataframe(df):
    """Display a one-row-per-column summary of a dataframe, including missing values.

    For every column the displayed table shows its dtype, the number of missing
    values, the number of distinct values and the statistics returned by
    ``df.describe(include='all')``. Cells that do not apply are shown as empty strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    missing_values = pd.DataFrame({'Variable Name': df.columns,
                                   'Data Type': df.dtypes,
                                   'Missing Values': df.isnull().sum(),
                                   'Unique Values': [df[name].nunique() for name in df.columns]}
                                 ).set_index('Variable Name')
    try:
        # Older pandas needs the flag to summarize datetime columns like numbers.
        description = df.describe(include='all', datetime_is_numeric=True)
    except TypeError:
        # Newer pandas no longer accepts the keyword (it raises TypeError), so call without it.
        description = df.describe(include='all')
    # Temporarily lift the row limit so frames with many columns are shown in full.
    with pd.option_context("display.max_rows", 1000):
        display(pd.concat([missing_values, description.transpose()], axis=1).fillna(""))

In [None]:
summarize_dataframe(df_train)

In [None]:
summarize_dataframe(df_test)

## Engineer Row Based Features

In [None]:
# Make sure ORIGINATION_DATE is a datetime column (year-month-day) in both frames.
for frame in (df_train, df_test):
    frame['ORIGINATION_DATE'] = pd.to_datetime(frame['ORIGINATION_DATE'], format='%Y-%m-%d')

In [None]:
# Origination year as a numeric feature, added to both frames.
for frame in (df_train, df_test):
    frame['YEAR'] = frame['ORIGINATION_DATE'].dt.year

In [None]:
# Bounds of the origination window flagged as "BadYears" below.
date1 = "2005/07/01"
date2 = "2008/02/01"

newdate1, newdate2 = (pd.to_datetime(text, format='%Y/%m/%d') for text in (date1, date2))

In [None]:
# Flag loans originated strictly between newdate1 and newdate2.
# Vectorized comparisons replace the row-wise apply; a missing date (NaT) compares
# False on both sides and is therefore flagged False, exactly as before.
df_train['BadYears'] = (df_train['ORIGINATION_DATE'] > newdate1) & (df_train['ORIGINATION_DATE'] < newdate2)
df_test['BadYears'] = (df_test['ORIGINATION_DATE'] > newdate1) & (df_test['ORIGINATION_DATE'] < newdate2)

In [None]:
# States singled out by the BadState flag.
bad_states = ('NV', 'FL', 'AZ', 'CA', 'MI')

df_train['BadState'] = df_train['PROPERTY_STATE'].apply(lambda state: state in bad_states)
df_test['BadState'] = df_test['PROPERTY_STATE'].apply(lambda state: state in bad_states)

## Split Into Training and Validation

In [None]:
# Hold out a quarter of the training rows for validation; the seed keeps the split reproducible.
df_smaller_train, df_validation = train_test_split(df_train,
                                                   random_state=201,
                                                   test_size=0.25)

In [None]:
df_smaller_train = df_smaller_train.copy()
df_validation = df_validation.copy()

## Impute Missing Values

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Imputer for categorical columns that also lumps rare levels together.

    Missing entries are filled by a ``SimpleImputer`` (by default with the constant
    "MISSING"). Every level whose frequency in the fitted data is below
    ``other_threshold``, and every level that was not seen during ``fit``, is
    replaced by ``other_label``.

    Parameters
    ----------
    other_threshold : number or list of numbers, default 0
        Minimum frequency a level needs to be kept. A value below 1 is read as a
        proportion of the rows, anything else as an absolute row count. A list must
        contain one threshold per column.
    other_label : str, default "OTHER"
        Label given to rare or unseen levels.
    missing_first : bool, default True
        If True, missing values are imputed before levels are counted and lumped,
        so the fill value competes like any other level. If False, imputation
        happens after lumping.
    missing_values, strategy, fill_value, copy
        Forwarded to ``SimpleImputer``.
    verbose : int, default 0
        Accepted for backward compatibility only; it is not forwarded because
        recent scikit-learn releases no longer accept it on ``SimpleImputer``.
    add_indicator : bool, default False
        Stored but not forwarded: the inner imputer is always built with
        ``add_indicator=False``.

    Notes
    -----
    The constructor stores its arguments unchanged and the inner imputer is
    created in ``fit``, which keeps ``get_params`` / ``clone`` working.
    """

    def __init__(self, other_threshold=0,
                 other_label="OTHER",
                 missing_first=True,
                 missing_values=np.nan,
                 strategy='constant',
                 fill_value="MISSING",
                 verbose=0,
                 copy=True,
                 add_indicator=False):
        self.add_indicator = add_indicator
        self.copy = copy
        self.verbose = verbose
        self.fill_value = fill_value
        self.missing_first = missing_first
        self.missing_values = missing_values
        self.other_label = other_label
        self.other_threshold = other_threshold
        self.strategy = strategy

    def _missing_value_list(self):
        """Return ``missing_values`` as a list suitable for ``Series.isin``."""
        values = self.missing_values
        # A string is iterable but denotes one single marker, not a collection.
        if isinstance(values, str) or not hasattr(values, "__iter__"):
            return [values]
        return list(values)

    def _impute(self, X):
        """Apply the fitted inner imputer and restore the frame's labels."""
        return pd.DataFrame(self._imputer.transform(X), columns=X.columns, index=X.index)

    def fit(self, X, y=None):
        """Learn, for every column of ``X``, which levels are frequent enough to keep.

        Raises
        ------
        TypeError
            If ``other_threshold`` is neither a number nor a sequence with one
            entry per column.
        """
        n_columns = len(X.columns)
        if isinstance(self.other_threshold, (int, float, np.integer, np.floating)):
            thresholds = [self.other_threshold] * n_columns
        elif len(self.other_threshold) == n_columns:
            # Work on a copy so the caller's list is not rewritten with row counts.
            thresholds = list(self.other_threshold)
        else:
            raise TypeError("other_threshold must be either a single number or a list of numbers equal to the number of columns.")

        X = X.astype(object)  # astype returns a new frame; the caller's data is untouched
        # Fit the inner imputer here, once, so transform never re-fits on new data.
        self._imputer = SimpleImputer(missing_values=self.missing_values,
                                      strategy=self.strategy,
                                      fill_value=self.fill_value,
                                      copy=self.copy,
                                      add_indicator=False).fit(X)
        if self.missing_first:
            X = self._impute(X)

        self._column_categories = {}
        for column, threshold in zip(X.columns, thresholds):
            if threshold < 1:
                # Proportion -> absolute row count for this column.
                threshold = threshold * X[column].shape[0]

            value_counts = X[column].value_counts()  # sorted by descending frequency
            categories = list(value_counts.index[value_counts >= threshold])
            if len(value_counts) > 0 and value_counts.iloc[-1] >= threshold:
                # Every level passed the threshold: the least frequent one is
                # swapped for other_label, so the fitted data always contains it.
                categories[-1] = self.other_label
            else:
                categories.append(self.other_label)

            self._column_categories[column] = categories

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` as strings with missing values filled and rare levels lumped."""
        X = X.astype(object)
        if self.missing_first:
            X = self._impute(X)
        missing = self._missing_value_list()
        for column in X.columns:
            values = X[column]
            # Missing entries are left alone here; they are the imputer's job.
            rare_or_unseen = ~values.isin(self._column_categories[column]) & ~values.isin(missing)
            X.loc[rare_or_unseen, column] = self.other_label
        if not self.missing_first:
            X = self._impute(X)
        return X.astype(str)

In [None]:
list(df_smaller_train.columns)

In [None]:
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputer_missing = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')
categorical_imputer = CategoricalImputer(other_threshold=.01)

In [None]:
# Continuous columns whose missing values are replaced by the training mean.
continuous_mean = [
    'ORIGINAL_INTEREST_RATE', 'ORIGINAL_UPB', 'ORIGINAL_LOAN_TERM',
    'ORIGINAL_LTV', 'ORIGINAL_COMBINED_LTV', 'ORIGINAL_VALUE', 'YEAR',
]

# Continuous columns whose missing values are replaced by zero.
continuous_zero = [
    'MORTGAGE_INSURANCE_PERCENTAGE',
    'BORROWER_CREDIT_SCORE_AT_ORIGINATION', 'COBORROWER_CREDIT_SCORE_AT_ORIGINATION',
    'CREDIT_SCORE_MIN', 'DTI',
]

continuous_variables = continuous_mean + continuous_zero

# Categorical columns whose missing values become the literal level 'missing'.
categorical_missing = [
    'FIRST_TIME_HOME_BUYER_INDICATOR', 'MORTGAGE_INSURANCE_TYPE',
    'BadYears', 'BadState',
]

# Categorical columns handled by CategoricalImputer (rare levels lumped together).
# NOTE(review): CENSUS_2010_POP is read as float but treated as a category here - confirm.
categorical_variables = [
    'CHANNEL', 'SELLER_NAME', 'NUMBER_OF_BORROWERS', 'LOAN_PURPOSE',
    'PROPERTY_TYPE', 'NUMBER_OF_UNITS', 'OCCUPANCY_STATUS', 'PROPERTY_STATE',
    'ZIP_CODE_SHORT', 'AMORTIZATION_TYPE', 'RELOCATION_MORTGAGE_INDICATOR',
    'MSA', 'MSA_NAME', 'CENSUS_2010_POP',
]

In [None]:
# Learn the column means on the training split only, then fill both splits with them.
df_smaller_train[continuous_mean] = imputer_mean.fit_transform(df_smaller_train[continuous_mean])
df_validation[continuous_mean] = imputer_mean.transform(df_validation[continuous_mean])

In [None]:
# Fill the zero-imputed continuous columns in both splits.
df_smaller_train[continuous_zero] = imputer_zero.fit_transform(df_smaller_train[continuous_zero])
df_validation[continuous_zero] = imputer_zero.transform(df_validation[continuous_zero])

In [None]:
# The kept levels are learned on the training split and re-used for validation.
df_smaller_train[categorical_variables] = categorical_imputer.fit_transform(df_smaller_train[categorical_variables])
df_validation[categorical_variables] = categorical_imputer.transform(df_validation[categorical_variables])

In [None]:
# Replace missing entries of the remaining categorical columns with the 'missing' level.
df_smaller_train[categorical_missing] = imputer_missing.fit_transform(df_smaller_train[categorical_missing])
df_validation[categorical_missing] = imputer_missing.transform(df_validation[categorical_missing])

In [None]:
summarize_dataframe(df_smaller_train)

## Set Up the Evaluation Metric

In [None]:
average_loss = df_train['NET_LOSS'].mean()

In [None]:
def accuracy(y_true, y_pred):
    """Display the RMSE of a set of predictions and its gain over the naive forecast.

    The naive forecast predicts the module-level ``average_loss`` for every
    observation. The displayed one-row table has two columns:

    * ``RMSE`` - root mean squared error of ``y_pred`` against ``y_true``;
    * ``Naive - RMSE`` - naive RMSE minus model RMSE (positive when the model
      beats the naive forecast).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    RMSE = mean_squared_error(y_true, y_pred)**(1/2)
    naive_RMSE = mean_squared_error(y_true, [average_loss]*len(y_true))**(1/2)
    acc_df = pd.DataFrame(data = {"RMSE": [RMSE],
                                  "Naive - RMSE": [naive_RMSE - RMSE]})
    styler = acc_df.style
    # Styler.hide_index() is gone from recent pandas; Styler.hide(axis="index") replaces it.
    # Fall back to the old method when the new one is not available.
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()
    display(styler)

## Feature Engineering for Tree Based Models

In [None]:
# Numeric columns passed to the tree models unchanged.
continuous_features_trees = [
    'ORIGINAL_LTV',
    'DTI',
    'CREDIT_SCORE_MIN',
    'ORIGINAL_INTEREST_RATE',
    'ORIGINAL_UPB',
    'COBORROWER_CREDIT_SCORE_AT_ORIGINATION',
    'BORROWER_CREDIT_SCORE_AT_ORIGINATION',
]
# Categorical columns turned into integer codes by the OrdinalEncoder.
cat_ordinal_features_trees = [
    'PROPERTY_STATE',
    'OCCUPANCY_STATUS',
    'LOAN_PURPOSE',
    'MORTGAGE_INSURANCE_TYPE',
    'NUMBER_OF_BORROWERS',
    'BadState',
]
# Categorical columns expanded into dummy columns through the patsy formula.
cat_dummy_features_trees = ['FIRST_TIME_HOME_BUYER_INDICATOR']

In [None]:
X_tree_train = df_smaller_train[continuous_features_trees + cat_ordinal_features_trees]
y_tree_train = df_smaller_train['NET_LOSS']

In [None]:
# Interaction terms added on top of the dummy-encoded features.
tree_interaction_terms = ["YEAR:MORTGAGE_INSURANCE_TYPE",
                          "BadYears:MSA",
                          "YEAR:BadState",
                          "YEAR:ORIGINAL_INTEREST_RATE",
                          "YEAR:C(CHANNEL)",
                          "FIRST_TIME_HOME_BUYER_INDICATOR:BORROWER_CREDIT_SCORE_AT_ORIGINATION:ORIGINAL_INTEREST_RATE"]

# The leading "0" removes the intercept column from the design matrix.
formula_tree = " + ".join(["0"] + cat_dummy_features_trees + tree_interaction_terms)

In [None]:
formula_tree

In [None]:
X_tree_train_patsy = dmatrix(formula_tree, df_smaller_train, return_type="dataframe")

In [None]:
X_tree_train_patsy

In [None]:
X_tree_train = pd.concat([X_tree_train, X_tree_train_patsy], axis=1)

In [None]:
X_tree_train

In [None]:
# Learn the integer codes on the training split; the same encoder is applied to validation later.
ordinal_encoder = OrdinalEncoder().fit(X_tree_train[cat_ordinal_features_trees])
X_tree_train[cat_ordinal_features_trees] = ordinal_encoder.transform(X_tree_train[cat_ordinal_features_trees])

In [None]:
X_tree_train

In [None]:
# patsy names dummy columns like "X[T.level]"; swap the square brackets for parentheses
# (presumably because XGBoost rejects brackets in feature names - confirm).
# regex=False treats "[" and "]" as literal characters whatever the pandas version's
# default for `regex` is.
X_tree_train.columns = X_tree_train.columns.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)

In [None]:
X_tree_train.columns

Let's go ahead and transform our validation set.

In [None]:
# Build the validation features with the transformers fitted on the training split.
X_tree_validation = df_validation[continuous_features_trees + cat_ordinal_features_trees]
y_tree_validation = df_validation['NET_LOSS']

# Re-using the training design_info keeps the patsy columns identical to the training matrix.
X_tree_validation_patsy = build_design_matrices([X_tree_train_patsy.design_info], df_validation, return_type="dataframe")[0]

X_tree_validation = pd.concat([X_tree_validation, X_tree_validation_patsy], axis=1)

X_tree_validation[cat_ordinal_features_trees] = ordinal_encoder.transform(X_tree_validation[cat_ordinal_features_trees])

# regex=False treats "[" and "]" as literal characters whatever the pandas version's
# default for `regex` is.
X_tree_validation.columns = X_tree_validation.columns.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)

## Decision Tree

Grid Search--don't run during final export

In [None]:
# Decision-tree search grid. Each min_samples_split value appears once: the
# previous list repeated 2500, which made the grid search fit identical
# candidates twice.
parameters = {'max_depth': [5, 10, 15],
              'min_samples_split': [500, 1000, 2500, 7500, 10000],
              'min_impurity_decrease': [0, .001, .01, .1, 1]}

In [None]:
from sklearn.model_selection import ParameterGrid
list(ParameterGrid(parameters))

In [None]:
gs_rt_model = GridSearchCV(DecisionTreeRegressor(max_features = .5, ccp_alpha=10, random_state=201), param_grid=parameters, cv=4, n_jobs=num_cpus)

In [None]:
gs_rt_model.fit(X_tree_train, y_tree_train)

In [None]:
gs_rt_model.best_params_

Start running again here:

In [None]:
%%time
# Hyperparameters of the validation decision tree.
dt_params = dict(max_depth=10,
                 min_samples_split=1000,
                 max_features=.5,
                 min_impurity_decrease=0,
                 random_state=201)
dt_model = DecisionTreeRegressor(**dt_params)
dt_model.fit(X_tree_train, y_tree_train)

In [None]:
pd.DataFrame({'Importance': dt_model.feature_importances_}, index=X_tree_train.columns).sort_values(['Importance'], ascending=False)

Predict on validation set:

In [None]:
dt_pred = dt_model.predict(X_tree_validation)

In [None]:
accuracy(df_validation['NET_LOSS'], dt_pred)

## Random Forest

Grid Search--don't run during final export

In [None]:
# Random-forest search grid.
param_grid = dict(
    max_depth=[10, 15, 20],
    min_samples_split=[100, 250, 500],
    max_features=[10, 15, 20],
    n_estimators=[75, 100, 200],
)

In [None]:
from sklearn.model_selection import ParameterGrid
list(ParameterGrid(param_grid))

In [None]:
rt_model = GridSearchCV(RandomForestRegressor(max_features = .5, ccp_alpha=10, random_state=201), param_grid=param_grid, cv=2, n_jobs=num_cpus)

In [None]:
rt_model.fit(X_tree_train, y_tree_train)

In [None]:
rt_model.best_params_

Start running again here:

In [None]:
%%time
# Hyperparameters of the validation random forest; trees are grown in parallel on num_cpus cores.
rf_params = dict(n_estimators=200,
                 max_features=20,
                 max_depth=20,
                 min_samples_split=100,
                 min_impurity_decrease=0,
                 random_state=201,
                 n_jobs=num_cpus)
rf_model = RandomForestRegressor(**rf_params)

In [None]:
rf_model.fit(X_tree_train, y_tree_train)

In [None]:
pd.DataFrame({'Importance': rf_model.feature_importances_}, index=X_tree_train.columns).sort_values(['Importance'], ascending=False)

In [None]:
rf_pred = rf_model.predict(X_tree_validation)

In [None]:
accuracy(df_validation['NET_LOSS'], rf_pred)

## Boosted Trees Model

Grid Search--don't run during final export

In [None]:
# Boosted-trees search grid.
parametersxb = dict(
    max_depth=[3, 6, 10, 12],
    n_estimators=[90, 100, 110, 130],
    learning_rate=[.3, .2, .1],
)

In [None]:
from sklearn.model_selection import ParameterGrid
list(ParameterGrid(parametersxb))

In [None]:
gs_rt_modelxb = GridSearchCV(XGBRegressor(),parametersxb, cv=2, n_jobs=5, verbose=True)

In [None]:
gs_rt_modelxb.fit(X_tree_train, y_tree_train)

In [None]:
gs_rt_modelxb.best_params_

Start running again here:

In [None]:
# Validation boosted-trees model.
# ccp_alpha was dropped: it is a scikit-learn tree parameter, not an XGBoost one, so
# passing it to XGBRegressor was not doing any cost-complexity pruning.
xgb_model = XGBRegressor(max_depth=3,
                         n_estimators = 90,
                         learning_rate=.2,
                         random_state=201)
xgb_model.fit(X_tree_train, y_tree_train)
xgb_pred = xgb_model.predict(X_tree_validation)

In [None]:
pd.DataFrame({'Importance': xgb_model.feature_importances_}, index=X_tree_train.columns).sort_values(['Importance'], ascending=False)

In [None]:
xgb_pred = xgb_model.predict(X_tree_validation)

In [None]:
accuracy(df_validation['NET_LOSS'], xgb_pred)

## Ensemble the models

In [None]:
import statsmodels.formula.api as smf

In [None]:
# Stack the three models: regress the observed validation loss on their validation predictions.
# The prediction arrays are not columns of df_validation; presumably the formula machinery
# picks them up from the notebook namespace - confirm.
stacking_formula = 'NET_LOSS ~ xgb_pred + rf_pred + dt_pred'
lm_1 = smf.ols(formula=stacking_formula, data=df_validation).fit()
lm_1.summary()

## Predict on the Test Set

First, refit the imputers and impute on `df_train` and `df_test`.

In [None]:
imputer_mean_final = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_zero_final = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
categorical_imputer_final = CategoricalImputer(other_threshold=.01)
imputer_missing_final = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')

In [None]:
# Means are learned on the full training data and applied to both train and test.
df_train[continuous_mean] = imputer_mean_final.fit_transform(df_train[continuous_mean])
df_test[continuous_mean] = imputer_mean_final.transform(df_test[continuous_mean])

In [None]:
# Fill the zero-imputed continuous columns of train and test.
df_train[continuous_zero] = imputer_zero_final.fit_transform(df_train[continuous_zero])
df_test[continuous_zero] = imputer_zero_final.transform(df_test[continuous_zero])

In [None]:
# The kept levels are learned on the full training data and re-used for the test set.
df_train[categorical_variables] = categorical_imputer_final.fit_transform(df_train[categorical_variables])
df_test[categorical_variables] = categorical_imputer_final.transform(df_test[categorical_variables])

In [None]:
# Replace missing entries of the remaining categorical columns with the 'missing' level.
df_train[categorical_missing] = imputer_missing_final.fit_transform(df_train[categorical_missing])
df_test[categorical_missing] = imputer_missing_final.transform(df_test[categorical_missing])

Now, recreate the data set for the tree-based models.

In [None]:
# NOTE(review): formula_linear is not defined anywhere in the visible notebook, so this
# cell raised NameError on Restart & Run All. The linear design matrices are not used by
# the tree models below, so only build them when a linear formula actually exists.
if 'formula_linear' in globals():
    y_linear_train_final, X_linear_train_final = dmatrices(formula_linear, df_train, return_type="dataframe")

In [None]:
# X_linear_train_final only exists when a linear formula was defined; without it there
# is nothing to rebuild. X_test is not used by the tree models below.
if 'X_linear_train_final' in globals():
    X_test = build_design_matrices([X_linear_train_final.design_info], df_test, return_type="dataframe")[0]

In [None]:
# Raw feature columns and target taken from the full training data.
tree_feature_columns = continuous_features_trees + cat_ordinal_features_trees
X_tree_train_final = df_train[tree_feature_columns]
y_tree_train_final = df_train['NET_LOSS']

In [None]:
# NOTE(review): this cell assigns the same two names from the same inputs as the cell
# before it; re-running it is harmless.
tree_feature_columns = continuous_features_trees + cat_ordinal_features_trees
X_tree_train_final = df_train[tree_feature_columns]
y_tree_train_final = df_train['NET_LOSS']

In [None]:
# Dummy and interaction columns from the patsy formula, appended to the raw features.
X_tree_train_patsy_final = dmatrix(formula_tree, df_train, return_type="dataframe")
X_tree_train_final = pd.concat(
    [X_tree_train_final, X_tree_train_patsy_final],
    axis=1,
)

In [None]:
# Learn the integer codes on the full training data; the same encoder is applied to the test set.
ordinal_encoder_final = OrdinalEncoder().fit(X_tree_train_final[cat_ordinal_features_trees])
X_tree_train_final[cat_ordinal_features_trees] = ordinal_encoder_final.transform(X_tree_train_final[cat_ordinal_features_trees])

In [None]:
# Swap the square brackets in the patsy column names for parentheses.
# regex=False treats "[" and "]" as literal characters whatever the pandas version's
# default for `regex` is.
X_tree_train_final.columns = X_tree_train_final.columns.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)

In [None]:
# Build the test features with the transformers fitted on the full training data.
X_tree_test = df_test[continuous_features_trees + cat_ordinal_features_trees]
y_tree_test = df_test['NET_LOSS']

# Re-using the training design_info keeps the patsy columns identical to the training matrix.
X_tree_test_patsy = build_design_matrices([X_tree_train_patsy_final.design_info], df_test, return_type="dataframe")[0]

X_tree_test = pd.concat([X_tree_test, X_tree_test_patsy], axis=1)

X_tree_test[cat_ordinal_features_trees] = ordinal_encoder_final.transform(X_tree_test[cat_ordinal_features_trees])

# regex=False treats "[" and "]" as literal characters whatever the pandas version's
# default for `regex` is.
X_tree_test.columns = X_tree_test.columns.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)

Decision tree final model:

In [None]:
# Final decision tree: same hyperparameters as the validation tree, refit on all training rows.
dt_final_params = dict(max_depth=10,
                       min_samples_split=1000,
                       max_features=.5,
                       min_impurity_decrease=0,
                       random_state=201)
dt_model_final = DecisionTreeRegressor(**dt_final_params)
dt_model_final.fit(X_tree_train_final, y_tree_train_final)
dt_pred_final = dt_model_final.predict(X_tree_test)

Random forest final model:

In [None]:
%%time
# Final random forest: same hyperparameters as the validation forest, refit on all training rows.
rf_final_params = dict(n_estimators=200,
                       max_features=20,
                       max_depth=20,
                       min_samples_split=100,
                       min_impurity_decrease=0,
                       random_state=201,
                       n_jobs=num_cpus)
rf_model_final = RandomForestRegressor(**rf_final_params)
rf_model_final.fit(X_tree_train_final, y_tree_train_final)
rf_pred_final = rf_model_final.predict(X_tree_test)

Boosted tree final model:

In [None]:
# Final boosted-trees model, refit on all training rows.
# NOTE(review): learning_rate now matches the validated xgb_model (.2); the ensemble weights
# were estimated on that model's predictions, and this cell previously used .1 - confirm.
# ccp_alpha was dropped: it is a scikit-learn tree parameter, not an XGBoost one.
xgb_model_final = XGBRegressor(max_depth=3,
                         n_estimators = 90,
                         learning_rate=.2,
                         random_state=201)
xgb_model_final.fit(X_tree_train_final, y_tree_train_final)
xgb_pred_final = xgb_model_final.predict(X_tree_test)

## Final prediction code

In [None]:
# Blend the three final models with the stacking regression fitted on the validation
# predictions (lm_1). The coefficients are read from the fitted model rather than
# hand-copied from its summary (previously 1.1452, 0.0041, -0.0466 and intercept -65.4304),
# so they cannot go stale when an upstream model changes.
ensemble_weights = lm_1.params
final_pred = (ensemble_weights['Intercept']
              + ensemble_weights['rf_pred'] * rf_pred_final
              + ensemble_weights['xgb_pred'] * xgb_pred_final
              + ensemble_weights['dt_pred'] * dt_pred_final)

Now, see your skill score on the final prediction.

In [None]:
accuracy(df_test['NET_LOSS'], final_pred)

## Write out the data with predictions

In [None]:
# df_test has been imputed and feature-engineered in place, so the predictions are attached
# to an untouched copy of the test file. Nothing earlier in the visible notebook creates
# df_test_fresh (the cell raised NameError), so read it here unless it already exists.
if 'df_test_fresh' not in globals():
    test_file_name = "FannieMaeTest.csv" if full_data_set else "FannieMaeSmallTest.csv"
    df_test_fresh = pd.read_csv(FILES_LOCATION + test_file_name,
                                index_col="LOAN_IDENTIFIER",
                                dtype=col_classes,
                                parse_dates=date_columns,
                                sep='|')

# final_pred is assigned by position, so the row counts must agree.
assert len(df_test_fresh) == len(final_pred), "prediction count does not match the test file"
df_test_fresh['PREDICTIONS_NET_LOSS'] = final_pred

In [None]:
# Write the test data plus predictions, pipe-delimited like the input files.
predictions_file = 'FannieMaeTestWithPredictionsNetLoss.csv'
df_test_fresh.to_csv(predictions_file, sep='|')