In [1]:
import pandas as pd
import numpy as np
import itertools
import time
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import sys
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the two cleaned quarterly loan files.  The cells below take the
# training data from df (Q1 2005) and the validation data from df2 (Q2 2005).
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('historical_data1_Q12005_clean.csv',
                     'historical_data1_Q22005_clean.csv')
)

  from pandas.core import datetools
  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Build the training target (y) and design matrix (X) from df.
# NOTE: DOES NOT USE mortgage_insurance_perc

# One-hot encode the categorical loan attributes.
dummies = pd.get_dummies(df[['first_time_homebuyer_flag', 'occupancy_status',
                             'channel', 'prepayment_penalty_mortgage_flag', 'loan_purpose']])

# Target: the interest rate at origination.
y = df.orig_interest_rate

# Numeric predictors: everything left after dropping the target, the
# categorical columns (re-added below as dummies) and the other columns that
# are not used by the models; the remainder is cast to float64.
X_ = df.drop(['first_payment_date', 'first_time_homebuyer_flag', 'maturity_date',
              'metropolitan_stat_area',
              'occupancy_status', 'channel', 'prepayment_penalty_mortgage_flag', 'product_type',
              'property_state', 'property_type', 'postal_code', 'loan_sequence_no',
              'loan_purpose', 'orig_loan_term', 'seller_name', 'service_name',
              'orig_interest_rate', 'mortgage_insurance_perc'], axis=1).astype('float64')

# Select the indicator columns with reindex rather than dummies[[...]]: a
# category level that does not occur in this file then becomes an all-zero
# column instead of raising KeyError, so the column set is always the same.
X = pd.concat([X_, dummies.reindex(columns=['first_time_homebuyer_flag_Y', 'occupancy_status_I',
                                            'occupancy_status_O', 'occupancy_status_S',
                                            'channel_B', 'channel_C', 'channel_R', 'channel_T',
                                            'prepayment_penalty_mortgage_flag_Y', 'loan_purpose_C',
                                            'loan_purpose_N', 'loan_purpose_P'],
                                   fill_value=0)], axis=1)

In [3]:
# Build the validation target (y_val) and design matrix (X_val) from df2,
# using the same column selection as the training cell.
# NOTE: DOES NOT USE mortgage_insurance_perc

# One-hot encode the categorical loan attributes.
dummies2 = pd.get_dummies(df2[['first_time_homebuyer_flag', 'occupancy_status',
                               'channel', 'prepayment_penalty_mortgage_flag', 'loan_purpose']])

# Target: the interest rate at origination.
y_val = df2.orig_interest_rate

# Numeric predictors: everything left after dropping the target, the
# categorical columns (re-added below as dummies) and the other columns that
# are not used by the models; the remainder is cast to float64.
X_val_ = df2.drop(['first_payment_date', 'first_time_homebuyer_flag', 'maturity_date',
                   'metropolitan_stat_area',
                   'occupancy_status', 'channel', 'prepayment_penalty_mortgage_flag', 'product_type',
                   'property_state', 'property_type', 'postal_code', 'loan_sequence_no',
                   'loan_purpose', 'orig_loan_term', 'seller_name', 'service_name',
                   'orig_interest_rate', 'mortgage_insurance_perc'], axis=1).astype('float64')

# Select the indicator columns with reindex rather than dummies2[[...]]: a
# category level that does not occur in the validation file then becomes an
# all-zero column instead of raising KeyError, so X_val always carries the
# full set of indicator columns listed here.
X_val = pd.concat([X_val_, dummies2.reindex(columns=['first_time_homebuyer_flag_Y', 'occupancy_status_I',
                                                     'occupancy_status_O', 'occupancy_status_S',
                                                     'channel_B', 'channel_C', 'channel_R', 'channel_T',
                                                     'prepayment_penalty_mortgage_flag_Y', 'loan_purpose_C',
                                                     'loan_purpose_N', 'loan_purpose_P'],
                                            fill_value=0)], axis=1)

In [4]:
def processSubset(feature_set, n_estimators=20, random_state=None):
    """Fit a random forest on a subset of columns and score it.

    The model is trained on the module-level training data (``X``, ``y``)
    restricted to ``feature_set`` and scored on both the training data and
    the module-level validation data (``X_val``, ``y_val``).

    Parameters
    ----------
    feature_set : iterable of str
        Column names of ``X`` / ``X_val`` to use as predictors.
    n_estimators : int, optional
        Number of trees in the forest.  Defaults to 20.
    random_state : int or None, optional
        Seed passed to ``RandomForestRegressor``.  The default ``None``
        leaves the forest unseeded, so repeated calls may score differently.

    Returns
    -------
    dict
        ``"model"``      fitted ``RandomForestRegressor``
        ``"RSS"``        residual sum of squares on the validation data
        ``"feature"``    the training columns used (``X[feature_set]``);
                         callers recover the names with ``list(...)``
        ``"RMS_train"``, ``"RMS_test"``    mean squared error -- note that,
                         despite the key name, no square root is taken
        ``"MAE_train"``, ``"MAE_test"``    mean absolute error
        ``"MAPE_train"``, ``"MAPE_test"``  mean absolute percentage error (%)
    """
    features = list(feature_set)
    X_train = X[features]

    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y)

    # Predict once per data set and reuse the result for every metric; a
    # fitted forest returns the same predictions on every call, so this only
    # removes redundant work.
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val[features])

    # MAE, RMS, MAPE
    RMS_train = mean_squared_error(y, pred_train)
    RMS_test = mean_squared_error(y_val, pred_val)

    MAE_train = mean_absolute_error(y, pred_train)
    MAE_test = mean_absolute_error(y_val, pred_val)

    MAPE_train = np.mean(np.abs((y - pred_train) / y)) * 100
    MAPE_test = np.mean(np.abs((y_val - pred_val) / y_val)) * 100

    RSS = ((pred_val - y_val) ** 2).sum()
    return {"model": model, "RSS": RSS, "feature": X_train,
            "RMS_train": RMS_train, "RMS_test": RMS_test,
            "MAE_train": MAE_train, "MAE_test": MAE_test,
            "MAPE_train": MAPE_train, "MAPE_test": MAPE_test}

In [None]:
def exhaustive(k):
    """Score every k-column subset of ``X`` and return the best one.

    One model is fitted per combination via ``processSubset``; the winner is
    the combination with the smallest ``"RMS_test"`` value.

    Parameters
    ----------
    k : int
        Number of predictors in each subset.

    Returns
    -------
    pandas.Series
        The ``processSubset`` result of the best subset.

    Raises
    ------
    ValueError
        If ``X`` has fewer than ``k`` columns, so there is nothing to search.
    """
    tic = time.time()
    results = [processSubset(combo) for combo in itertools.combinations(X.columns, k)]
    if not results:
        raise ValueError("no subsets of size %r can be drawn from the columns of X" % (k,))

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    # Choose the model with the smallest RMS_test.  idxmin returns an index
    # label, which is what .loc expects (argmin is positional in current pandas).
    best_model = models.loc[models['RMS_test'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")
    # Return the best model, along with some other useful information about the model
    return best_model

# Exhaustive search: keep the best model for every subset size from 1 to 9.
# Could take quite a while to complete...
exhaustive_columns = ["RSS", "model", "feature", "RMS_train",
                      "RMS_test", "MAE_train", "MAE_test", "MAPE_train", "MAPE_test"]
models_exhaustive = pd.DataFrame(columns=exhaustive_columns)

search_start = time.time()
for subset_size in range(1, 10):
    models_exhaustive.loc[subset_size] = exhaustive(subset_size)

print("Total elapsed time:", (time.time() - search_start), "seconds.")

In [None]:
def backward(predictors):
    """Run one backward-elimination step.

    Every subset obtained by removing exactly one predictor is scored with
    ``processSubset``; the subset with the smallest ``"RMS_test"`` wins.

    Parameters
    ----------
    predictors : sequence of str
        Column names currently in the model (at least two).

    Returns
    -------
    pandas.Series
        The ``processSubset`` result of the best reduced subset.

    Raises
    ------
    ValueError
        If fewer than two predictors are given, so no predictor can be
        removed while leaving a non-empty model.
    """
    if len(predictors) < 2:
        raise ValueError("backward() needs at least two predictors to eliminate one")

    tic = time.time()
    results = [processSubset(combo)
               for combo in itertools.combinations(predictors, len(predictors)-1)]
    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)

    # Choose the model with the smallest RMS_test.  idxmin returns an index
    # label, which is what .loc expects (argmin is positional in current pandas).
    best_model = models.loc[models['RMS_test'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)-1, "predictors in", (toc-tic), "seconds.")
    # Return the best model, along with some other useful information about the model
    return best_model

# Backward elimination: start from every column of X and drop one predictor
# per round until a single predictor is left.  The row label of
# models_backward is the number of predictors in the stored model.
backward_columns = ["RSS", "model", "feature", "RMS_train",
                    "RMS_test", "MAE_train", "MAE_test", "MAPE_train", "MAPE_test"]
models_backward = pd.DataFrame(columns=backward_columns, index=range(1, len(X.columns)))

elimination_start = time.time()
predictors = X.columns
while len(predictors) > 1:
    subset_size = len(predictors) - 1
    models_backward.loc[subset_size] = backward(predictors)
    # The surviving predictors are the columns of the winning model's features.
    predictors = list(models_backward.loc[subset_size]["feature"])

print("Total elapsed time:", (time.time() - elimination_start), "seconds.")

('Processed ', 19, 'models on', 18, 'predictors in', 1127.1509869098663, 'seconds.')
('Processed ', 18, 'models on', 17, 'predictors in', 1077.7457530498505, 'seconds.')
('Processed ', 17, 'models on', 16, 'predictors in', 963.6655330657959, 'seconds.')


In [5]:
def forward(predictors):
    """Run one forward-selection step.

    Each column of ``X`` that is not yet in ``predictors`` is added in turn,
    the enlarged subset is scored with ``processSubset``, and the subset with
    the smallest ``"RMS_test"`` wins.

    Parameters
    ----------
    predictors : sequence of str
        Column names already selected (may be empty).

    Returns
    -------
    pandas.Series
        The ``processSubset`` result of the best enlarged subset.

    Raises
    ------
    ValueError
        If every column of ``X`` is already in ``predictors``.
    """
    # Copy to a plain list so that "+ [p]" below is list concatenation even
    # when a pandas Index or a tuple is passed in.
    selected = list(predictors)
    # Pull out predictors we still need to process
    remaining_predictors = [p for p in X.columns if p not in selected]
    if not remaining_predictors:
        raise ValueError("all columns of X are already selected; nothing left to add")

    tic = time.time()
    results = [processSubset(selected + [p]) for p in remaining_predictors]

    # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
    # Choose the model with the smallest RMS_test.  idxmin returns an index
    # label, which is what .loc expects (argmin is positional in current pandas).
    best_model = models.loc[models['RMS_test'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic), "seconds.")
    # Return the best model, along with some other useful information about the model
    return best_model

# Forward selection: start with no predictors and add the best remaining
# column in each round until every column of X is in the model.  The row
# label of models_forward is the number of predictors in the stored model.
forward_columns = ["RSS", "model", "feature", "RMS_train",
                   "RMS_test", "MAE_train", "MAE_test", "MAPE_train", "MAPE_test"]
models_forward = pd.DataFrame(columns=forward_columns)

selection_start = time.time()
predictors = []
for subset_size in range(1, len(X.columns) + 1):
    models_forward.loc[subset_size] = forward(predictors)
    # Carry the winning subset's column names into the next round.
    predictors = list(models_forward.loc[subset_size]["feature"])

print("Total elapsed time:", (time.time() - selection_start), "seconds.")

('Processed ', 19, 'models on', 1, 'predictors in', 37.69409394264221, 'seconds.')
('Processed ', 18, 'models on', 2, 'predictors in', 111.45318484306335, 'seconds.')
('Processed ', 17, 'models on', 3, 'predictors in', 110.89107203483582, 'seconds.')
('Processed ', 16, 'models on', 4, 'predictors in', 127.88710403442383, 'seconds.')
('Processed ', 15, 'models on', 5, 'predictors in', 148.46694588661194, 'seconds.')
('Processed ', 14, 'models on', 6, 'predictors in', 143.9607231616974, 'seconds.')
('Processed ', 13, 'models on', 7, 'predictors in', 153.30555820465088, 'seconds.')
('Processed ', 12, 'models on', 8, 'predictors in', 150.9068820476532, 'seconds.')
('Processed ', 11, 'models on', 9, 'predictors in', 153.27757096290588, 'seconds.')
('Processed ', 10, 'models on', 10, 'predictors in', 147.73916602134705, 'seconds.')
('Processed ', 9, 'models on', 11, 'predictors in', 144.79130721092224, 'seconds.')
('Processed ', 8, 'models on', 12, 'predictors in', 155.8723180294037, 'seconds.')

In [6]:
# Display the forward-selection results: one row per number of predictors.
models_forward

Unnamed: 0,RSS,model,feature,RMS_train,RMS_test,MAE_train,MAE_test,MAPE_train,MAPE_test
1,53974.294535,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue 0 5...,0.124567,0.133053,0.271053,0.282895,4.829393,4.81906
2,52560.641495,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I 0...,0.119862,0.129568,0.266778,0.281183,4.757825,4.793211
3,52188.992839,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.119149,0.128652,0.264622,0.279431,4.719813,4.763449
4,51969.216929,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.118086,0.12811,0.263119,0.278712,4.69364,4.751729
5,51904.157831,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.117583,0.12795,0.262971,0.278924,4.691376,4.756085
6,51816.85543,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.117032,0.127734,0.262114,0.278602,4.676208,4.751018
7,51790.751173,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.117,0.12767,0.262037,0.278498,4.674984,4.749319
8,51857.988571,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.116621,0.127836,0.261929,0.278711,4.673324,4.753228
9,51876.275639,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.116618,0.127881,0.261955,0.278744,4.673818,4.753819
10,51880.741392,"(DecisionTreeRegressor(criterion='mse', max_de...",orig_loantovalue occupancy_status_I ...,0.116424,0.127892,0.261714,0.278658,4.669782,4.752646


In [7]:
def getBestModel(models):
    """Return the row of ``models`` with the smallest ``"RMS_test"`` value.

    Works for any index labels (not only 1..n) and for ``"RMS_test"`` columns
    of object dtype, as produced when a frame is filled row by row.  Rows
    whose score is missing or non-numeric are ignored; on ties the first row
    wins.

    Parameters
    ----------
    models : pandas.DataFrame
        Model results with an ``"RMS_test"`` column.

    Returns
    -------
    pandas.Series
        The best-scoring row.

    Raises
    ------
    ValueError
        If ``models`` has no rows or no row has a usable score.
    """
    if len(models.index) == 0:
        raise ValueError("models is empty; there is no best model to select")

    # Coerce to float so that idxmin also works on object-dtype columns;
    # unusable entries become NaN and are skipped by idxmin.
    scores = pd.to_numeric(models["RMS_test"], errors="coerce")
    if scores.isnull().all():
        raise ValueError("models has no valid RMS_test value to compare")

    return models.loc[scores.idxmin()]

In [8]:
# Pick the forward-selection model with the lowest validation error and
# report that error.
bestmodel_forward = getBestModel(models_forward)
best_forward_rms_test = bestmodel_forward["RMS_test"]
print(best_forward_rms_test)

0.127670027863
