In [14]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

## 1.0 Data

In [2]:
data = pd.read_csv('input/processed_data_nyc.csv', index_col = 0)
data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


In [3]:
y = data.price
data = data.drop(['price'], axis=1)

X = np.asarray(data).astype(np.float32)
y = np.asarray(y).ravel()

Data Splitting

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Dataset: {}".format(X_train.shape))
print("Testing Dataset: {}".format(X_test.shape))

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


Scaling Data:

In [5]:
scaler = preprocessing.RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## 2.0 Models

Here, we combine the top models we tested in cross validation and we attempt to make an ensemble combination of them to hopefully achieve better results

In [6]:
randomForest = RandomForestRegressor(n_estimators=50)
ridge = Ridge(alpha=5)
huber = HuberRegressor(alpha=10, epsilon=3, max_iter=1000)

Let's try the best classifier on its own

In [7]:
randomForest.fit(X_train, y_train)
out_y = randomForest.predict(X_test)
mse = mean_squared_error(y_test, out_y)
print("Random forest mse = {}".format(mse))

Random forest mse = 0.1755652182613282


Now let's try the best 2 classifiers

In [8]:
randomForest.fit(X_train,y_train)
huber.fit(X_train, y_train)

out_y1 = randomForest.predict(X_test)
mse1 = mean_squared_error(y_test, out_y1)
print("random forest MSE = {}".format(mse1))

out_y2 = huber.predict(X_test)
mse2 = mean_squared_error(y_test, out_y2)
print("huber MSE = {}".format(mse2))

out_y = (out_y1 + out_y2)/2
mse = mean_squared_error(y_test,out_y)
print("Final MSE = {}".format(mse))

random forest MSE = 0.1766476578660399
huber MSE = 0.19541675105718745
Final MSE = 0.17356876440687005


Now let's try adding the third classifier!

In [9]:
ridge.fit(X_train, y_train)

out_y3 = ridge.predict(X_test)
mse3 = mean_squared_error(y_test, out_y3)
print("Ridge mse = {}".format(mse3))

out_y = (out_y1 + out_y2 + out_y3)/3
mse = mean_squared_error(y_test,out_y)
print("Final MSE = {}".format(mse))

Ridge mse = 0.19563520765613698
Final MSE = 0.17812616201200754


Based on the increase of final mse, we conclude that the ridge classifier should not be used in this ensemble model. But what if we use more than one random forest or huber model?

In [10]:
rf1 = RandomForestRegressor(n_estimators=50)
huber1 = HuberRegressor(alpha=10, epsilon=3, max_iter=1000)

rf2 = RandomForestRegressor(n_estimators=50)
huber2 = HuberRegressor(alpha=10, epsilon=3, max_iter=1000)

rf3 = RandomForestRegressor(n_estimators=50)
huber3 = HuberRegressor(alpha=10, epsilon=3, max_iter=1000)

# Training
rf1.fit(X_train, y_train)
rf2.fit(X_train, y_train)
rf3.fit(X_train, y_train)

huber1.fit(X_train, y_train)
huber2.fit(X_train, y_train)
huber3.fit(X_train, y_train)

# Testing
out_y1 = rf1.predict(X_test)
mse1 = mean_squared_error(y_test, out_y1)
print("random forest 1 MSE = {}".format(mse1))

out_y2 = huber1.predict(X_test)
mse2 = mean_squared_error(y_test, out_y2)
print("huber 1 MSE = {}".format(mse2))

out_y3 = rf2.predict(X_test)
mse3 = mean_squared_error(y_test, out_y3)
print("random forest 2 MSE = {}".format(mse3))

out_y4 = huber2.predict(X_test)
mse4 = mean_squared_error(y_test, out_y4)
print("huber 2 MSE = {}".format(mse4))

out_y5 = rf3.predict(X_test)
mse5 = mean_squared_error(y_test, out_y5)
print("random forest 3 MSE = {}".format(mse5))

out_y6 = huber3.predict(X_test)
mse6 = mean_squared_error(y_test, out_y6)
print("huber 3 MSE = {}".format(mse6))

out_y = (out_y1 + out_y2 + out_y3 + out_y4 + out_y5 + out_y6)/6
mse = mean_squared_error(y_test,out_y)
print("Final MSE = {}".format(mse))

random forest 1 MSE = 0.17549085851809917
huber 1 MSE = 0.19541675105718745
random forest 2 MSE = 0.17649253112290944
huber 2 MSE = 0.19541675105718745
random forest 3 MSE = 0.1765970759636616
huber 3 MSE = 0.19541675105718745
Final MSE = 0.17282650372522093


## 3.0 Ensemble Model
As we've seen from above, we can improve the performance by using multiple models together. But instead of training all the models on the same data, we will split the data to train each model on different portions of the data.

In [18]:
def buildEnsemble(models, X, y):
    numFolds = len(models)
    cv = KFold(numFolds, True, 1)
    
    modelIndex = 0
    
    for train, test in cv.split(X):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        
        models[modelIndex].fit(X_train, y_train)
        modelIndex = modelIndex + 1
        
    return models

In [26]:
def predictEnsemble(models, X_t, y_t):
    output = np.zeros(y_t.shape)
    
    for model in models:
        output = output + model.predict(X_t)
        
    output = output / len(models)
    return output

In [23]:
myModels = [
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
]

myModels = buildEnsemble(myModels, X_train, y_train)
out = predictEnsemble(myModels, X_test, y_test)

mse = mean_squared_error(y_test,out)
print("Final MSE = {}".format(mse))

Final MSE = 0.1727522498218004
