In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import time
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, HuberRegressor
from sklearn.model_selection import KFold

from lib import data_generation as dg
from lib import feature_processing as fp

## 1. Data
### 1.0 Loading Data
Here we load the raw data

In [2]:
data = pd.read_csv('input/AB_NYC_2019.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


### 1.1 Processing Data
The data is processed as explained in the data processing notebook of this project. The process function in feature_processing library is written based on that notebook.

In [3]:
data.shape

(48895, 16)

In [4]:
processed_data = fp.process(data)

(48895, 16)
(48895, 15)


In [5]:
processed_data.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,all_year_avail,low_avail,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,5.010635,1,9,2762,0.21,6,True,False,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,5.420535,1,45,2976,0.38,2,True,False,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,5.01728,3,0,0,0.0,1,True,False,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,4.49981,1,270,3021,4.64,1,False,False,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,4.394449,10,9,2793,0.1,1,False,True,...,0,0,0,0,0,0,0,1,0,0


### 1.2 Split Data
We split the data 80-20 into training and testing data

In [6]:
y = processed_data.price
processed_data = processed_data.drop(['price'], axis=1)

X = np.asarray(processed_data)
y = np.asarray(y).ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Dataset: {}".format(X_train.shape))
print("Testing Dataset: {}".format(X_test.shape))

Training Dataset: (39014, 239)
Testing Dataset: (9754, 239)


### 1.3 Scaling Data
The final step in data processing is using the robust scaler to scale all data! But we will apply the scaler after new data is generated!

In [7]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## 2. ML Models
In this project we tested multiple regressors at varying parameters. We concluded that the ideal model parameters were as follows for the top 3 regressors:

1) Random Forest: Number of estimators = 50  
2) Ridge Regression: Alpha = 5  
3) Huber Regression: Alpha = 10, Epsilon = 3  

As such, we will define our final models:

In [8]:
randomForest_final = RandomForestRegressor(n_estimators=50)
ridge_final = Ridge(alpha=5)
huber_final = HuberRegressor(alpha=10, epsilon=3, max_iter=1000)

Let's try the best of the three models, random forest regressor, with unprocessed and then processed data to see the difference! Of course, we still have to clean the data so the model can read it.

Unprocessed data:

In [9]:
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [10]:
cleanData = fp.clean(data)

y_unp = cleanData.price
cleanData = cleanData.drop(['price'], axis=1)

X_unp = np.asarray(cleanData)
y_unp = np.asarray(y_unp).ravel()

X_unp_train, X_unp_test, y_unp_train, y_unp_test = train_test_split(X_unp, y_unp, test_size=0.2, random_state=42)
X_unp_train = scaler.fit_transform(X_unp_train)
X_unp_test = scaler.fit_transform(X_unp_test)

randomForest_final.fit(X_unp_train, y_unp_train)
out = randomForest_final.predict(X_unp_test)

mse = mean_squared_error(y_unp_test,out)
r2 = r2_score(y_unp_test,out)
mae = mean_absolute_error(y_unp_test,out)

print("MSE = {}".format(mse))
print("R2 = {}".format(r2))
print("MAE = {}".format(mae))

MSE = 38934.30132839607
R2 = -0.06074625987159754
MAE = 66.79762684124385


Now we try the same with processed data:

In [11]:
randomForest_final.fit(X_train, y_train)
out = randomForest_final.predict(X_test)

mse = mean_squared_error(y_test,out)
r2 = r2_score(y_test,out)
mae = mean_absolute_error(y_test,out)

print("MSE = {}".format(mse))
print("R2 = {}".format(r2))
print("MAE = {}".format(mae))

MSE = 0.17571992310147683
R2 = 0.6142971645520457
MAE = 0.306458208743377


We can see by comparing the results, how big of an improvement processing the data made. Now we will try an ensemble model for further improvement.

### 2.1 Main Ensemble Model
We perform unit testing to check which samples perform the worst.

In [12]:
def buildEnsemble(models, X, y):
    numFolds = len(models)
    cv = KFold(numFolds, True, 1)
    
    modelIndex = 0
    
    for train, test in cv.split(X):
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        
        models[modelIndex].fit(X_train, y_train)
        modelIndex = modelIndex + 1
        
    return models

In [13]:
def predictEnsemble(models, X_t, y_t):
    output = np.zeros(y_t.shape)
    
    for model in models:
        output = output + model.predict(X_t)
        
    output = output / len(models)
    return output

In [15]:
myModels = [
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
    RandomForestRegressor(n_estimators=100),
    HuberRegressor(alpha=10, epsilon=3, max_iter=1000),
]

myModels = buildEnsemble(myModels, X_train, y_train)
out = predictEnsemble(myModels, X_test, y_test)

mse = mean_squared_error(y_test,out)
r2 = r2_score(y_test,out)
mae = mean_absolute_error(y_test,out)

print("MSE = {}".format(mse))
print("R2 = {}".format(r2))
print("MAE = {}".format(mae))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MSE = 0.17252630166770155
R2 = 0.6213071200574505
MAE = 0.30642343861266436
