In [1]:
import os

# Step up one directory, presumably so that the `Helper` package imported in the
# next cell resolves from the working directory (TODO confirm project layout).
# Guarded so that re-running this cell does not climb one directory further each
# time: once `Helper` is visible from the current directory, nothing happens.
if not os.path.isdir('Helper'):
    os.chdir('../')

In [2]:
from Helper.DataLoader import *
from Helper.StaticParameters import Parameters
from Helper.ModelProcessor import ModelProcessor

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [3]:
# Project helpers: static configuration (provides `feature_columns`) and the
# training/evaluation wrapper used by every model cell below.
parameter = Parameters()
modelProcessor = ModelProcessor()

# Load the combined dataset together with the row labels of each split
# (train, test set 1, test set 2), then clean it.
# Normalization is deliberately not applied at this point.
(combine_df,
 train_df_index,
 test_set1_index,
 test_set2_index) = load_dataset(numeric=True, extract_dataset=False)
combine_df = clean_data(combine_df)

In [9]:
# Fit a random forest on the training split and score it on both test sets;
# the three returned values are unpacked as train / test-1 / test-2 predictions.
#
# `criterion` is left at its default (mean squared error). The explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2,
# so omitting it keeps the same objective while running on old and new versions.
# `random_state` pins the bootstrap and feature sampling so the reported losses
# are reproducible from run to run.
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
    RandomForestRegressor(
        n_estimators=320,
        max_features=10,
        bootstrap=True,
        random_state=42,
    ),
    train_df=combine_df.loc[train_df_index],
    test_df=combine_df.loc[test_set1_index],
    feature_columns=parameter.feature_columns,
    train_target='rent',
    trainWithTest1=False,
    test2_df=combine_df.loc[test_set2_index],
)

Train Loss: 173603.3291
Test Set 1 Loss: 1743435.8523
Test Set 2 - Loss: 6583444.1690


In [5]:
# Fit a gradient-boosting regressor on the training split and score it on both
# test sets; the three returned values are unpacked as train / test-1 / test-2
# predictions.
#
# `random_state` is fixed so repeated runs report the same losses.
# NOTE(review): with learning_rate=0.001 and 450 estimators the recorded train and
# test-1 losses are almost equal and far above the random forest's, which suggests
# the ensemble is under-fitted -- consider a larger learning rate or more
# estimators. Hyperparameters are left unchanged here.
gradient_train_pred, gradient_test1_pred, gradient_test2_pred = modelProcessor.train_final_model(
    GradientBoostingRegressor(
        learning_rate=0.001,
        n_estimators=450,
        min_samples_split=10,
        random_state=42,
    ),
    train_df=combine_df.loc[train_df_index],
    test_df=combine_df.loc[test_set1_index],
    feature_columns=parameter.feature_columns,
    train_target='rent',
    trainWithTest1=False,
    test2_df=combine_df.loc[test_set2_index],
)

Train Loss: 4886592.3184
Test Set 1 Loss: 4865627.4079
Test Set 2 - Loss: 1133123.8333


# Save the Result CSV File

In [10]:
# Take the test-set-1 rows, attach the random-forest predictions as a new
# `predictions` column, and write that column (with its row index) to CSV.
test1_df = combine_df.loc[test_set1_index].assign(predictions=random_test1_pred)
test1_df['predictions'].to_csv("RandomForest_Prediction_TestSet1.csv", header=True)

In [8]:
# Take the test-set-2 rows, attach the random-forest predictions as a new
# `predictions` column, and write that column (with its row index) to CSV.
test2_df = combine_df.loc[test_set2_index].assign(predictions=random_test2_pred)
test2_df['predictions'].to_csv("RandomForest_Prediction_TestSet2.csv", header=True)

In [23]:
# Persist the gradient-boosting predictions for test set 2.
# The frame is built from `combine_df` directly instead of overwriting
# `test2_df['predictions']`, so the random-forest predictions stored on `test2_df`
# are not silently replaced and this cell no longer depends on the cell that
# created `test2_df` having been run first. The CSV content is unchanged.
gradient_test2_df = combine_df.loc[test_set2_index].assign(predictions=gradient_test2_pred)
gradient_test2_df['predictions'].to_csv("GradientBoosting_Prediction_TestSet2.csv", header=True)

# Check Result - Verify the CSV Was Saved Correctly

In [11]:
# Read the saved predictions back (the first CSV column is the row index) and
# place them next to the true rents, aligned on that index.
save_test1_df = pd.read_csv('RandomForest_Prediction_TestSet1.csv', index_col=0)
new_df = pd.concat(
    [test1_df['rent'], save_test1_df],
    axis='columns',
)

In [12]:
# Sanity check: expect one row per test-set-1 listing and two columns (rent, predictions).
new_df.shape

(2000, 2)

In [13]:
# Row count of the in-memory test-set-1 frame, for comparison with the reloaded CSV.
test1_df.shape

(2000, 27)

In [14]:
# Confirm the combined frame holds the true rent next to the saved predictions.
new_df.columns

Index(['rent', 'predictions'], dtype='object')

In [15]:
from sklearn.metrics import mean_squared_error

# MSE between the true rents and the predictions read back from the CSV; it should
# equal the "Test Set 1 Loss" printed when the random forest was trained.
mean_squared_error(y_true=new_df['rent'], y_pred=new_df['predictions'])

1743435.8523362575

# Loss Analysis

In [9]:
# Constant baseline: the training-set median rent repeated once per test-set-1 row.
# It is sized from `test1_df` because the following cells compare it only against
# test set 1; the previous version took the length from `test2_df`, which is only
# correct if both test sets happen to have the same number of rows.
train_df = combine_df.loc[train_df_index]
median_rents = np.full(test1_df.shape[0], train_df.rent.median())

In [12]:
# MSE of the constant median-rent baseline against the true test-set-1 rents.
mean_squared_error(y_true=test1_df['rent'], y_pred=median_rents)

7828536.681

In [13]:
# MSE between the model's test-set-1 predictions and the constant median-rent baseline.
mean_squared_error(test1_df['predictions'], median_rents)

5253709.207441179

In [14]:
# Model error on test set 1: true rent versus predicted rent.
# MSE is symmetric in its two arguments, so the conventional (true, predicted)
# order yields the same value as the reverse.
mean_squared_error(y_true=test1_df['rent'], y_pred=test1_df['predictions'])

1739729.2770655341

#### Conclusion: The median rent does a poor job of estimating the actual loss

# Normalized Data - Fewer Features

In [6]:
# Normalize the dataset, then retrain the same random forest for comparison with
# the non-normalized run.
# NOTE: this rebinds `combine_df`, so every later cell sees normalized data until
# the dataset is loaded again, and re-running this cell applies
# `normalized_dataset` a second time.
combine_df = normalized_dataset(combine_df)

# `criterion` is left at its default (mean squared error). The explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2,
# so omitting it keeps the same objective on old and new versions.
# `random_state` is fixed so the normalized / non-normalized comparison is not
# blurred by run-to-run randomness of the forest.
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
    RandomForestRegressor(
        n_estimators=320,
        max_features=10,
        bootstrap=True,
        random_state=42,
    ),
    train_df=combine_df.loc[train_df_index],
    test_df=combine_df.loc[test_set1_index],
    feature_columns=parameter.feature_columns,
    train_target='rent',
    trainWithTest1=False,
    test2_df=combine_df.loc[test_set2_index],
)

Train Loss: 178331.7395
Test Set 1 Loss: 1750304.2386
Test Set 2 - Loss: 6537426.7855


In [7]:
# Reduced, hand-picked feature set used to test whether fewer inputs hurt the model.
features = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'addr_zip',
    'has_elevator',
    'has_gym',
    'allows_pets',
    'no_fee',
    'has_pool',
    'floornumber',
]

# Same random forest as before, restricted to `features`. With ten features listed,
# max_features=10 means every split considers all of them.
# `criterion` is left at its default (mean squared error): the explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2.
# `random_state` is fixed so the comparison with the full feature set is reproducible.
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
    RandomForestRegressor(
        n_estimators=320,
        max_features=10,
        bootstrap=True,
        random_state=42,
    ),
    train_df=combine_df.loc[train_df_index],
    test_df=combine_df.loc[test_set1_index],
    feature_columns=features,
    train_target='rent',
    trainWithTest1=False,
    test2_df=combine_df.loc[test_set2_index],
)

Train Loss: 198325.9440
Test Set 1 Loss: 1787680.1939
Test Set 2 - Loss: 6790996.1342


#### Normalization doesn't make any big difference: performance is similar to that on the original data. Using fewer features makes the model perform worse.

# Final Prediction

In [8]:
import numpy as np

# Row labels of the training split followed by those of test set 1, so the final
# model can be fitted on both together.
combine_df_index = np.concatenate((train_df_index, test_set1_index))
combine_df_index.shape

(14000,)

In [9]:
# Final model on the normalized data: fit on train + test set 1, predict test set 2.
# The helper is called without `train_target` / `trainWithTest1` / `test2_df`, so
# its defaults apply and only two prediction results are unpacked.
#
# `criterion` is left at its default (mean squared error): the explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2.
# `random_state` is fixed so the saved predictions can be regenerated exactly.
random_train_pred, random_test2_pred = modelProcessor.train_final_model(
    RandomForestRegressor(
        n_estimators=320,
        max_features=10,
        bootstrap=True,
        random_state=42,
    ),
    train_df=combine_df.loc[combine_df_index],
    test_df=combine_df.loc[test_set2_index],
    feature_columns=parameter.feature_columns,
)

Train Loss: 181415.2228
Test Set Loss: 6766959.5468


In [10]:
# Attach the final-model predictions to the test-set-2 rows and write the
# `predictions` column (with its row index) to the "normalized" result file.
test2_df = combine_df.loc[test_set2_index].assign(predictions=random_test2_pred)
test2_df['predictions'].to_csv("RandomForest_Prediction_TestSet2_Normalized.csv", header=True)

In [11]:
# Load and clean the dataset again from scratch, replacing the normalized
# `combine_df` so the next model is trained on non-normalized data.
(combine_df,
 train_df_index,
 test_set1_index,
 test_set2_index) = load_dataset(numeric=True, extract_dataset=False)
combine_df = clean_data(combine_df)

In [12]:
# Final model on the non-normalized data: fit on train + test set 1, predict test set 2.
# The helper is called without `train_target` / `trainWithTest1` / `test2_df`, so
# its defaults apply and only two prediction results are unpacked.
#
# `criterion` is left at its default (mean squared error): the explicit 'mse'
# spelling was renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2.
# `random_state` is fixed so the saved predictions can be regenerated exactly.
random_train_pred, random_test2_pred = modelProcessor.train_final_model(
    RandomForestRegressor(
        n_estimators=320,
        max_features=10,
        bootstrap=True,
        random_state=42,
    ),
    train_df=combine_df.loc[combine_df_index],
    test_df=combine_df.loc[test_set2_index],
    feature_columns=parameter.feature_columns,
)

Train Loss: 178825.0647
Test Set Loss: 6794227.2186


In [13]:
# Attach the final-model predictions to the test-set-2 rows and write the
# `predictions` column (with its row index) to the "not normalized" result file.
test2_df = combine_df.loc[test_set2_index].assign(predictions=random_test2_pred)
test2_df['predictions'].to_csv("RandomForest_Prediction_TestSet2_NotNormalized.csv", header=True)