In [1]:
import os
os.chdir('../')

In [2]:
from Helper.DataLoader import *
from Helper.StaticParameters import Parameters
from Helper.ModelProcessor import ModelProcessor

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [3]:
# Init model
parameter = Parameters()
modelProcessor = ModelProcessor()

# Get Data
combine_df, train_df_index, test_set1_index, test_set2_index = load_dataset(numeric=True, extract_dataset=False)
combine_df = clean_data(combine_df)
# combine_df = normalized_dataset(combine_df)

In [5]:
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
                                                             RandomForestRegressor(max_features=10,n_estimators=320,
                                                                                   criterion='mse',bootstrap=True),
                                                             train_df= combine_df.loc[train_df_index], 
                                                             test_df=combine_df.loc[test_set1_index],
                                                             feature_columns = parameter.feature_columns,
                                                             train_target='rent', trainWithTest1 = False,
                                                             test2_df=combine_df.loc[test_set2_index]
                                                            )

Train Loss: 175379.2218
Test Set 2 - Loss: 6656865.7837
Test Set 1 Loss: 1755881.8033


In [6]:
gradient_train_pred, gradient_test1_pred, gradient_test2_pred = modelProcessor.train_final_model(
                                                     GradientBoostingRegressor(learning_rate=0.001,
                                                                               n_estimators=450,
                                                                               min_samples_split=10),
                                                     train_df= combine_df.loc[train_df_index], 
                                                     test_df=combine_df.loc[test_set1_index],
                                                     feature_columns = parameter.feature_columns,
                                                     train_target='rent', trainWithTest1 = False,
                                                     test2_df=combine_df.loc[test_set2_index]
                                                    )

Train Loss: 4886592.3184
Test Set 2 - Loss: 1133123.8333
Test Set 1 Loss: 4865627.4079


# Save the Result CSV File

In [10]:
test1_df = combine_df.loc[test_set1_index]
test1_df['predictions'] = random_test1_pred
test1_df['predictions'].to_csv("RandomForest_Prediction_TestSet1.csv", header=True)

In [9]:
test2_df = combine_df.loc[test_set2_index]
test2_df['predictions'] = random_test2_pred
test2_df['predictions'].to_csv("RandomForest_Prediction_TestSet2.csv", header=True)

In [23]:
test2_df['predictions'] = gradient_test2_pred
test2_df['predictions'].to_csv("GradientBoosting_Prediction_TestSet2.csv", header=True)

# Check Result - If Save Correctly 

In [36]:
save_test1_df = pd.read_csv('RandomForest_Prediction_TestSet1_Normalize.csv', index_col=0)
new_df = pd.concat([test1_df['rent'],save_test1_df],axis=1)

In [19]:
new_df.shape

(2000, 2)

In [20]:
test1_df.shape

(2000, 27)

In [21]:
new_df.columns

Index(['rent', 'predictions'], dtype='object')

In [37]:
from sklearn.metrics import mean_squared_error
mean_squared_error(new_df.rent, new_df.predictions)

1743405.7556120362

# Compare test1 rent and prediction with median train rent

In [28]:
combine_df.loc[train_df_index].rent.median()

2900.0

In [29]:

mean_squared_error(new_df.rent, np.ones(2000) * 2900)

7828536.681

In [30]:
mean_squared_error(new_df.predictions, np.ones(2000) * 2900)

5285432.61721352

# Normalization and Feature Test -  Decide Final Model

In [24]:
features = ['bedrooms','bathrooms','size_sqft','addr_zip','min_to_subway',
                                'has_elevator','is_furnished','has_gym','allows_pets',]

In [26]:
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
                                                             RandomForestRegressor(n_estimators=320,
                                                                                   criterion='mse',bootstrap=True),
                                                             train_df= combine_df.loc[train_df_index], 
                                                             test_df=combine_df.loc[test_set1_index],
                                                             feature_columns = features,
                                                             train_target='rent', trainWithTest1 = False,
                                                             test2_df=combine_df.loc[test_set2_index]
                                                            )

Train Loss: 195749.3496
Test Set 2 - Loss: 6807182.0565
Test Set 1 Loss: 1846680.3587


In [31]:
combine_df = normalized_dataset(combine_df)

In [32]:
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
                                                             RandomForestRegressor(n_estimators=320,
                                                                                   criterion='mse',bootstrap=True),
                                                             train_df= combine_df.loc[train_df_index], 
                                                             test_df=combine_df.loc[test_set1_index],
                                                             feature_columns = features,
                                                             train_target='rent', trainWithTest1 = False,
                                                             test2_df=combine_df.loc[test_set2_index]
                                                            )

Train Loss: 198218.1152
Test Set 2 - Loss: 6753358.6392
Test Set 1 Loss: 1830390.1794


In [33]:
random_train_pred, random_test1_pred, random_test2_pred = modelProcessor.train_final_model(
                                                             RandomForestRegressor(max_features=10,n_estimators=320,
                                                                                   criterion='mse',bootstrap=True),
                                                             train_df= combine_df.loc[train_df_index], 
                                                             test_df=combine_df.loc[test_set1_index],
                                                             feature_columns = parameter.feature_columns,
                                                             train_target='rent', trainWithTest1 = False,
                                                             test2_df=combine_df.loc[test_set2_index]
                                                            )

Train Loss: 172452.2758
Test Set 2 - Loss: 6630254.8910
Test Set 1 Loss: 1743405.7556


In [34]:
test1_df.shape

(2000, 27)

In [35]:
test1_df = combine_df.loc[test_set1_index]
test1_df['predictions'] = random_test1_pred
test1_df['predictions'].to_csv("RandomForest_Prediction_TestSet1_Normalize.csv", header=True)

In [38]:
test2_df = combine_df.loc[test_set2_index]
test2_df['predictions'] = random_test2_pred
test2_df['predictions'].to_csv("RandomForest_Prediction_TestSet2_Normalize.csv", header=True)

## Final CSV will be RandomForest_Prediction_TestSet2_Normalize.csv