In [1]:
from Helper.DataLoader import *
from Helper.StaticParameters import Parameters
from ModelProcessor import ModelProcessor

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [3]:
# Init model
parameter = Parameters()
modelProcessor = ModelProcessor()

# Get Data
combine_df, train_df_index, test_set1_index, test_set2_index = load_dataset(numeric=True, extract_dataset=False)
combine_df = clean_data(combine_df)
combine_df = normalized_dataset(combine_df)

# Train Models - All Selected Features

In [4]:
#Decision Tree
modelProcessor.train_model(classifier=DecisionTreeClassifier(),parameters=parameter.dtree_parameter,
                           feature_columns= parameter.feature_columns,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)



Best Estimator DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=12, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best Parameters:  {'min_samples_split': 40, 'max_features': 12, 'max_depth': 5}
Best Score:  -3472691.418785714
Test Loss: 2656850.7625




In [5]:
#KNN
modelProcessor.train_model(classifier=KNeighborsRegressor(),parameters=parameter.knn_parameter,
                           feature_columns= parameter.feature_columns,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=4, p=2,
          weights='uniform')
Best Parameters:  {'n_neighbors': 4}
Best Score:  -4075811.8896651785
Test Loss: 3792505.6058


In [6]:
#Linear Regression
modelProcessor.train_model(classifier=LinearRegression(),parameters=parameter.linreg_parameter,
                           feature_columns= parameter.feature_columns,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Best Parameters:  {'fit_intercept': True}
Best Score:  -2527357.885914485
Test Loss: 5727316.4732




In [7]:
#Gradient Boosting
modelProcessor.train_model(classifier=GradientBoostingRegressor(loss="lad"),parameters=parameter.gradient_parameter,
                           feature_columns= parameter.feature_columns,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.001, loss='lad', max_depth=3,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=110,
             min_weight_fraction_leaf=0.0, n_estimators=300,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)
Best Parameters:  {'n_estimators': 300, 'min_samples_split': 110, 'learning_rate': 0.001}
Best Score:  -8002662.743059531
Test Loss: 46253.2311


In [8]:
# Ada Boosting with Linear Regression
modelProcessor.train_model(classifier=AdaBoostRegressor(base_estimator=LinearRegression()),
                           parameters=parameter.ada_parameter,
                           feature_columns= parameter.feature_columns,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
         learning_rate=0.0001, loss='linear', n_estimators=100,
         random_state=None)
Best Parameters:  {'n_estimators': 100, 'learning_rate': 0.0001}
Best Score:  -2522361.641171052
Test Loss: 5719932.1218


# Train Model - Less Features

In [9]:
features = ['bedrooms', 'bathrooms', 'size_sqft', 'addr_zip', 'floor_count', 'min_to_subway', 'has_doorman',
                        'is_furnished', 'allows_pets', 'no_fee','floornumber']

In [10]:
#Decision Tree
modelProcessor.train_model(classifier=DecisionTreeClassifier(),parameters=parameter.dtree_less_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)



Best Estimator DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Best Parameters:  {'min_samples_split': 10, 'max_features': 6, 'max_depth': 15}
Best Score:  -3150202.2895714287
Test Loss: 3681468.6285


In [11]:
#KNN
modelProcessor.train_model(classifier=KNeighborsRegressor(),parameters=parameter.knn_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=6, p=2,
          weights='uniform')
Best Parameters:  {'n_neighbors': 6}
Best Score:  -2543032.1381349205
Test Loss: 5149683.4701


In [12]:
#Linear Regression
modelProcessor.train_model(classifier=LinearRegression(),parameters=parameter.linreg_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
Best Parameters:  {'fit_intercept': True}
Best Score:  -2574970.6086334577
Test Loss: 5709539.9990




In [13]:
#Gradient Boosting
modelProcessor.train_model(classifier=GradientBoostingRegressor(loss="lad"),parameters=parameter.gradient_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.001, loss='lad', max_depth=3,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=130,
             min_weight_fraction_leaf=0.0, n_estimators=250,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)
Best Parameters:  {'n_estimators': 250, 'min_samples_split': 130, 'learning_rate': 0.001}
Best Score:  -8108274.603773771
Test Loss: 34569.2963


In [14]:
# Ada Boosting with Linear Regression
modelProcessor.train_model(classifier=AdaBoostRegressor(base_estimator=LinearRegression()),
                           parameters=parameter.ada_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)

Best Estimator AdaBoostRegressor(base_estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
         learning_rate=1e-05, loss='linear', n_estimators=250,
         random_state=None)
Best Parameters:  {'n_estimators': 250, 'learning_rate': 1e-05}
Best Score:  -2573662.845970374
Test Loss: 5688203.4539


# Test Features

In [15]:
features = [
    'bedrooms', 'bathrooms', 'year_built','addr_zip', 'bathrooms','size_sqft', 'no_fee'
]

In [18]:
#Decision Tree
parameter.dtree_less_parameter['max_features'] = np.arange(2, 6, 2)
modelProcessor.train_model(classifier=DecisionTreeClassifier(),parameters=parameter.dtree_less_parameter,
                           feature_columns= features,
                           train_df= combine_df.iloc[:14000], test_df=combine_df.iloc[14000:],
                           grid=False,train_target='rent',cv_split = 4)



Best Estimator DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
Best Parameters:  {'min_samples_split': 20, 'max_features': 4, 'max_depth': 10}
Best Score:  -3298075.4839285715
Test Loss: 6614827.2700


