In [1]:
%matplotlib inline

import datetime
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import PolynomialFeatures as PF
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn import linear_model as lm
from sklearn import preprocessing
from sklearn import metrics
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [27]:
from sklearn.ensemble import RandomForestRegressor as RFR, ExtraTreesRegressor as ETR, GradientBoostingRegressor as GBR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
import time

train = pd.read_csv('data/train_fe.csv',delimiter=',',header=0)
train_labels = train["SalePrice"].copy()
train = train.drop( ["SalePrice"], axis=1 )
test = pd.read_csv('data/test_fe.csv',delimiter=',',header=0)

def simple_rmse(model, X, Y):
    y_pred = model.predict(X)
    rmse = -np.sqrt(mean_squared_error(y_pred, Y))
    return rmse

def rmse_cv(model, X_train, y):
    rmse= np.mean(np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5)))
    return(rmse)

print train.shape
print test.shape

(1460, 663)
(1459, 814)


In [25]:
rfr_model = RFR() #n_estimators, max_features, max_depth, bootstrap
etr_model = ETR() #n_estimatros, criterion, max_features, max_depth, bootstrap
gbr_model = GBR() #loss, learning_rate, n_estimators, max_depth, max_features, alpha
knr_model = KNR() #n_neighbors, weights
dtr_model = DTR() #max_features, max_depth
lsvr_model = SVR(kernel="linear") #C, epsilon, kernel, degree
psvr_model = SVR(kernel="poly") #C, epsilon, kernel, degree
rsvr_model = SVR(kernel="rbf") #C, epsilon, kernel, degree
gpr_model = GPR() #none

x = train
y = train_labels


In [28]:
##rfr
start = time.time()
rfr_params = {"n_estimators": [1,5,10,15,20,25,30,35,40,45,50],
             "max_features": ["auto","log2",None]}
gs1 = GridSearchCV(rfr_model, rfr_params, scoring = simple_rmse)
gs1.fit(x,y)
end = time.time()
print end-start
print gs1.best_estimator_
print gs1.best_score_
print gs1.best_params_
print gs1.cv_results_['mean_test_score']

83.3079998493
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
-0.153664545677
{'max_features': None, 'n_estimators': 30}
[-0.23330051 -0.182618   -0.16335608 -0.15953965 -0.15967473 -0.15559503
 -0.15560438 -0.15513566 -0.15443269 -0.1558685  -0.1553004  -0.31186511
 -0.21273691 -0.18286196 -0.18366161 -0.17875186 -0.17964826 -0.17630899
 -0.17405626 -0.17683347 -0.17347614 -0.17130541 -0.23712801 -0.17350446
 -0.16457753 -0.1587088  -0.15601605 -0.15708441 -0.15366455 -0.15502137
 -0.15542548 -0.15489488 -0.15559301]


In [None]:
##etr
start = time.time()
etr_params = {"n_estimators": [1,5,10,15,20,25,30,35,40,45,50],
             "max_features": ["auto","log2",None],
             "criterion": ["mse", "mae"],
             "bootstrap": [True, False]}
gs2 = GridSearchCV(etr_model, etr_params, scoring = simple_rmse)
gs2.fit(x,y)
end = time.time()
print end-start
print gs2.best_estimator_
print gs2.best_score_
print gs2.best_params_
print gs2.cv_results_['mean_test_score']


In [None]:
##gbr
start = time.time()
gbr_params = {"n_estimators": [10,50,100,150,200,250],
             "max_features": ["auto","log2",None],
             "max_depth": [1,2,3,4,5,6,7,8,9,10],
             "learning_rate": [.001, .01, .1, 1, 10],
             "loss": ["ls", "lad", "huber"]}
gs3 = GridSearchCV(gbr_model, gbr_params, scoring = simple_rmse)
gs3.fit(x,y)
end = time.time()
print end-start
print gs3.best_estimator_
print gs3.best_score_
print gs3.best_params_
print gs3.cv_results_['mean_test_score']

In [None]:
##knr
start = time.time()
knr_params = {"n_neighbors": [1,2,3,4,5,6,7,8,9,10],
             "weights": ["uniform", "distance"]}
gs4 = GridSearchCV(knr_model, knr_params, scoring = simple_rmse)
gs4.fit(x,y)
end = time.time()
print end-start
print gs4.best_estimator_
print gs4.best_score_
print gs4.best_params_
print gs4.cv_results_['mean_test_score']

In [None]:
##dtr
start = time.time()
dtr_params = {"max_features": ["auto","log2",None],
             "criterion": ["mse", "mae"]}
gs5 = GridSearchCV(dtr_model, dtr_params, scoring = simple_rmse)
gs5.fit(x,y)
end = time.time()
print end-start
print gs5.best_estimator_
print gs5.best_score_
print gs5.best_params_
print gs5.cv_results_['mean_test_score']

In [None]:
##linear svr
start = time.time()
lsvr_params = {"C": [.001,.01,.1,1,10,100]}
gs6 = GridSearchCV(lsvr_model, lsvr_params, scoring = simple_rmse)
gs6.fit(x,y)
end = time.time()
print end-start
print gs6.best_estimator_
print gs6.best_score_
print gs6.best_params_
print gs6.cv_results_['mean_test_score']

In [None]:
##poly svr
start = time.time()
psvr_params = {"C": [.001,.01,.1,1,10,100],
              "degree": [1,2,3]}
gs7 = GridSearchCV(psvr_model, psvr_params, scoring = simple_rmse)
gs7.fit(x,y)
end = time.time()
print end-start
print gs7.best_estimator_
print gs7.best_score_
print gs7.best_params_
print gs7.cv_results_['mean_test_score']

In [22]:
##rbf svr
start = time.time()
rsvr_params = {"C": [.001,.01,.1,1,10,100]}
gs8 = GridSearchCV(rsvr_model, rsvr_params, scoring = simple_rmse)
gs8.fit(x,y)
end = time.time()
print end-start
print gs8.best_estimator_
print gs8.best_score_
print gs8.best_params_
print gs8.cv_results_['mean_test_score']

96.7350001335
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_C', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score']


In [None]:
##gpr
print rmse_cv(gpr_model, x, y)
#print sorted(gs9.cv_results_.keys())