In [48]:
from clean_data import *

In [49]:
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [50]:
X1.columns

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd',
       'KitchenQual', 'Fireplaces', 'GarageArea', 'MiscVal', 'bathrm_cnt',
       'patioSF', 'dist', 'income', 'hood_Blmngtn', 'hood_BrDale',
       'hood_BrkSide', 'hood_ClearCr', 'hood_CollgCr', 'hood_Crawfor',
       'hood_Edwards', 'hood_Gilbert', 'hood_Greens', 'hood_IDOTRR',
       'hood_MeadowV', 'hood_Mitchel', 'hood_NPkVill', 'hood_NWAmes',
       'hood_NoRidge', 'hood_NridgHt', 'hood_OldTown', 'hood_SWISU',
       'hood_Sawyer', 'hood_SawyerW', 'hood_Somerst', 'hood_StoneBr',
       'hood_Timber', 'hood_Veenker', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'NearRR', 'NearPos', 'Artery', 'BldgType_2fmCon',
       'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE',
       'HouseStyle_1.5Fin', 'HouseStyle_2Story', 'HouseStyle_SFoyer',
       'HouseStyle_SLvl', 'Bsmt_ratio', 'Bedr_ratio', 'TotalBsmtSF',
       'bsmt_above_ratio'],
      dtype='object')

In [51]:
X1, X_test, Y, Y_test = train_test_split(X1, Y, test_size = 0.3)

In [52]:
scaler = StandardScaler().fit(X1)
features = scaler.transform(X1)
X1_std = pd.DataFrame(features, columns = X1.columns)

In [53]:
svm_model = SVR()
test = svm_model.fit(X1_std, Y)

In [54]:
svm_model.score(X1_std, Y)

0.5102606238230014

In [55]:
test.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [56]:
params = {'C':np.linspace(1,100,20), 'gamma': ['auto', 'scale'], 
          'kernel': ['poly', 'linear', 'rbf']}
rand = RandomizedSearchCV(svm_model, params, n_iter = 15, cv = 5, n_jobs = -1)
rand.fit(X1_std, Y)

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=15, n_jobs=-1,
                   param_distributions={'C': array([  1.        ,   6.21052632,  11.42105263,  16.63157895,
        21.84210526,  27.05263158,  32.26315789,  37.47368421,
        42.68421053,  47.89473684,  53.10526316,  58.31578947,
        63.52631579,  68.73684211,  73.94736842,  79.15789474,
        84.36842105,  89.57894737,  94.78947368, 100.        ]),
                                        'gamma': ['auto', 'scale'],
                                        'kernel': ['poly', 'linear', 'rbf']})

In [57]:
rand.score(X1_std, Y)

0.8757517573590359

In [58]:
rand.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'C': 27.052631578947366}

In [59]:
predictions = rand.predict(X1_std)
print(np.sqrt(mean_squared_error(predictions, Y)))

5.2138365493642285


In [60]:
features = scaler.transform(X_test)
X_test = pd.DataFrame(features, columns = X1_std.columns)

In [61]:
predictions = rand.predict(X_test)
print(np.sqrt(mean_squared_error(predictions, Y_test)))
rand.score(X_test, Y_test)

7.979269933767032


0.7032858173000318

In [62]:
paramDict = {'C':np.linspace(1,100,20), 'gamma':np.linspace(1e-4, 1e-2, 10), 'kernel':['rbf', 'poly', 'linear']}
grid = GridSearchCV(svm_model, paramDict, cv=3, return_train_score = True)
grid.fit(X1_std, Y)

GridSearchCV(cv=3, estimator=SVR(),
             param_grid={'C': array([  1.        ,   6.21052632,  11.42105263,  16.63157895,
        21.84210526,  27.05263158,  32.26315789,  37.47368421,
        42.68421053,  47.89473684,  53.10526316,  58.31578947,
        63.52631579,  68.73684211,  73.94736842,  79.15789474,
        84.36842105,  89.57894737,  94.78947368, 100.        ]),
                         'gamma': array([0.0001, 0.0012, 0.0023, 0.0034, 0.0045, 0.0056, 0.0067, 0.0078,
       0.0089, 0.01  ]),
                         'kernel': ['rbf', 'poly', 'linear']},
             return_train_score=True)

In [63]:
grid.best_params_

{'C': 100.0, 'gamma': 0.0034, 'kernel': 'rbf'}

In [64]:
print(grid.best_score_)
predictions = grid.predict(X1_std)
np.sqrt(mean_squared_error(predictions, Y))

0.731032216016474


6.214219729782248

In [65]:
print(grid.score(X_test, Y_test))
predictions2 = grid.predict(X_test)
np.sqrt(mean_squared_error(predictions2, Y_test))

0.7369316767443672


7.513257658358812

In [66]:
grid.best_estimator_.feature_importances_

AttributeError: 'SVR' object has no attribute 'feature_importances_'