# Linear/ElasticNet Regression

In [11]:
import pandas as pd
import sklearn.model_selection as ms
import numpy as np

In [2]:
train = pd.read_csv('data_train.csv')
test = pd.read_csv('data_test.csv')

In [3]:
X = train.iloc[:,:-1].copy()
y = train['SalePrice'].copy()

In [4]:
#USING CV, NOT TEST SET
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=.3, random_state=0)

In [5]:
from sklearn import linear_model

In [6]:
elastic = linear_model.ElasticNet()

In [7]:
elastic.fit(X_train,y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
scores = ms.cross_val_score(estimator=elastic, X=X_train, y=y_train, cv=5)
print('Scores: {}'.format(scores))
print('Score mean: {}'.format(scores.mean()))
print('Score std deviation: {}'.format(scores.std()))

Scores: [0.80215875 0.86509    0.81748019 0.81152618 0.85397359]
Score mean: 0.830045741538693
Score std deviation: 0.024816096358561482


In [12]:
grid_param = {'alpha': np.logspace(-2, 4, 100),
             'l1_ratio': np.linspace(0,1,20)}

In [13]:
from sklearn.model_selection import GridSearchCV
para_search = GridSearchCV(estimator=elastic, param_grid=grid_param, cv=5, return_train_score=True)
para_search = para_search.fit(X_train, y_train)















































In [15]:
para_search.best_params_

{'alpha': 86.97490026177834, 'l1_ratio': 1.0}

In [16]:
para_search.best_score_

0.8924658801385176

In [18]:
best_elastic = linear_model.ElasticNet(alpha=86.97490026177834, l1_ratio=1.0)
best_elastic = elastic.fit(X_train,y_train)

In [23]:
best_elastic.predict(test)

array([114042.94245364, 162334.42406523, 175009.4025238 , ...,
       160427.31036634, 119622.13776133, 217796.49603476])

In [24]:
#let's see how this does on Kaggle, .165 regular ols performed better

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()

In [29]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
par_grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [32]:
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = par_grid, 
                               n_iter = 100, 
                               cv = 5, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

In [34]:
rf_random.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 17.8min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [36]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 400}

In [41]:
best_rf = RandomForestRegressor(bootstrap= False,
  max_depth = 90,
  max_features = 'sqrt',
  min_samples_leaf = 1,
  min_samples_split = 2,
  n_estimators = 400)

In [42]:
best_rf.fit(X,y)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=90,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [45]:
best_rf.predict(test)

array([127882.3825, 161839.605 , 177784.9375, ..., 161025.395 ,
       114284.89  , 211403.635 ])

In [46]:
#let's see how this does on kaggle: .152 slightly better but very diminishing returns
submission = pd.DataFrame()
submission['Id'] = pd.read_csv('test.csv')['Id']
submission['SalePrice'] = best_rf.predict(test)
submission.to_csv('submission.csv',index=False)