In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
df = pd.read_csv('../datasets/df_engineered.csv', index_col=0)
holdout = pd.read_csv('../datasets/holdout_engineered.csv', index_col=0)

In [3]:
df.shape

(2050, 197)

## Ridge Regression

In [4]:
def ridge_reg_score(X, y = df['saleprice']):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    l_alphas = np.linspace(0, 100, 100)
    ridge_model = RidgeCV(cv=5)
    ridge_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', ridge_model.alpha_)
        
    ridge = Ridge(alpha = ridge_model.alpha_)
    ridge.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(ridge, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    ridge_preds = ridge.predict(X_test_sc)
    ridge_preds_train = ridge.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, ridge_preds_train))
    print('r2 test: ', r2_score(y_test, ridge_preds))
    print('RMSE: ', round(mean_squared_error(y_test, ridge_preds)**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'ridge coef': abs(ridge.coef_)})
    return ridge

In [5]:
ridge_reg_score(df.drop(columns='saleprice'))

optimal alpha:  10.0
cv score:  0.8267
r2 train:  0.9251163421190819
r2 test:  0.9125755987510147
RMSE:  23374.0


Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

## Lasso Regression

In [6]:
def lasso_reg_score(X, y = df['saleprice']):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    l_alphas = np.linspace(0, 500, 100)
    lasso_model = LassoCV(cv=5)
    lasso_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', lasso_model.alpha_)
        
    lasso = Lasso(alpha = lasso_model.alpha_)
    lasso.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(lasso, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    lasso_preds = lasso.predict(X_test_sc)
    lasso_preds_train = lasso.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, lasso_preds_train))
    print('r2 test: ', r2_score(y_test, lasso_preds))
    print('RMSE: ', round(mean_squared_error(y_test, lasso_preds)**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'lasso coef': abs(lasso.coef_)})
    return lasso

In [7]:
lasso_reg_score(df.drop(columns='saleprice'))

optimal alpha:  751.8060998981042
cv score:  0.8346
r2 train:  0.8960624228111449
r2 test:  0.9030776248853241
RMSE:  24611.0


Lasso(alpha=751.8060998981042, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## Elastic Net Regression

In [8]:
def enet_reg_score(X, y = df['saleprice']):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_sc = ss.transform(X_train)
    X_test_sc = ss.transform(X_test)
    
    l_alphas = np.linspace(0, 300, 100)
    enet_model = ElasticNetCV(cv=5)
    enet_model.fit(X_train_sc, y_train)
    print('optimal alpha: ', enet_model.alpha_)
        
    enet = ElasticNet(alpha = enet_model.alpha_)
    enet.fit(X_train_sc, y_train)
    print('cv score: ', round(cross_val_score(enet, X_train_sc, y_train, cv=5).mean(), 4))

    # Generate predictions
    enet_preds = enet.predict(X_test_sc)
    enet_preds_train = enet.predict(X_train_sc)

    # Evaluate model.
    print('r2 train: ', r2_score(y_train, enet_preds_train))
    print('r2 test: ', r2_score(y_test, enet_preds))
    print('RMSE: ', round(mean_squared_error(y_test, enet_preds)**0.5, 0))
    
    coefs = pd.DataFrame({'predictor': X.columns, 'enet coef': abs(enet.coef_)})
    return enet

In [9]:
enet_reg_score(df.drop(columns='saleprice'))

optimal alpha:  130.77652110966824
cv score:  0.3047
r2 train:  0.31019023197451034
r2 test:  0.3121475466065734
RMSE:  65564.0


ElasticNet(alpha=130.77652110966824, copy_X=True, fit_intercept=True,
      l1_ratio=0.5, max_iter=1000, normalize=False, positive=False,
      precompute=False, random_state=None, selection='cyclic', tol=0.0001,
      warm_start=False)