In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv('./datasets/train_cleaned_eng.csv')
df_test = pd.read_csv('./datasets/test_cleaned_eng.csv')

In [3]:
def feat_sel(df):
    
    # Feature Selection for tuning
    X_features = ['Id',
#                 'lot_area', 
                'street', 
#                 'land_cont', 
                'neighborhood', 
                'cond_1',
#                 'cond_2', 
                'bldg_type', 
                'style', 
                'overall_qual',
                'overall_cond',
                'yr_built', 
                'yr_remodeled', 
                'roof_style', 
#                 'roof_mater', 
                'exter_1',
#                 'exter_qual',   #
                'exter_cond', 
                'foundation', 
                'bsmt_cond', 
                'bsmt_fin_1',
#                 'bsmt_fin_1_sf', 
#                 'bsmt_fin_2', 
#                 'bsmt_fin_2_sf', 
#                 'bsmt_sf', 
#                 'heat',
                'cent_air', 
#                 'gr_liv_area', 
                'full_bath', 
                'half_bath', 
                'bedrooms_gr',
#                 'kitchen', 
                'kitch_qual', 
                'tot_rooms_gr', 
                'fireplaces', 
                'garage_type',
                'garage_car_size', 
                'garage_cond', 
                'paved_drive', 
#                 'misc_val',
#                 'year_sold',
#                 'sale_price', 
#                 'has_pool', 
#                 'full_bath_shift',
#                 'half_bath_shift', 
#                 'bedrooms_gr_shift', 
#                 'fireplaces_shift',
#                 'garage_car_size_shift', 
#                 'lot_area_scaled_mm',
#                 'bsmt_fin_1_sf_scaled_mm', 
#                 'bsmt_fin_2_sf_scaled_mm',
#                 'bsmt_sf_scaled_mm', 
#                 'gr_liv_area_scaled_mm',
#                 'tot_rooms_gr_gr_liv_area', 
#                 'tot_rooms_gr_bedroom_gr',
#                 'full_bath_gr_liv_area', 
                'gar_car_size_overall_qual',
#                 'yr_built_overall_qual', 
                'full_bath_gr_liv_area_log',
#                 'tot_rooms_gr_bedroom_gr_log', 
#                 'lot_area_scaled_mm_log',
#                 'gr_liv_area_scaled_mm_log', 
                'bsmt_fin_1_sf_scaled_mm_log',
#                 'bsmt_fin_2_sf_scaled_mm_log', 
                'bsmt_sf_scaled_mm_log',
#                 'bedrooms_gr_shift_log', 
#                 'tot_rooms_gr_log',
                'tot_rooms_gr_gr_liv_area_log',
                  'lot_area_log',
                  'gr_liv_area_log',
#                   'mas_vnr_type'
               ]

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html
    dummies_columns = list(df[X_features].select_dtypes(include='object'))
    
    
    if 'sale_price' not in df.columns:
        X_test = df[X_features]
        X_test = pd.get_dummies(data=X_test, columns=dummies_columns, drop_first=True)
        return X_test
    else:
        X_train = df[X_features]
        return X_train

y = df_train['sale_price']

In [22]:
X_train = feat_sel(df_train)
X_test = feat_sel(df_test)

In [None]:
# IMPORT PICKLE

In [23]:
scalar_columns = list(X_train.select_dtypes(exclude='object'))


X_train, X_validation, y_train, y_validation = train_test_split(X_train, y, random_state=42)
sc = StandardScaler()                  

Z_train = sc.fit_transform(X_train[scalar_columns])
Z_validation = sc.transform(X_validation[scalar_columns]) 

In [24]:
Z_train.shape

(1536, 18)

In [25]:
Z_validation.shape

(513, 18)

In [44]:
ridge_model = Ridge(alpha=1)
ridge_model.fit(Z_train, y_train)

Ridge(alpha=1)

In [49]:
print('Training Ridge score:', ridge_model.score(Z_train, y_train))
print('Test Ridge score:', ridge_model.score(Z_validation, y_validation))  

Training Ridge score: 0.8813161938540166
Test Ridge score: 0.8954756019175981


In [46]:
r_alphas = np.logspace(0, 5, 100)   # Get 100 values from 10^0 to 10^5
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)   # 5-fold CV
ridge_cv.fit(Z_train, y_train);

In [47]:
# ridge_cv.alpha_
ridge_cv.best_score_ 

0.8762554244920997

In [48]:
ridge_model.score(Z_train, y_train)

0.8813161938540166

In [50]:
print('Training RidgeCV score: ', ridge_cv.score(Z_train, y_train))
print('Test RidgeCV score: ', ridge_cv.score(Z_validation, y_validation))

Training RidgeCV score:  0.8813161938540166
Test RidgeCV score:  0.8954756019175981


In [14]:
l_alphas = np.logspace(-3, 0, 100)   # 10^-3 = .001 up to 10^0 = 1
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000)
lasso_cv.fit(Z_train, y_train);

In [15]:
lasso_cv.alpha_

1.0

In [51]:
print('Training LassoCV score: ', lasso_cv.score(Z_train, y_train))
print('Test LassoCV score: ', lasso_cv.score(Z_validation, y_validation))

Training LassoCV score:  0.8816577047040306
Test LassoCV score:  0.8947289572783704


In [17]:
ridge_cv.coef_

array([  1144.64876423, -25645.01942111,   7595.11815838,  11298.83145143,
         3102.23709454,  15031.65111386,    403.72346257,  -5539.90756224,
        30207.89675564,   3751.34455271, -46911.21563705,  96488.886475  ,
       -32426.96044208,   9826.5092647 ,   9123.62802456, -58681.16502173,
         8853.97097899,  78508.1412782 ])

In [18]:
lasso_cv.coef_

array([  1100.21823093, -27434.55148592,   7752.49818572,  11619.35521023,
         3040.81327515,  22884.87650116,    169.63801539,  -5220.87278266,
        33735.05398082,   3740.04818375, -48570.52914549,  99568.6613803 ,
       -50438.53910444,   9746.60965819,   9065.62454563, -68011.14349897,
         8731.18952565,  96586.67539116])

In [19]:
Z_train.shape

(1536, 18)

In [20]:
Z_validation.shape

(513, 18)