In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df_train = pd.read_csv('./datasets/train_cleaned_eng.csv')
df_test = pd.read_csv('./datasets/test_cleaned_eng.csv')

In [80]:
def feat_sel(df):
    
    # Feature Selection for tuning
    X_features = ['Id', 
#                 'lot_area', 
                'street', 
#                 'land_cont', 
                'neighborhood', 
                'cond_1',
#                 'cond_2', 
                'bldg_type', 
                'style', 
                'overall_qual',
                'overall_cond',
                'yr_built', 
                'yr_remodeled', 
                'roof_style', 
#                 'roof_mater', 
                'exter_1',
#                 'exter_qual',   #
                'exter_cond', 
                'foundation', 
                'bsmt_cond', 
                'bsmt_fin_1',
#                 'bsmt_fin_1_sf', 
#                 'bsmt_fin_2', 
#                 'bsmt_fin_2_sf', 
#                 'bsmt_sf', 
#                 'heat',
                'cent_air', 
#                 'gr_liv_area', 
                'full_bath', 
                'half_bath', 
                'bedrooms_gr',
#                 'kitchen', 
                'kitch_qual', 
                'tot_rooms_gr', 
                'fireplaces', 
                'garage_type',
                'garage_car_size', 
                'garage_cond', 
                'paved_drive', 
#                 'misc_val',
#                 'year_sold',
#                 'sale_price', 
#                 'has_pool', 
#                 'full_bath_shift',
#                 'half_bath_shift', 
#                 'bedrooms_gr_shift', 
#                 'fireplaces_shift',
#                 'garage_car_size_shift', 
#                 'lot_area_scaled_mm',
#                 'bsmt_fin_1_sf_scaled_mm', 
#                 'bsmt_fin_2_sf_scaled_mm',
#                 'bsmt_sf_scaled_mm', 
#                 'gr_liv_area_scaled_mm',
#                 'tot_rooms_gr_gr_liv_area', 
#                 'tot_rooms_gr_bedroom_gr',
#                 'full_bath_gr_liv_area', 
                'gar_car_size_overall_qual',
#                 'yr_built_overall_qual', 
                'full_bath_gr_liv_area_log',
#                 'tot_rooms_gr_bedroom_gr_log', 
#                 'lot_area_scaled_mm_log',
#                 'gr_liv_area_scaled_mm_log', 
                'bsmt_fin_1_sf_scaled_mm_log',
#                 'bsmt_fin_2_sf_scaled_mm_log', 
                'bsmt_sf_scaled_mm_log',
#                 'bedrooms_gr_shift_log', 
#                 'tot_rooms_gr_log',
                'tot_rooms_gr_gr_liv_area_log',
                  'lot_area_log',
                  'gr_liv_area_log',
#                   'mas_vnr_type'
               ]

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html
    dummies_columns = list(df[X_features].select_dtypes(include='object'))
    scalar_columns = list(df[X_features].select_dtypes(exclude='object'))
    
    if 'sale_price' not in df.columns:
        X_test = df[X_features]
        X_test = pd.get_dummies(data=X_test, columns=dummies_columns, drop_first=True)
        return X_test
    else:
        Z_train = df[X_features]
        Z_train = pd.get_dummies(data=Z_train, columns=dummies_columns, drop_first=True)
 
####################
#         X_train, X_validation, y_train, y_validation = train_test_split(Z_train, y, random_state=42)
#         sc = StandardScaler()                  
        
#         Z_train = sc.fit_transform(X_train)
#         Z_validation = sc.transform(X_validation)                
####################

#         Z_train_zip = list(zip(scalar_columns,Z_train))
#         Z_validation_zip = list(zip(scalar_columns,Z_validation))
        
#         Z_train = pd.DataFrame(Z_train_zip, columns = scalar_columns) 
#         Z_validation = pd.DataFrame(Z_validation_zip, columns = scalar_columns) 
        
#         print(Z_train)
#         print(Z_validation)
        
        return Z_train#, Z_validation

y = df_train['sale_price']

In [83]:
Z_train_df = feat_sel(df_train)

In [84]:
X_train, X_validation, y_train, y_validation = train_test_split(Z_train_df, y, random_state=42)
sc = StandardScaler()                  
        
Z_train = sc.fit_transform(X_train)
Z_validation = sc.transform(X_validation)  

In [85]:
Z_train.shape

(1536, 94)

In [86]:
Z_validation.shape

(513, 94)

In [88]:
ridge_model = Ridge(alpha=10)
ridge_model.fit(Z_train, y_train)

Ridge(alpha=10)

In [89]:
print('Training score:', ridge_model.score(Z_train, y_train))
print('Test score:', ridge_model.score(Z_validation, y_validation))  

Training score: 0.9101423525889385
Test score: 0.9152084449554241


In [93]:
r_alphas = np.logspace(0, 5, 100)   # Get 100 values from 10^0 to 10^5
ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)   # 5-fold CV
ridge_cv.fit(Z_train, y_train);

In [99]:
ridge_cv.alpha_
ridge_cv.best_score_ 

0.8988683153328948

In [97]:
ridge_model.score(Z_train, y_train)

0.9101423525889385

In [101]:
print('Training score: ', ridge_cv.score(Z_train, y_train))
print('Test score: ', ridge_cv.score(Z_validation, y_validation))

Training score:  0.9140056063325047
Test score:  0.9174877007727654


In [105]:
l_alphas = np.logspace(-3, 0, 100)   # 10^-3 = .001 up to 10^0 = 1
lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000)
lasso_cv.fit(Z_train, y_train);

In [106]:
lasso_cv.alpha_

1.0

In [107]:
print('Training score: ', lasso_cv.score(Z_train, y_train))
print('Test score: ', lasso_cv.score(Z_validation, y_validation))

Training score:  0.9143310868288729
Test score:  0.9169528888296705


In [108]:
ridge_cv.coef_

array([ 1.57721788e+03, -1.73253889e+04,  7.05140098e+03,  1.03303546e+04,
        2.57385279e+03,  1.87554486e+04,  2.20613969e+03, -3.63804464e+03,
        2.12644619e+04,  2.89258113e+03, -3.08027263e+04,  6.28955137e+04,
       -3.68954378e+04,  9.61161272e+03,  1.17369640e+04, -4.14164012e+04,
        8.20379550e+03,  6.74174575e+04,  6.71378107e+02,  2.63118334e+03,
        2.30102050e+03,  1.90915297e+03,  6.27445900e+02,  4.23420301e+03,
        8.94423575e+02,  7.45975318e+02,  4.80623677e+02, -3.64762249e+02,
        9.71796903e+02,  9.33587497e+02,  2.31880671e+03,  5.68735795e+03,
        7.13842883e+03,  6.98487345e+02,  1.43086077e+03,  1.33916473e+03,
        1.17849113e+03,  3.40162973e+03,  6.03785357e+03,  1.49874484e+03,
        2.14211265e+02,  1.44381669e+03,  1.63490041e+03, -9.09190763e+02,
        2.39525419e+02, -9.63636749e+02, -3.30560043e+03, -2.51070710e+03,
       -3.02072573e+03,  5.54614760e+02, -1.40193063e+02,  6.75116991e+02,
        1.01396538e+03,  

In [109]:
lasso_cv.coef_

array([ 1.55136020e+03, -1.93382549e+04,  7.10555740e+03,  1.04728642e+04,
        2.48923573e+03,  2.79293117e+04,  1.91926909e+03, -3.37163549e+03,
        2.46644038e+04,  2.87601635e+03, -3.32320416e+04,  6.68719985e+04,
       -5.76922293e+04,  9.63637956e+03,  1.15348866e+04, -5.01636695e+04,
        7.94829593e+03,  8.70318041e+04,  6.56826805e+02,  2.63221098e+03,
        2.25540709e+03,  1.83983879e+03,  6.94253168e+02,  4.24137255e+03,
        8.17551811e+02,  9.18761887e+02,  3.74553907e+02, -4.74238265e+02,
        9.96954886e+02,  9.81000115e+02,  2.29069849e+03,  5.51954628e+03,
        7.10424997e+03,  6.55215076e+02,  1.36051556e+03,  1.34135500e+03,
        1.19449116e+03,  3.34738370e+03,  5.83230311e+03,  1.53994315e+03,
        1.20612627e+02,  1.46697710e+03,  1.70765249e+03, -8.61575586e+02,
        2.38452272e+02, -9.36085058e+02, -3.33254813e+03, -2.59807345e+03,
       -3.05992613e+03,  5.20161541e+02, -1.06123558e+02,  6.23540289e+02,
        1.00307278e+03,  

In [74]:
Z_train_df, Z_validation_df = feat_sel(df_train)
# X_test_df = feat_sel(df_test)

In [75]:
Z_train_df.shape

(1536, 94)

In [76]:
Z_validation_df.shape

(513, 94)