In [1]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox, skew

from itertools import chain


from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_train = pd.read_csv('../data/ames_clean_dropped.csv', index_col='id')
df_kag = pd.read_csv('../data/ames_clean_test_dropped.csv', index_col='id')



In [3]:
target = 'saleprice'
y = df_train[target]
X = df_train.drop(target,1)

In [4]:
def dummy_wrapper(X):
    
    X_obj_cols = X.select_dtypes(include='object').columns
    garage_cols = [col for col in X_obj_cols if 'garage' in col]
    bsmt_cols = [col for col in X_obj_cols if 'bsmt' in col]

    
    obj_cols = []
    for col in X_obj_cols:
        if (not col in bsmt_cols) and (not col in garage_cols):
            obj_cols.append(col)
    
    X = pd.get_dummies(data=X, columns=obj_cols,  prefix_sep='__', drop_first=True)
    
    garage = pd.get_dummies(data=X[garage_cols], prefix_sep='__', drop_first=False)
    bsmt = pd.get_dummies(data=X[bsmt_cols], prefix_sep='__', drop_first=False)

    
    X.drop(garage_cols,1, inplace=True)
    X.drop(bsmt_cols,1, inplace=True)
    no_garage_cols = [col for col in garage.columns if 'No Garage' in col]
    no_bsmt_cols = [col for col in bsmt.columns if 'No Basement' in col]

    
    
    keep_garage = no_garage_cols.pop()
    keep_bsmt = no_bsmt_cols.pop()

    garage.drop(labels=no_garage_cols, axis=1, inplace=True)
    bsmt.drop(labels=no_bsmt_cols, axis=1, inplace=True)

    return pd.concat([X, garage, bsmt], axis=1)

In [5]:
X.select_dtypes(exclude='object').apply(skew, 0)

lot_frontage      1.793047
lot_area          9.371408
overall_qual      0.156640
year_built       -0.596243
year_remod/add   -0.435334
mas_vnr_area      2.597052
bsmtfin_sf_1      1.616458
total_bsmt_sf     1.394309
1st_flr_sf        1.552764
gr_liv_area       1.274022
bsmt_full_bath    0.632617
full_bath         0.102101
half_bath         0.737156
totrms_abvgrd     0.841278
fireplaces        0.732460
garage_cars      -0.219673
garage_area       0.204486
wood_deck_sf      2.014519
open_porch_sf     2.315150
dtype: float64

In [6]:
s = X.describe().loc['min'] > 0 

s = [col for col in s.index if s[col]]
for col in s:
    if abs(skew(X[col])) > .25:
        X[col] = pd.Series(np.log1p(X[col]), name=col)
        df_kag[col] = pd.Series(np.log1p(df_kag[col]), name=col)
        

In [7]:
X.select_dtypes(exclude='object').apply(skew, 0)

lot_frontage     -0.995561
lot_area         -0.577488
overall_qual      0.156640
year_built       -0.622709
year_remod/add   -0.442313
mas_vnr_area      2.597052
bsmtfin_sf_1      1.616458
total_bsmt_sf     1.394309
1st_flr_sf        0.004449
gr_liv_area       0.001568
bsmt_full_bath    0.632617
full_bath         0.102101
half_bath         0.737156
totrms_abvgrd     0.087463
fireplaces        0.732460
garage_cars      -0.219673
garage_area       0.204486
wood_deck_sf      2.014519
open_porch_sf     2.315150
dtype: float64

In [8]:
X_dummies = dummy_wrapper(X)
kag_dummies = dummy_wrapper(df_kag)

In [9]:
X_dummies.shape

(2016, 245)

In [10]:
kag_dummies.shape

(879, 228)

In [11]:
for col in kag_dummies.columns:
    if col not in X_dummies.columns:
        print(col)

roof_matl__Metal
roof_matl__Roll
exterior_1st__PreCast
exterior_2nd__Other
exterior_2nd__PreCast
mas_vnr_type__CBlock
heating__GasA
kitchen_qual__Po
sale_type__VWD


In [12]:
for col in kag_dummies.columns:
    if col not in X_dummies.columns:
        kag_dummies.drop(col,1, inplace=True)
        
for col in X_dummies.columns:
    if col not in kag_dummies.columns:
        kag_dummies[col] = pd.Series([0 for _ in range(kag_dummies.shape[0])], index = kag_dummies.index)


In [13]:
X_dummies.sort_index(axis=1, inplace=True)
kag_dummies.sort_index(axis=1, inplace=True)


In [14]:
pf = PolynomialFeatures()
X_dummies_pf = pf.fit_transform(X_dummies)
kag_dummies_pf = pf.fit_transform(kag_dummies)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y)

ss = StandardScaler()
ss.fit(X_train)

X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

kag_sc = ss.transform(kag_dummies)

In [16]:
model = LinearRegression()

model.fit(X_train_sc, y_train)
model.score(X_test_sc, y_test)

-8.600255296127391e+20

In [17]:
cross_val_score(LinearRegression(), X_train_sc, y_train, cv = 10)

array([-6.90546452e+19, -1.68828119e+26, -8.42545358e+18,  8.80959551e-01,
       -3.16057518e+21, -3.73334197e+25, -4.72071184e+20,  9.27513208e-01,
       -5.56689964e+21, -1.22779791e+20])

In [34]:
n_alphas = 10000

lasso_model = LassoCV(eps=.0001, n_alphas=n_alphas, max_iter=1000)
lasso_model.fit(X_train_sc, y_train)
lasso_model.score(X_test_sc, y_test)

0.8896267252791521

In [35]:
mean_squared_error(np.exp(lasso_model.predict(X_test_sc)) - 1, np.exp(y_test) - 1) **.5

27038.559473130255

In [30]:
y_pred = lasso_model.predict(kag_sc)
submission = pd.DataFrame(data=
                         {
                             'SalePrice':y_pred
                         },
                         index=df_kag.index)
submission = np.exp(submission) - 1
submission.sort_index().to_csv('lasso_submission.csv')

In [21]:
ridge_model = RidgeCV(alphas=np.linspace(4780, 4800, 1000))
ridge_model.fit(X_train_sc, y_train)
ridge_model.score(X_test_sc, y_test)

0.8424306408853481

In [22]:
ridge_model.alpha_

4780.0

In [23]:
mean_squared_error(np.exp(ridge_model.predict(X_test_sc)) - 1, np.exp(y_test) - 1) **.5

36577.41463911993

In [24]:
y_pred = ridge_model.predict(kag_sc)
submission = pd.DataFrame(data=
                         {
                             'SalePrice':y_pred
                         },
                         index=df_kag.index)
submission = np.exp(submission) - 1
submission.sort_index().to_csv('ridge_submission.csv')

In [25]:
l1_ratio = [*chain(np.linspace(.01, .3, 1), np.linspace(.6,1, 2))]
n_alphas = 3
max_iter = 1000

elastic_model = ElasticNetCV(l1_ratio=l1_ratio, n_alphas=n_alphas, max_iter=max_iter)
elastic_model.fit(X_train_sc, y_train)
elastic_model.score(X_test_sc, y_test)

0.8910679899322105

In [26]:
mean_squared_error(np.exp(elastic_model.predict(X_test_sc)) - 1, np.exp(y_test) - 1) **.5

28107.981968156684

In [27]:
y_pred = elastic_model.predict(kag_sc)
submission = pd.DataFrame(data=
                         {
                             'SalePrice':y_pred
                         },
                         index=df_kag.index)
submission = np.exp(submission) - 1
submission.sort_index().to_csv('elastic_submission.csv')