# Ames Housing Sales Price Model-Optimization

### Paul Schimek

March 25, 2019

In [3]:
import numpy as np
import pandas as pd
import patsy
import pickle

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler;

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
df = pd.read_csv('./datasets/train.csv')

In [7]:
# variable transformations
def transform(df):
    df['culdesac'] = df['Lot Config'].apply(lambda x: 1 if x == 'CulDSac' else 0)
    df['fancysuburb'] = df['Neighborhood'].apply(lambda x: 1 if x in ['GrnHill','NoRidge','NridgHt','StoneBr'] else 0)
    df['suburb'] = df['Neighborhood'].apply(lambda x: 1 if x in ['Blmngtn','ClearCr','CollgCr','Crawfor','Gilbert',
                                                             'Greens','NWAmes','SawyerW','Somerst','Timber',
                                                             'Veenker','Mitchel'] else 0)
    df['near_road'] = df['Condition 1'].apply(lambda x: 1 if x in ['Artery','Feedr'] else 0)
    df['bad_qual'] = df['Overall Qual'].apply(lambda x: 1 if x < 4 else 0)
    df['exc_qual'] = df['Overall Qual'].apply(lambda x: 1 if x > 8 else 0)
    df['good_qual'] = df['Overall Qual'].apply(lambda x: 1 if  x == 7 else 0)
    df['very_good_qual'] = df['Overall Qual'].apply(lambda x: 1 if x == 8 else 0)
    df['below_avg_qual'] = df['Overall Qual'].apply(lambda x: 1 if x == 4 else 0) # 5 = average - reference case
    df['above_avg_qual'] = df['Overall Qual'].apply(lambda x: 1 if x == 6 else 0)
    df['lot_ar_sq'] = df['Lot Area']**2
    df['lot_ar_cu'] = df['Lot Area']**3
    df['live_ar_sq'] = df['Gr Liv Area']**2
    df['live_ar_cu'] = df['Gr Liv Area']**3
    df['year_built_sq'] = df['Year Built']**2
    df['frontageLT35'] = df['Lot Frontage'].apply(lambda x: 1 if x <35 else 0)
    df['frontageLT50'] = df['Lot Frontage'].apply(lambda x: 1 if 35 <= x < 50 else 0)
    df['ac'] = df['Central Air'].apply(lambda x: 1 if x =='Y' else 0)
    df['zoning'] = df['MS Zoning'].apply(lambda x: 1 if x in ['FV','RL'] else 0)
    df['concrete'] = df['Foundation'].apply(lambda x: 1 if x == 'PConc' else 0)
    df['exc_basement'] = df['Bsmt Qual'].apply(lambda x: 1 if x == 'Ex' else 0)
    df['new'] = df['Year Built'].apply(lambda x: 1 if x > 2005 else 0)
    df['remodel'] = df['Year Remod/Add'].apply(lambda x: 1 if x > 2005 else 0)
    df['new_const'] = df['Sale Type'].apply(lambda x: 1 if x == 'New' else 0)
    df['exc_kitchen'] = df['Kitchen Qual'].apply(lambda x: 1 if x == 'Ex' else 0)
    df['good_kitchen'] = df['Kitchen Qual'].apply(lambda x: 1 if x == 'Gd' else 0)
    df['num_cars'] = df['Garage Cars'].apply(lambda x: 2 if x == 5 else x) # the single case with 5 seems to be an error
    df['damage'] = df['Functional'].apply(lambda x: 0 if x == 'Typ' else 1)
    df['num_cars'] = df['num_cars'].replace(np.NaN, 0)
    df['0_garage'] = df['num_cars'].apply(lambda x: 1 if x == 0 else 0)
    df['1_garage'] = df['num_cars'].apply(lambda x: 1 if x == 1 else 0)
    df['3_garage'] = df['num_cars'].apply(lambda x: 1 if x == 3 else 0)
    df['4_garage'] = df['num_cars'].apply(lambda x: 1 if x == 4 else 0)    
    df['bad_condition'] = df['Overall Cond'].apply(lambda x: 1 if x <5 else 0)
    df['good_basement'] = df['Bsmt Qual'].apply(lambda x: 1 if x == 'Gd' else 0)
    df['old_1story'] = df['MS SubClass'].apply(lambda x: 1 if x ==  30 else 0)
    df['new_2story'] = df['MS SubClass'].apply(lambda x: 1 if x ==  60 else 0)
    df['hill'] = df['Land Contour'].apply(lambda x: 1 if x == 'HLS' else 0)
    df['corner'] = df['Lot Config'].apply(lambda x: 1 if x == 'Corner' else 0)
    df['near_artery'] = df['Condition 1'].apply(lambda x: 1 if x == 'Artery' else 0)
    df['near_feeder'] = df['Condition 1'].apply(lambda x: 1 if x == 'Feedr' else 0)
    df['near_park'] = df['Condition 1'].apply(lambda x: 1 if x in ['PosA','PosN'] else 0)
    df['hip_roof'] = df['Roof Style'].apply(lambda x: 1 if x == 'Hip' else 0)
    df['l_lot_area'] = np.log(df['Lot Area'])
    df['attached_garage'] =  df['Garage Type'].apply(lambda x: 1 if x in ['Attchd','BuiltIn'] else 0)
    df['full_bath_0'] = df['Full Bath'].apply(lambda x: 1 if x == 0 else 0) 
    df['full_bath_2'] = df['Full Bath'].apply(lambda x: 1 if x == 2 else 0)
    df['full_bath_3'] = df['Full Bath'].apply(lambda x: 1 if x == 3 else 0)
    df['full_bath_4'] = df['Full Bath'].apply(lambda x: 1 if x > 3 else 0)
    df['half_bath_1'] = df['Half Bath'].apply(lambda x: 1 if x == 1 else 0) 
    df['half_bath_2'] = df['Half Bath'].apply(lambda x: 1 if x > 1 else 0)
    df['breakers'] = df['Electrical'].apply(lambda x: 1 if x == 'SBrkr' else 0)
    df['unf_garage']=df['Garage Finish'].apply(lambda x: 1 if x == 'Unf' else 0)
    df['paved_driveway']=df['Paved Drive'].apply(lambda x: 1 if x == 'Y' else 0)
    df['two_fam']=df['Bldg Type'].apply(lambda x: 1 if x in ['2fmCon','Duplex','Twnhs'] else 0)
    df['two_story']=df['House Style'].apply(lambda x: 1 if x in ['2Story','2.5Fin','2.5Unf'] else 0)
    df['l_living_area'] = np.log(df['Gr Liv Area'])
    df['2010'] = df['Yr Sold'].apply(lambda x: 1 if x == 2010 else 0) # reference is 2006
    df['2009'] = df['Yr Sold'].apply(lambda x: 1 if x == 2009 else 0)
    df['2008'] = df['Yr Sold'].apply(lambda x: 1 if x == 2008 else 0)
    df['2007'] = df['Yr Sold'].apply(lambda x: 1 if x == 2007 else 0)
    df['cement_board'] = df['Exterior 1st'].apply(lambda x: 1 if x == 'CemntBd' else 0)
    df['vinyl_siding'] = df['Exterior 1st'].apply(lambda x: 1 if x == 'VinylSd' else 0)
    df['basement_area'] = df['Total Bsmt SF'].replace(np.NaN, 0) # recode single NaN as 0
    

In [8]:
transform(df)

In [11]:
 # read list of features from basemodel notebook
with open('features', 'rb') as f:
    features = pickle.load(f)

In [12]:
# set X and Y for full training sample
X = df[features]
y = df['SalePrice']
l_y = np.log(y)

In [13]:
# create instances of different models
lr = LinearRegression()
lasso = LassoCV(cv=5,max_iter=5000)
ridge = RidgeCV(cv=5)
elastic = ElasticNetCV(cv=5,max_iter=5000)

In [14]:
# scale Xs
ss = StandardScaler()
ss.fit(X) # learning mean and s.d. for every column
X_sc = ss.transform(X)

In [15]:
cross_val_score(lr, X_sc, l_y, cv=5).mean()

0.8781069765747412

In [16]:
cross_val_score(lasso, X_sc, l_y, cv=5).mean()

0.8780093264970461

In [17]:
cross_val_score(ridge, X_sc, l_y, cv=5).mean()

0.8780974010944174

In [18]:
lasso.fit(X_sc,l_y)
lasso.alpha_

0.0002828238779940122

In [19]:
lasso.score(X_sc,l_y)

0.8907277515231047

In [20]:
ridge.fit(X_sc,l_y)
ridge.alpha_

0.1

In [21]:
ridge.score(X_sc,l_y)

0.8907813965041373

### Polynomial and interaction transformations

In [22]:
poly = PolynomialFeatures(degree=2,include_bias=False) 
X_poly = poly.fit_transform(X_sc)

In [23]:
cross_val_score(lasso, X_poly, l_y, cv=5).mean()

0.8773800867756861

In [24]:
lasso.fit(X_poly,l_y)
lasso.alpha_

0.007994106175169066

In [56]:
lasso.score(X_poly,l_y)

0.9014507316755946

In [25]:
# identify which coefficients are non-zero after using LASSO on poly model
coeffs = [coef for coef in lasso.coef_]

model_coef = {}
for index, coef in enumerate(coeffs):
    if np.abs(coef) >.001:
        model_coef.update({index:coef})
for key in model_coef.keys():
    for index, item in enumerate(poly.get_feature_names(X.columns)):
        if index == key:
            model_coef[item] = model_coef.pop(key)
            print (index,item)

  for c in combinations)


0 Gr Liv Area
2 basement_area
3 Lot Area
15 suburb
19 Year Built
25 Fireplaces
27 unf_garage
28 num_cars
30 concrete
40 good_kitchen
90 live_ar_sq^2
91 live_ar_sq basement_area
94 live_ar_sq TotRms AbvGrd
219 frontageLT35^2
238 frontageLT35 zoning
271 TotRms AbvGrd near_artery
276 TotRms AbvGrd two_fam
284 TotRms AbvGrd paved_driveway
291 TotRms AbvGrd bad_qual
320 Bedroom AbvGr breakers
368 full_bath_0 bad_qual
414 full_bath_3^2
420 full_bath_3 suburb
505 half_bath_2 exc_qual
516 half_bath_2 bad_condition
519 hill^2
552 culdesac^2
1052 remodel^2
1059 exc_kitchen^2
1070 damage^2
1074 bad_condition^2
1076 bad_condition 2010


In [26]:
ridge.fit(X_poly,y)

RidgeCV(alphas=(0.1, 1.0, 10.0), cv=5, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

In [27]:
ridge.score(X_poly,y)

0.9681351504716545

In [29]:
ridge.alpha_

10.0

In [32]:
# use CV to test models
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

y_train = np.log(y_train)  # estimating log-linear models
y_test = np.log(y_test)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)
poly = PolynomialFeatures(degree=2,interaction_only=False,include_bias=False) 
X_train_poly = poly.fit_transform(X_train_sc)
X_test_poly = poly.fit_transform(X_test_sc)

In [30]:
poly.n_output_features_

1080

In [33]:
lasso.fit(X_train_poly,y_train)
lasso.score(X_train_poly,y_train)

0.9081166685486177

In [35]:
lasso.score(X_test_poly,y_test)

0.8798964828299061

In [36]:
lasso.alpha_

0.008971436886568667

In [34]:
ridge.fit(X_train_poly,y_train)
ridge.score(X_train_poly,y_train)

0.9729343485596982

In [38]:
ridge.score(X_test_poly,y_test)

0.4836514580767832

In [37]:
elastic.fit(X_train_poly,y_train)
elastic.score(X_train_poly,y_train)

0.9079418497850235

In [39]:
elastic.score(X_test_poly,y_test)

0.879736098280242

In [40]:
elastic.alpha_

0.017942873773137337

#### The lasso model seems to work best. Apply it to the full training data, and use the resulting model to predict SalePrice in the test data.

In [42]:
# Read and transform test data, including scaling and polynomials
test = pd.read_csv('./datasets/test.csv')
transform(test)
X_kaggle = test[features]
# X_kaggle = sm.add_constant(X_kaggle)  # just to have the same size array as X
X_kaggle_sc = ss.transform(X_kaggle) # using mean & s.d. from TRAIN set to scale test set
X_kaggle_poly = poly.fit_transform(X_kaggle_sc)

In [44]:
lasso.fit(X_poly,l_y)
lasso.alpha_

0.007994106175169066

In [45]:
# generate predicted y (SalePrice)
test['SalePrice'] = np.exp(lasso.predict(X_kaggle_poly))

In [46]:
submit = test[['Id','SalePrice']]
submit.to_csv('submit21.csv',index=False)