In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from math import exp

from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 300)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [2]:
from jj_imputer import HousingImpute
from jj_dummification import *

In [3]:
housing = HousingImpute('train.csv')
housing.run_imputers()
housing.left_to_impute()

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406
MasVnrArea imputer embedded in another imputer
BsmtCond imputer embedded in another imputer
BsmtExposure imputer embedded in another imputer
BsmtFinType1 imputer embedded in another imputer
BsmtFinType2 imputer embedded in another imputer
GarageYrBlt imputer embedded in another imputer
GarageFinish imputer embedded in another imputer
GarageQual imputer embedded in another imputer
GarageCond imputer embedded in another imputer
BsmtExposure 1
--------------------
Id
949    NaN
Name: BsmtExposure, dtype: object
--------------------
BsmtFinType2 1
--------------------
Id
333    NaN
Name: BsmtFinType2, dtype: object
--------------------


In [4]:
housing.df.loc[949, 'BsmtExposure'] = 'No'
housing.df.loc[333, 'BsmtFinType2'] = 'Rec'

In [5]:
#final check for missing values
housing.left_to_impute()
housing.df.shape

(1460, 80)

In [6]:
housing.df.drop('Utilities', axis=1, inplace=True)
#drop utilities since 1459 out of 1600 share same value
housing.df.drop('TotalBsmtSF', axis=1, inplace=True)
#drop TotalBsmtSF for multicollinearity since it is the sum of the 3 other BsmtSF columns

housing.df.loc[:,'Exterior2nd'].replace('CmentBd','CemntBd', inplace=True)
#correcting for spelling errors to make the types of values in Exterior1st and Exterior2nd the same

In [7]:
#jimmy's list of outliers by eye test. manually placing here for easy reference in future
final_outlier_id = {935: 'LotFrontage',
                     1299: ['LotFrontage', 'BsmtFinSF1', '1stFlrSF', 'GrLivArea', 'GarageArea'],
                     250: 'LotArea',
                     314: 'LotArea',
                     336: 'LotArea',
                     707: 'LotArea',
                     298: 'MasVnrArea',
                     1170: 'MasVnrArea',
                     186: 'LowQualFinSF',
                     524: 'GrLivArea',
                     582: 'GarageArea',
                     1062: 'GarageArea',
                     1191: 'GarageArea',
                     54: 'WoodDeckSF',
                     496: 'OpenPorchSF',
                     1183: 'PoolArea'}

In [8]:
new_housing = HousingCategorical()
new_housing.df = housing.df
new_housing.df.shape
new_housing.ohe_features = new_housing.ohe_features + new_housing.label_encode_features

# new_housing.list_checker()
new_housing.one_hot_encode()

(1460, 78)

In [9]:
new_housing.df.shape

(1460, 284)

In [10]:
new_housing.df.drop(final_outlier_id, axis=0, inplace=True)

In [13]:
new_housing.df.shape

(1444, 284)

Outliers have been removed in jj_lr and df is fully OHE. Probably use min_max scaler (retest this at the end). Now we optimize for lasso/ridge alphas. See which features can be removed.

In [11]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

continuous = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','1stFlrSF',
                 '2ndFlrSF','LowQualFinSF','GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch',
                  '3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearBuilt','YearRemodAdd','GarageYrBlt']

min_max = MinMaxScaler()
standard_scaler = StandardScaler()

In [14]:
housing = new_housing.df

In [15]:
housing.iloc[:,:-1] = min_max.fit_transform(housing.iloc[:,:-1])

In [16]:
housing.sample(2)

Unnamed: 0_level_0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_No_Alley,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CemntBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_No_Bsmt,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_No_Bsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_No_G,PavedDrive_P,PavedDrive_Y,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_No_Fence,MiscFeature_No_MF,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_No_Bsmt,BsmtQual_TA,BsmtCond_Gd,BsmtCond_No_Bsmt,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_No_Bsmt,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_No_FP,FireplaceQu_Po,FireplaceQu_TA,GarageFinish_No_G,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_No_G,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_No_G,GarageCond_Po,GarageCond_TA,PoolQC_Fa,PoolQC_Gd,PoolQC_No_Pool,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1
1018,0.24946,0.06499,0.77778,0.5,0.81159,0.56667,0.0,0.47349,0.0,0.07877,0.35453,0.0,0.0,0.25766,0.33333,0.0,0.33333,0.0,0.125,0.33333,0.16667,0.33333,0.81159,0.5,0.46311,0.0856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,187500
1163,0.54658,0.10688,0.44444,0.5,0.69565,0.3,0.0,0.22486,0.0,0.17209,0.1935,0.0,0.0,0.14063,0.0,0.0,0.33333,0.0,0.375,0.33333,0.25,0.33333,0.69565,0.5,0.36885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,129000


In [17]:
housing.shape

(1444, 284)

In [18]:
random_state = 24
def score_log_lasso(alphas):
    score_log = []
    
    k6 = KFold(n_splits=6, shuffle=True, random_state=random_state)

    train = []
    test = []
    for train_idx, val_idx in k6.split(housing):
        train.append(train_idx)
        test.append(val_idx)
    
    for alpha in alphas:
        lasso = Lasso(random_state=random_state, alpha=alpha, max_iter=10000)
        for i in range(6):
            single_log = []
           
            X_train = housing.iloc[train[i],:-1]
            X_test = housing.iloc[test[i],:-1]
            y_train = np.log(housing.iloc[train[i],-1])
            y_test = housing.iloc[test[i],-1]
            
            lasso.fit(X_train, y_train)
            
            single_log.append(random_state)
            single_log.append(alpha*10000)
            single_log.append(i)
            single_log.append(lasso.score(X_train, y_train))
            single_log.append(lasso.score(X_test, np.log(y_test)))
            y_pred = list(map(lambda x: exp(x),lasso.predict(X_test)))
            
            single_log.append(mean_squared_error(y_test,y_pred)**0.5)
            score_log.append(single_log)
    
    score_df = pd.DataFrame(score_log, columns=['RandomState','Alpha','Fold','Train','Test','MSE'])
    
    return score_df
            
            

In [19]:
lasso_stan_all_scale= score_log_lasso(np.linspace(0.0012,0.002,19))
lasso_stan_all_scale.groupby('Alpha').mean()['MSE']

Alpha
12.00000   21446.88788
12.44444   21567.33539
12.88889   21684.89834
13.33333   21802.18405
13.77778   21919.45778
14.22222   22038.45374
14.66667   22147.32614
15.11111   22251.21970
15.55556   22356.28582
16.00000   22462.24344
16.44444   22569.66701
16.88889   22672.46425
17.33333   22774.02808
17.77778   22871.67743
18.22222   22968.88165
18.66667   23067.54741
19.11111   23164.65317
19.55556   23262.40563
20.00000   23362.58625
Name: MSE, dtype: float64

In [20]:
lasso_stan_cont_scale= score_log_lasso(np.linspace(0.00020,0.0005,19))
lasso_stan_cont_scale.groupby('Alpha').mean()['MSE']

Alpha
2.00000   19226.78744
2.16667   19241.13947
2.33333   19256.51914
2.50000   19269.55664
2.66667   19288.64811
2.83333   19311.69075
3.00000   19335.12753
3.16667   19350.55897
3.33333   19366.39091
3.50000   19384.15681
3.66667   19404.46402
3.83333   19426.14474
4.00000   19444.67019
4.16667   19466.75202
4.33333   19492.16993
4.50000   19518.25198
4.66667   19547.41807
4.83333   19578.91566
5.00000   19609.59360
Name: MSE, dtype: float64

In [21]:
lasso_cont_scale = score_log_lasso(np.linspace(0.00015,0.00020,19))
lasso_cont_scale.groupby('Alpha').mean()['MSE']

Alpha
1.50000   19250.39450
1.52778   19246.45712
1.55556   19243.07292
1.58333   19239.85664
1.61111   19237.19642
1.63889   19235.12941
1.66667   19233.90384
1.69444   19232.84314
1.72222   19231.45439
1.75000   19230.12711
1.77778   19228.54712
1.80556   19227.32373
1.83333   19226.27623
1.86111   19225.39594
1.88889   19225.05727
1.91667   19224.85767
1.94444   19224.83824
1.97222   19225.38939
2.00000   19226.78744
Name: MSE, dtype: float64

In [22]:
lasso_min_max_all_scale = score_log_lasso(np.linspace(0.00017,0.00023,19))
lasso_min_max_all_scale.groupby('Alpha').mean()['MSE']

Alpha
1.70000   19232.52483
1.73333   19230.90752
1.76667   19229.16335
1.80000   19227.50400
1.83333   19226.27623
1.86667   19225.31057
1.90000   19224.98406
1.93333   19224.80795
1.96667   19225.08829
2.00000   19226.78744
2.03333   19228.86661
2.06667   19231.64113
2.10000   19234.80494
2.13333   19237.89181
2.16667   19241.13947
2.20000   19244.57210
2.23333   19248.07311
2.26667   19251.88920
2.30000   19254.83333
Name: MSE, dtype: float64

In [23]:
from sklearn.model_selection import train_test_split
random_state =28
# min_max =  MinMaxScaler()
# housing[continuous] = min_max.fit_transform(housing[continuous])

X_train, X_test, y_train, y_test = train_test_split(housing.iloc[:,:-1], housing.iloc[:,-1],
                                                    test_size=0.25, random_state=random_state)

lasso = Lasso(alpha=0.00019, random_state=random_state)
lasso.fit(X_train, np.log(y_train))
# ridge = Ridge(alpha=1.6, random_state=random_state)
# ridge.fit(X_train, y_train)

# print('Lasso Test Score: {}'.format(lasso.score(X_test,y_test)))

y_pred = list(map(lambda x: exp(x), lasso.predict(X_test)))
print('Lasso MSE: {}'.format(mean_squared_error(y_test,y_pred)**0.5))

# print('Ridge Test Score: {}'.format(ridge.score(X_test,y_test)))
# print('Ridge MSE: {}'.format(mean_squared_error(y_test,ridge.predict(X_test))**0.5))

Lasso(alpha=0.00019, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=28,
      selection='cyclic', tol=0.0001, warm_start=False)

Lasso MSE: 18514.30873975893


In [24]:
drop_idx=[]
for k in range(len(lasso.coef_)):
#     print(k)
    if (lasso.coef_[k] ==0):
        drop_idx.append(k)

In [25]:
columns_to_drop = []
for i in drop_idx:
    columns_to_drop.append(housing.columns[i])

In [26]:
columns_to_drop

['MasVnrArea',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtHalfBath',
 'BedroomAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'PoolArea',
 'MiscVal',
 'MSSubClass_40',
 'MSSubClass_45',
 'MSSubClass_60',
 'MSSubClass_75',
 'MSSubClass_85',
 'MSSubClass_180',
 'Street_Pave',
 'Alley_Pave',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_HLS',
 'LandContour_Lvl',
 'LotConfig_FR3',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_NWAmes',
 'Neighborhood_SWISU',
 'Neighborhood_Sawyer',
 'Neighborhood_Timber',
 'Condition1_PosA',
 'Condition1_RRAe',
 'Condition1_RRNe',
 'Condition1_RRNn',
 'Condition2_Feedr',
 'Condition2_PosA',
 'Condition2_PosN',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'BldgType_2fmCon',
 'HouseStyle_1.5Unf',
 'HouseStyle_1Story',
 'HouseStyle_SFoyer',
 'HouseStyle_SLvl',
 'RoofStyle_Gable',
 'RoofStyle_Shed',
 'RoofMatl_CompShg',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'RoofMatl_WdShake',
 'RoofMatl_WdShngl',
 'Exterior1st_AsphShn

In [27]:
housing.drop(columns=columns_to_drop, inplace=True)

In [28]:
housing.shape

(1444, 167)

In [32]:
df = housing

In [41]:
# random_state=
k6 = KFold(n_splits=6, shuffle=True, random_state=random_state)
train = []
test = []
for train_idx, val_idx in k6.split(housing):
        train.append(train_idx)
        test.append(val_idx)

def ridge_tuning(alphas):
    score_log = []

    for alpha in alphas:
        for random_state in range(30,91,2):

            ridge = Ridge(alpha=alpha, random_state=random_state, max_iter=10000)
            for i in range(6):
                single_log = []

                X_train = df.iloc[train[i],:-1]
                X_test = df.iloc[test[i],:-1]
                y_train = np.log(df.iloc[train[i],-1])
                y_test = df.iloc[test[i],-1]


                ridge.fit(X_train,y_train) 
                single_log.append(i)
                single_log.append(random_state)
                single_log.append(alpha)

                y_pred = list(map(lambda x: exp(x),ridge.predict(X_test)))
                single_log.append(mean_squared_error(y_test, y_pred)**0.5)

                score_log.append(single_log)

    score_df = pd.DataFrame(score_log, columns=['Fold','RandomState','Alpha','MSE'])
    return score_df


    
        
    
    
    

In [39]:
ridge1 = ridge_tuning(np.logspace(-5,1,20))
ridge1.groupby('Alpha')['MSE'].mean()

Alpha
0.01000       19349.99275
0.02069       19349.98265
0.04281       19349.96176
0.08859       19349.91854
0.18330       19349.82914
0.37927       19349.64435
0.78476       19349.26272
1.62378       19348.47618
3.35982       19346.86195
6.95193       19343.57767
14.38450      19337.01370
29.76351      19324.36365
61.58482      19301.70865
127.42750     19266.64379
263.66509     19226.34820
545.55948     19207.47306
1128.83789    19258.15517
2335.72147    19467.35736
4832.93024    20038.52389
10000.00000   21340.64480
Name: MSE, dtype: float64

In [42]:
ridge2 = ridge_tuning(np.linspace(0.4,0.6,10))
ridge2.groupby('Alpha')['MSE'].mean()

Alpha
0.40000   19210.31036
0.42222   19209.12063
0.44444   19208.23849
0.46667   19207.63861
0.48889   19207.29839
0.51111   19207.19764
0.53333   19207.31826
0.55556   19207.64399
0.57778   19208.16019
0.60000   19208.85366
Name: MSE, dtype: float64

In [52]:

random_state = 88

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1],
                                                    test_size=0.25, random_state=random_state)

# lasso = Lasso(alpha=60, random_state=random_state)
# lasso.fit(X_train, y_train)
ridge = Ridge(alpha=0.5, random_state=random_state)
ridge.fit(X_train, np.log(y_train))

# print('Lasso Test Score: {}'.format(lasso.score(X_test,y_test)))
# print('Lasso MSE: {}'.format(mean_squared_error(y_test,lasso.predict(X_test))**0.5))
print('Ridge Test Score: {}'.format(ridge.score(X_test,y_test)))
y_pred = list(map(lambda x: exp(x),ridge.predict(X_test)))
                                                    
print('Ridge MSE: {}'.format(mean_squared_error(y_test,y_pred)**0.5))

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=88, solver='auto', tol=0.001)

Ridge Test Score: -5.492475502867066
Ridge MSE: 19087.035197468405
