# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from math import exp

from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Widen pandas' display limits so the wide, one-hot-encoded housing frames can
# be inspected without truncation, and print floats with five decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 300
pd.options.display.width = 1000
pd.options.display.float_format = lambda value: '%.5f' % value

In [2]:
from jj_imputer import HousingImpute
from jj_dummification import *

# Imputation

In [3]:
# Load the training data into the project's imputation helper (from jj_imputer).
housing = HousingImpute('train.csv')
# Run the imputers defined on the helper; their logic lives in jj_imputer, not here.
housing.run_imputers()
# Presumably reports the values still missing after imputation (see the output
# below); the two leftovers it lists are filled in by hand in the next cell.
housing.left_to_impute()

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406
MasVnrArea imputer embedded in another imputer
BsmtCond imputer embedded in another imputer
BsmtExposure imputer embedded in another imputer
BsmtFinType1 imputer embedded in another imputer
BsmtFinType2 imputer embedded in another imputer
GarageYrBlt imputer embedded in another imputer
GarageFinish imputer embedded in another imputer
GarageQual imputer embedded in another imputer
GarageCond imputer embedded in another imputer
BsmtExposure 1
--------------------
Id
949    NaN
Name: BsmtExposure, dtype: object
--------------------
BsmtFinType2 1
--------------------
Id
333    NaN
Name: BsmtFinType2, dtype: object
--------------------


Index 949 has a NaN BsmtExposure but a recorded basement condition/quality with 0 finished area, so it will be imputed as 'No'. <br>
Index 333 is imputed as 'Rec' via the median value, since its BsmtFinType2 value is missing completely at random.

In [4]:
# Fill by hand the two values the imputers left as NaN (rows are addressed by Id label).
# Id 949: basement exists but has no finished area, so exposure is set to 'No'.
housing.df.loc[949, 'BsmtExposure'] = 'No'
# Id 333: second basement finish type is set to 'Rec'.
housing.df.loc[333, 'BsmtFinType2'] = 'Rec'

In [5]:
# Final check for missing values, now that the two manual fixes above are applied.
housing.left_to_impute()

In [6]:
# Drop Utilities: it is near-constant (1459 of the 1460 rows share the same value).
# Drop TotalBsmtSF to avoid multicollinearity: it is the sum of the 3 other BsmtSF columns.
housing.df.drop(columns=['Utilities', 'TotalBsmtSF'], inplace=True)

# Correct the spelling so Exterior1st and Exterior2nd use the same set of values.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a `.loc` selection operates on a temporary object and is not guaranteed to
# modify housing.df (pandas warns about this pattern and Copy-on-Write ignores it).
housing.df['Exterior2nd'] = housing.df['Exterior2nd'].replace('CmentBd', 'CemntBd')

# Dummification

In [7]:
# Hand the imputed frame to the categorical-encoding helper (from jj_dummification).
# Note this assigns the same DataFrame object, not a copy.
new_housing = HousingCategorical()
new_housing.df = housing.df
# Treat the helper's label-encode features as one-hot features too, by building a
# combined list and assigning it on this instance.
new_housing.ohe_features = new_housing.ohe_features + new_housing.label_encode_features

# new_housing.list_checker()
new_housing.one_hot_encode()
# Expected shape after encoding is shown in the cell output.
new_housing.df.shape

(1460, 331)

## Outliers

In [8]:
# Jimmy's outliers, picked by eye test; kept here manually for easy future reference.
# Maps a house Id to the feature (or list of features) in which it looks like an outlier.
final_outlier_id = {
    935: 'LotFrontage',
    1299: ['LotFrontage', 'BsmtFinSF1', '1stFlrSF', 'GrLivArea', 'GarageArea'],
    250: 'LotArea',
    314: 'LotArea',
    336: 'LotArea',
    707: 'LotArea',
    298: 'MasVnrArea',
    1170: 'MasVnrArea',
    186: 'LowQualFinSF',
    524: 'GrLivArea',
    582: 'GarageArea',
    1062: 'GarageArea',
    1191: 'GarageArea',
    54: 'WoodDeckSF',
    496: 'OpenPorchSF',
    1183: 'PoolArea',
}

# Only the Ids (the dict keys) are used for filtering: drop those rows.
final_df = new_housing.df.drop(index=list(final_outlier_id))

## Scaling

In [9]:
# Scale every predictor to the [0, 1] range; the target (SalePrice) is left untouched.
# NOTE(review): the scaler is fit on all rows, i.e. before the hold-out split made
# further down, so the hold-out RMSE uses scaling statistics that include hold-out rows.
min_max = MinMaxScaler()
feature_cols = final_df.columns.drop('SalePrice')
# Assign by column label so pandas replaces the columns with the new float arrays.
# A positional in-place write (`iloc[:, :-1] = ...`) would instead try to store
# floats into the existing integer columns and rely on silent dtype upcasting.
final_df[feature_cols] = min_max.fit_transform(final_df[feature_cols])

# Feature Selection
### Via Lasso

In [10]:
# Hard-coded list of columns removed from the model matrix. Per the section heading
# these were chosen via Lasso; the selection run itself is not part of this notebook.
columns_to_drop = ['MasVnrArea','2ndFlrSF', 'LowQualFinSF', 'BsmtHalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
                   'PoolArea', 'MiscVal', 'MSSubClass_40', 'MSSubClass_45', 'MSSubClass_60', 'MSSubClass_75', 'MSSubClass_85',
                   'MSSubClass_180', 'Street_Pave', 'Alley_Pave', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_HLS', 
                   'LandContour_Lvl', 'LotConfig_FR3', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_NWAmes',
                   'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_Timber', 'Condition1_PosA', 'Condition1_RRAe',
                   'Condition1_RRNe', 'Condition1_RRNn', 'Condition2_Feedr', 'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe',
                   'Condition2_RRAn', 'Condition2_RRNn', 'BldgType_2fmCon', 'HouseStyle_1.5Unf', 'HouseStyle_1Story',
                   'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'RoofStyle_Gable', 'RoofStyle_Shed', 'RoofMatl_CompShg', 'RoofMatl_Membran',
                   'RoofMatl_Metal', 'RoofMatl_Roll', 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'Exterior1st_AsphShn', 'Exterior1st_CBlock',
                   'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_Plywood', 'Exterior1st_Stone',
                   'Exterior1st_VinylSd', 'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_CBlock', 'Exterior2nd_ImStucc',
                   'Exterior2nd_MetalSd', 'Exterior2nd_Other', 'Exterior2nd_Plywood', 'Exterior2nd_Stone', 'Foundation_Slab', 'Foundation_Stone',
                   'BsmtFinType1_BLQ', 'BsmtFinType1_Rec', 'BsmtFinType2_LwQ', 'BsmtFinType2_Unf', 'Heating_GasA', 'Heating_OthW',
                   'Heating_Wall', 'Electrical_FuseF', 'Electrical_FuseP', 'Electrical_Mix', 'Functional_Min2', 'Functional_Sev',
                   'GarageType_Basment', 'GarageType_No_G', 'PavedDrive_P', 'Fence_GdWo', 'Fence_MnPrv', 'Fence_MnWw', 'MiscFeature_No_MF',
                   'MiscFeature_Othr', 'MiscFeature_TenC', 'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_Oth',
                   'SaleCondition_AdjLand', 'SaleCondition_Partial', 'MoSold_3', 'MoSold_11', 'ExterQual_Fa', 'ExterCond_Gd', 'ExterCond_Po',
                   'BsmtQual_Fa', 'BsmtCond_Po', 'BsmtExposure_No', 'HeatingQC_Po', 'FireplaceQu_TA', 'GarageFinish_No_G', 'GarageQual_Gd',
                   'GarageQual_No_G', 'GarageQual_Po', 'GarageQual_TA', 'GarageCond_Gd', 'GarageCond_No_G', 'GarageCond_Po','PoolQC_Fa','PoolQC_Gd']

In [11]:
# Remove the de-selected features from final_df in place.
# NOTE: not idempotent - re-running this cell raises KeyError because the columns are already gone.
final_df.drop(columns=columns_to_drop, inplace=True)

In [12]:
# Sanity check on the modelling frame: (rows left after outlier removal, kept features + SalePrice).
final_df.shape

(1444, 196)

# Optimized Hyperparameter

In [13]:
random_state = 88

# Hold out 25% of the rows for evaluation. Predictors and target are selected by
# name rather than by position, so the split does not depend on SalePrice being
# the last column of final_df.
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop(columns='SalePrice'),
    final_df['SalePrice'],
    test_size=0.25,
    random_state=random_state,
)

# The model is fit on log(SalePrice).
ridge = Ridge(alpha=0.5, random_state=random_state, max_iter=10000)
ridge.fit(X_train, np.log(y_train))

# Map predictions back from log-space to prices in one vectorised step, then
# report the RMSE on the original price scale.
y_pred = np.exp(ridge.predict(X_test))
print('RMSE: {}'.format(mean_squared_error(y_test, y_pred) ** 0.5))

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=10000,
      normalize=False, random_state=88, solver='auto', tol=0.001)

RMSE: 19088.95895126684


# True Test Set
### Processing

In [14]:
# --- Load the test set and mirror the clean-up applied to the training data ---
test_housing = HousingImpute('imputed_test.csv')
test_dummies = HousingCategorical()
test_dummies.df = test_housing.df
test_dummies.mode = 'Test'
test_dummies.df.drop(columns=['Utilities', 'TotalBsmtSF'], inplace=True)
# Assign the corrected column back: replace(..., inplace=True) on a `.loc`
# selection acts on a temporary object and may leave the frame unchanged.
test_dummies.df['Exterior2nd'] = test_dummies.df['Exterior2nd'].replace('CmentBd', 'CemntBd')

# GarageQual is encoded by hand below, so keep it out of the helper's one-hot step.
test_dummies.ohe_features.remove('GarageQual')
test_dummies.one_hot_encode()

# --- Manual one-hot encoding of GarageQual ---
# OHE for feature GarageQual keeps dropping Fa as first element but the final model
# needs feature GarageQual_Fa. Passing the category order explicitly makes
# drop='first' discard 'TA' instead, without patching the fitted encoder afterwards.
from sklearn.preprocessing import OneHotEncoder
garagequal_levels = ['TA', 'No_G', 'Po', 'Gd', 'Fa']
garagequal_encoder = OneHotEncoder(categories=[garagequal_levels], drop='first', sparse=False)
garage_array = garagequal_encoder.fit_transform(test_dummies.df[['GarageQual']].values)

# Label the encoded columns in category order (minus the dropped first level) and
# reuse the frame's own index instead of a hard-coded Id range.
garage_encoded = pd.DataFrame(
    garage_array,
    index=test_dummies.df.index,
    columns=['GarageQual_' + level for level in garagequal_levels[1:]],
)
test_dummies.df = test_dummies.df.drop(columns='GarageQual').join(garage_encoded)

test_dummies.df.shape

# --- Align the test columns with the features the model was trained on ---
# HouseStyle_2.5Fin is required by the model; it is added here as all zeros
# (presumably this level never occurs in the test data - confirm).
test_dummies.df['HouseStyle_2.5Fin'] = 0
final_features = list(final_df.columns)
final_features.remove('SalePrice') #no target variable in test set

final_test_df = test_dummies.df[final_features]

# --- Scale the test features with the TRAINING statistics ---
# Re-fitting the scaler on the test set would map each column using the test
# set's own min/max, which does not match the scaling the model was trained on.
# MinMaxScaler works column by column, so fitting on the raw (unscaled) training
# rows for just the selected features reproduces the training-time scaling.
train_features_raw = new_housing.df.drop(index=list(final_outlier_id))[final_features]
test_scaler = MinMaxScaler().fit(train_features_raw)
final_test_input = test_scaler.transform(final_test_df)
final_test_df.shape

OneHotEncoder(categorical_features=None, categories=None, drop='first',
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

(1459, 267)

(1459, 195)

# Final Model

In [15]:
# Refit the tuned ridge model on every training row (no hold-out) against log(SalePrice).
# Predictors/target are selected by name so this does not rely on column position.
final_ridge = Ridge(alpha=0.5, random_state=88, max_iter=10000)
final_ridge.fit(final_df.drop(columns='SalePrice'), np.log(final_df['SalePrice']))
# Predict the test set and map the log-space predictions back to prices (vectorised).
final_pred = np.exp(final_ridge.predict(final_test_input))


Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=10000,
      normalize=False, random_state=88, solver='auto', tol=0.001)

## Predictions

In [16]:
# Put the predictions in a one-column frame that shares the test set's index.
answer_df = pd.DataFrame({'SalePrice': final_pred}, index=final_test_df.index)

In [17]:
# Write the predictions to disk; the frame's index is written as the first column.
answer_df.to_csv('SalePrice.csv')