In [140]:
import csv
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm, skew
from scipy.special import boxcox1p
from sklearn import linear_model
from sklearn.metrics import mean_squared_error


In [124]:
train = pd.read_csv('data/train.csv')
train.drop("Id", axis = 1, inplace = True)

In [125]:
train.shape

(1460, 80)

In [126]:
list(train)

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'Enc

In [127]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
FireplaceQu     0.472603
LotFrontage     0.177397
GarageYrBlt     0.055479
GarageType      0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
BsmtFinType2    0.026027
BsmtExposure    0.026027
BsmtFinType1    0.025342
BsmtCond        0.025342
BsmtQual        0.025342
MasVnrArea      0.005479
MasVnrType      0.005479
Electrical      0.000685
dtype: float64


NA handling

In [128]:
fill_nones = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu','GarageYrBlt', 
              'GarageType', 'GarageFinish','GarageQual', 'GarageCond', 'BsmtFinType2', 
              'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual','MasVnrType']
for fill_none in fill_nones:
    train[fill_none]=train[fill_none].fillna('None')
train['LotFrontage'] = train.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train = train.dropna(axis=0, how='any')

In [129]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

Series([], dtype: float64)


Categorical Data handling

In [130]:
num_to_str = ["MSSubClass","MSZoning", "BsmtFullBath","BsmtHalfBath",
              "HalfBath","BedroomAbvGr","KitchenAbvGr", "Utilities", "LotConfig",
             "Neighborhood","Condition1", "Condition2", "BldgType", "HouseStyle",
             "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
             "Foundation", "Heating", "Electrical"]
for col in num_to_str:
    train[col]=train[col].astype(str)

encode label for categorical data

In [131]:
col_cats = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir','OverallCond', 
        'YrSold', 'MoSold']

for col_cat in col_cats:
    lab = LabelEncoder()
    train[col_cat]=lab.fit_transform(train[col_cat])
train.shape

(1459, 80)

create feature

In [123]:
train['GarageAreaperCar'] = train['GarageArea']/train['GarageCars']
train[train['GarageCars']==0]['GarageAreaperCar'] = 0 
train['TotalArea'] = train['LotArea'] + train['MasVnrArea'] + \
                    train['TotalBsmtSF'] + train['GrLivArea'] + \
                    train['GarageArea'] + train['WoodDeckSF'] +  \
                    train['OpenPorchSF'] + train['EnclosedPorch'] + \
                    train['PoolArea'] + train['3SsnPorch']

create dummy variables for categorical data

In [132]:
numeric_feats = train.dtypes[train.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness


Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,24.443278
PoolArea,14.807992
LotArea,12.190881
3SsnPorch,10.290132
LowQualFinSF,8.998885
LandSlope,4.806884
BsmtFinSF2,4.249219
ScreenPorch,4.116334
EnclosedPorch,3.085342
MasVnrArea,2.673798


In [133]:
skewness = skewness[abs(skewness) > 1]

skewed_features = skewness.index
for feat in skewed_features:
    train[feat] = boxcox1p(train[feat], 0.15)

In [134]:
numeric_feats = train.dtypes[train.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness


Skew in numerical features: 



Unnamed: 0,Skew
PoolArea,14.358427
3SsnPorch,7.825015
LowQualFinSF,7.590566
MiscVal,5.503462
LandSlope,4.34414
ScreenPorch,3.183036
BsmtFinSF2,2.618092
EnclosedPorch,2.159803
LotArea,0.751906
MasVnrArea,0.603135


In [135]:
train = pd.get_dummies(train)
train.shape

(1459, 351)

In [136]:
train.corr()['SalePrice'].sort_values()

BsmtQual                -0.591244
ExterQual               -0.559220
KitchenQual             -0.524632
HeatingQC               -0.444778
GarageFinish            -0.419237
MasVnrType_None         -0.388186
GarageType_Detchd       -0.387858
Foundation_CBlock       -0.342857
MSZoning_RM             -0.341641
HalfBath_0              -0.326407
MSSubClass_30           -0.316530
GarageYrBlt_None        -0.310714
GarageType_None         -0.310714
BsmtExposure            -0.276055
LotShape                -0.271459
Foundation_BrkTil       -0.246113
BsmtFullBath_0          -0.239891
Electrical_FuseA        -0.233358
Neighborhood_OldTown    -0.228358
Neighborhood_IDOTRR     -0.222335
SaleType_WD             -0.218273
Neighborhood_Edwards    -0.208832
BedroomAbvGr_2          -0.206034
EnclosedPorch           -0.199250
RoofStyle_Gable         -0.195406
Exterior1st_Wd Sdng     -0.185645
Exterior2nd_Wd Sdng     -0.181814
HouseStyle_1.5Fin       -0.180356
Exterior1st_MetalSd     -0.174603
MSZoning_C (al

In [137]:
train_sample, test_sample = train_test_split(train, test_size=0.3, random_state = 1)
X_train = train_sample.drop(['SalePrice'], axis=1)
X_test = test_sample.drop(['SalePrice'], axis=1)
y_train = train_sample['SalePrice']
y_test = test_sample['SalePrice']

In [138]:
missing_data = (train.isnull().sum()/len(train))
missing_data = missing_data.drop(missing_data[missing_data==0].index).sort_values(ascending=False)
print(missing_data)

Series([], dtype: float64)


In [147]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

In [143]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [145]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))


1.0331500391893367

In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
f, ax = plt.subplots(figsize=(15,12))
train_corr = train.corr()['SalePrice'].sort_values()
train_corr.plot.bar()
plt.show()
train_corr

In [None]:
attribute_of_interest = train_corr[-15:].index
train_subset = train_split_train[attribute_of_interest]
list(attribute_of_interest)


In [None]:
f, ax = plt.subplots(figsize=(15,12))
sns.heatmap(train_subset.corr(), vmax=.8, square=True, annot=True)

In [None]:
remove_features = ['TotalBsmtSF', 'GarageCars', 'GarageYrBlt']
train_subset = train_subset.drop(remove_features, axis=1)
attribute_of_interest = [feature for feature in attribute_of_interest if feature not in remove_features]

In [None]:
train_subset_filtered = train_subset.dropna()
train_subset_filtered = train_subset_filtered[train_subset_filtered['SalePrice']<600000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['BsmtFinSF1']<2000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['TotRmsAbvGrd']<13]
train_subset_filtered = train_subset_filtered[train_subset_filtered['1stFlrSF']<3000]
train_subset_filtered = train_subset_filtered[train_subset_filtered['MasVnrArea']<1200]

In [None]:
fig, axs = plt.subplots(ncols=1, nrows=len(attribute_of_interest)-1, figsize=(10, 100))
for i,index in enumerate(attribute_of_interest):
    if index!='SalePrice':
        if index == 'OverallQual':
            plt.xlim(0, 12)
        sns.regplot(train_subset_filtered[index], train_subset_filtered['SalePrice'], fit_reg=False,ax=axs[i])


In [None]:
X_train = train_subset_filtered.drop(['SalePrice'], axis=1)
y_train = train_subset_filtered['SalePrice']