In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt


# Notebook-wide display settings: let pandas render long/wide frames without
# truncation and give every matplotlib figure the same compact canvas.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 200)
plt.rcParams.update({
    'figure.figsize': [10, 4],
    'figure.dpi': 100,
})

## Import data, flag the test rows, merge test & train

In [2]:
# Extra strings that should be parsed as missing values when reading the CSVs.
missing_values = ["n/a", "na", "--"]

# Read each split and tag its rows with a boolean origin flag so the two can
# be separated again after the shared preprocessing.
train = pd.read_csv("data/train.csv", na_values=missing_values).assign(test_data=False)
test = pd.read_csv("data/test.csv", na_values=missing_values).assign(test_data=True)

# Stack train on top of test and give the result a fresh 0..n-1 index.
full_data = pd.concat([train, test], ignore_index=True)

In [3]:
# Preview the first rows of the combined train + test frame.
full_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,test_data
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500.0,False
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500.0,False
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500.0,False
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000.0,False
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000.0,False


## Drop useless columns, bad row

In [4]:
from src.preprocess import clean

# Columns removed outright from the modelling frame.
drops = ['PoolQC', 'MiscFeature', 'FireplaceQu', 'Id', 'Utilities']

# Drop rows whose `Electrical` value is missing, but only from the training
# portion. Test rows must never be removed here: each of them needs a
# prediction, so a missing value in the test split is left for the
# null-filling step instead of shortening the submission.
elec_na = full_data["Electrical"].isna() & ~full_data["test_data"]
full_data.drop(elec_na.loc[elec_na].index, inplace=True)

# NOTE(review): `clean` is a project helper; it is assumed to return the frame
# without the columns named in `drop_list` -- confirm in src.preprocess.
full_data = clean(full_data, drop_list=drops)

In [5]:
# Preview the frame after the column and row drops.
full_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,test_data
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,0,2,2008,WD,Normal,208500.0,False
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,0,5,2007,WD,Normal,181500.0,False
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,0,9,2008,WD,Normal,223500.0,False
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,0,2,2006,WD,Abnorml,140000.0,False
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,0,12,2008,WD,Normal,250000.0,False


## Match null count of sibling columns

In [6]:
from src.preprocess import null_match

# Columns that describe the same physical feature, grouped so that their
# missing values can be reconciled together.
sibling_groups = {
    "basement": ["BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2"],
    "garage": ["GarageFinish", "GarageYrBlt", "GarageQual", "GarageCond", "GarageType"],
    "masonry veneer": ["MasVnrType", "MasVnrArea"],
}
siblings = list(sibling_groups.values())

# NOTE(review): `null_match` is a project helper; presumably it aligns the
# null rows across each group -- confirm in src.preprocess.
full_data = null_match(full_data, siblings)

## Fill null values

In [7]:
# Create lists of variable names for each data type: integer, float and
# categorical (object).
column_dtypes = full_data.dtypes
ints = [col for col, dtype in column_dtypes.items() if dtype == "int64"]
floats = [col for col, dtype in column_dtypes.items() if dtype == "float64"]
cats = [col for col, dtype in column_dtypes.items() if dtype == "object"]

# Map each fill value to the columns it applies to.
# Do NOT use separate `0` and `0.0` keys: they compare equal and hash alike,
# so a dict literal keeps a single entry and silently discards one of the two
# column lists. All numeric columns therefore share one key.
fill_dict = {0: ints + floats, "None": cats}

full_data = clean(full_data, fill_na=fill_dict)

# Let's confirm we've removed all nulls:
full_data.isna().sum().sum()

0

In [8]:
# Preview the frame after the null-filling step.
full_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,test_data
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,0,2,2008,WD,Normal,208500.0,False
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,0,5,2007,WD,Normal,181500.0,False
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,0,9,2008,WD,Normal,223500.0,False
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,0,2,2006,WD,Abnorml,140000.0,False
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,0,12,2008,WD,Normal,250000.0,False


## Feature Engineering

In [30]:
from src.preprocess import feat_create
from src.preprocess import ordinal_create

# Quality / condition ratings handed to `ordinal_create` for numeric encoding.
# May want to add BsmtExposure
ordinal_vars = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual',
    'GarageQual', 'GarageCond',
]

# Source columns for the derived features, named for readability.
full_baths = ['BsmtFullBath', 'FullBath']
half_baths = ['BsmtHalfBath', 'HalfBath']
porch_areas = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
               '3SsnPorch', 'ScreenPorch']
floor_areas = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']

# Layout: {new column: {weight: [source columns]}}. Judging by the rendered
# output, each new column is the weighted sum of its sources (e.g. half
# baths count 0.5).
new_feats = {
    "Total_Bath": {1: full_baths, 0.5: half_baths},
    "Porch_SF": {1: porch_areas},
    "Total_SF": {1: floor_areas},
}

# Human-readable labels for the numeric MSSubClass codes, turning the column
# into a categorical one.
swap_subclass = {
    20: '1story 1946+',
    30: '1story 1946-',
    40: '1story w attic',
    45: '1halfstory unfinish',
    50: '1halfstory finish',
    60: '2story 1946+',
    70: '2story 1946-',
    75: '2halfstory',
    80: 'split multi-level',
    85: 'split foyer',
    90: 'duplex',
    120: '1story PUD 1946+',
    150: '1halfstory PUD',
    160: '2story PUD 1946+',
    180: 'PUD multilevel',
    190: '2 family conv',
}

# Add the derived columns first, then encode the ordinal ratings.
full_data = ordinal_create(feat_create(full_data, new_feats), ordinal_vars)
# Codes missing from the mapping become NaN under `.map`.
full_data['MSSubClass'] = full_data['MSSubClass'].map(swap_subclass)

# It may be useful to create the dummy variables here, before test and train
# are split, so that both dataframes end up with the same dummy columns.

In [10]:
# Training rows: everything not flagged as test data. The flag column is
# dropped and the index renumbered from zero.
final_train = (
    full_data[full_data.test_data.eq(False)]
    .drop(columns=['test_data'])
    .reset_index(drop=True)
)

# Test rows: same treatment, and the SalePrice column is removed as well.
final_test = (
    full_data[full_data.test_data.eq(True)]
    .drop(columns=['test_data', 'SalePrice'])
    .reset_index(drop=True)
)


In [11]:
# Preview the training split (still includes the SalePrice column).
final_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Total_Bath,Porch_SF,Total_SF
0,2story 1946+,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,4,3,PConc,4,3,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,5,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,4,8,Typ,0,Attchd,2003.0,RFn,2.0,548.0,3,3,Y,0,61,0,0,0,0,,0,2,2008,WD,Normal,208500.0,3.5,61.0,2566.0
1,1story 1946+,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,3,3,CBlock,4,3,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,5,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,3,6,Typ,1,Attchd,1976.0,RFn,2.0,460.0,3,3,Y,298,0,0,0,0,0,,0,5,2007,WD,Normal,181500.0,2.5,298.0,2524.0
2,2story 1946+,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,4,3,PConc,4,3,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,5,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,4,6,Typ,1,Attchd,2001.0,RFn,2.0,608.0,3,3,Y,0,42,0,0,0,0,,0,9,2008,WD,Normal,223500.0,3.5,42.0,2706.0
3,2story 1946-,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,3,3,BrkTil,3,4,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,4,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,4,7,Typ,1,Detchd,1998.0,Unf,3.0,642.0,3,3,Y,0,35,272,0,0,0,,0,2,2006,WD,Abnorml,140000.0,2.0,307.0,2473.0
4,2story 1946+,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,4,3,PConc,4,3,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,5,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,4,9,Typ,1,Attchd,2000.0,RFn,3.0,836.0,3,3,Y,192,84,0,0,0,0,,0,12,2008,WD,Normal,250000.0,3.5,276.0,3343.0


In [12]:
# Preview the test split (SalePrice column removed).
final_test.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Total_Bath,Porch_SF,Total_SF
0,1story 1946+,RH,80.0,11622,Pave,,Reg,Lvl,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,3,3,CBlock,3,3,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,3,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,3,5,Typ,0,Attchd,1961.0,Unf,1.0,730.0,3,3,Y,140,0,0,0,120,0,MnPrv,0,6,2010,WD,Normal,1.0,260.0,1778.0
1,1story 1946+,RL,81.0,14267,Pave,,IR1,Lvl,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,3,3,CBlock,3,3,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,3,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,4,6,Typ,0,Attchd,1958.0,Unf,1.0,312.0,3,3,Y,393,36,0,0,0,0,,12500,6,2010,WD,Normal,1.5,429.0,2658.0
2,2story 1946+,RL,74.0,13830,Pave,,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,3,3,PConc,4,3,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,4,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,3,6,Typ,1,Attchd,1997.0,Fin,2.0,482.0,3,3,Y,212,34,0,0,0,0,MnPrv,0,3,2010,WD,Normal,2.5,246.0,2557.0
3,2story 1946+,RL,78.0,9978,Pave,,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,3,3,PConc,3,3,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,5,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,4,7,Typ,1,Attchd,1998.0,Fin,2.0,470.0,3,3,Y,360,36,0,0,0,0,,0,6,2010,WD,Normal,2.5,396.0,2530.0
4,1story PUD 1946+,RL,43.0,5005,Pave,,IR1,HLS,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,4,3,PConc,4,3,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,5,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,4,5,Typ,0,Attchd,1992.0,RFn,2.0,506.0,3,3,Y,0,82,0,0,144,0,,0,1,2010,WD,Normal,2.0,226.0,2560.0


## Drop highly correlated 'sibling' variables

In [13]:
# drop_list = ['TotRmsAbvGrd', 'GarageArea', 'FullBath', 
#              'HalfBath','BsmtHalfBath', 'BsmtFullBath',
#              'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
#              'BsmtUnfSF', 'BsmtFinSF1', 'BsmtFinSF2',
#              'LowQualFinSF','WoodDeckSF', 'OpenPorchSF',
#              'EnclosedPorch','3SsnPorch', 'ScreenPorch' ]

# final_feats = clean(feat_eng, drop_list=drop_list)

## Drop Outliers

In [14]:
# sns.scatterplot(x=full_data.Total_SF, y=full_data.SalePrice)

In [15]:
# full_data[(full_data.Total_SF > 7000) & (full_data.SalePrice < 250000)]

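The (now commented-out) scatter plot of `Total_SF` against `SalePrice`, together with the filter `Total_SF > 7000` and `SalePrice < 250000`, singles out two very large houses that sold unusually cheaply. Hence we will drop samples 523 and 1298 (their row labels in the training set).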

## Scale (standardise) and Transform (normalise) numeric variables

In [16]:
# from src.analyse import analyse
# from src.analyse import test_trans
from src.preprocess import preprocess

# Also treat the overall quality / condition ratings as ordinal, so they are
# excluded from the scaling list built below.
ordinal_vars.extend(['OverallQual', 'OverallCond'])

# Scale every non-object column that is not an ordinal rating.
scale_feats = [
    name
    for name, dtype in final_train.dtypes.items()
    if dtype != "object" and name not in ordinal_vars
]
trans_feats = ['SalePrice', 'LotArea', 'Total_SF', 'GrLivArea', 'LotFrontage', 'GarageArea']

# Drop two massive outliers, see above
final_train = final_train.drop(index=[523, 1298])

# `pipelines` is kept so the same transforms can later be applied to the test
# split and inverted on the predictions.
final_train, pipelines = preprocess(
    final_train,
    scale_list=scale_feats,
    transform_list=trans_feats,
)

# final_train = final_feats[final_feats['test_data'] == False]
# final_test = final_feats[final_feats['test_data'] == True]
# final_target, _ = preprocess(pd.DataFrame(target), scale_list=['SalePrice'], transform_list=['SalePrice'])
# final_target = final_target['SalePrice']
# final_target = final_target.drop([523, 1298]) # Drop two massive outliers

## Drop low value variables (weakly correlated with SalePrice)

In [17]:
# Names of the uint8 columns (the dtype the dummy columns are expected to have).
dummies = [name for name, dtype in final_train.dtypes.items() if dtype == "uint8"]

# (Disabled alternative: drop dummy columns with fewer than 20 positive rows,
#  plus 'test_data' and 'MoSold'.)

# Absolute correlation of every column with the target, ascending.
correls = final_train.corrwith(final_train.SalePrice).abs().sort_values()

# Columns whose |correlation| is below 0.02 are removed. NaN correlations
# compare False here, so such columns are kept.
low_corr = correls[correls < 0.02].index.tolist()
final_train = final_train.drop(columns=low_corr)
# sns.histplot(final_target)
# final_train


In [18]:
# Separate the target from the features. Both get a fresh 0..n-1 index so
# that they stay aligned by label as well as by position.
target = final_train.loc[:, 'SalePrice'].reset_index(drop=True)
final_train.drop(columns=['SalePrice'], inplace=True)
final_train.reset_index(drop=True, inplace=True)

# One-hot encode the categorical columns of the test split.
final_test = pd.get_dummies(final_test, drop_first=True)

# Keep only the columns present in both frames, in final_train's column
# order. A set intersection is deliberately avoided: iteration order of a set
# of strings changes between Python processes (hash randomisation), which
# would reorder the feature matrix on every kernel restart and make the
# seeded models below irreproducible.
overlap = [col for col in final_train.columns if col in final_test.columns]
final_train = final_train.loc[:, overlap]
final_test = final_test.loc[:, overlap]


## Modelling

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge

# Single seed shared by every estimator for repeatable fits.
RANDOM_SEED = 42

# Each estimator is built and fitted on the full training set in one step
# (`fit` returns the estimator itself).
forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    max_features=50,
    min_samples_split=2,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=RANDOM_SEED,
).fit(final_train, target)

gboost = GradientBoostingRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    max_features=40,
    min_samples_split=12,
    min_samples_leaf=2,
    random_state=RANDOM_SEED,
).fit(final_train, target)

lasso = Lasso(
    alpha=0.0005,
    max_iter=5000,
    random_state=RANDOM_SEED,
).fit(final_train, target)

ridge = Ridge(
    alpha=7.5,
    random_state=RANDOM_SEED,
).fit(final_train, target)


def cv_rmse(model):
    """Return the per-fold RMSE of ``model`` under 10-fold cross-validation.

    The notebook-level ``final_train`` (features) and ``target`` are used as
    the data. scikit-learn reports this metric negated, so the sign is
    flipped back to give positive error values.

    Returns an array with one RMSE per fold.
    """
    neg_scores = cross_val_score(
        model,
        final_train,
        target,
        cv=10,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
    )
    return -neg_scores

# Cross-validate each model in turn (forest, boosting, lasso, ridge).
fscore, gscore, lscore, rscore = (
    cv_rmse(estimator) for estimator in (forest, gboost, lasso, ridge)
)

# Report mean RMSE with its standard deviation across folds in brackets.
report_lines = [
    ("RandomForest CV score is:   {:.4f} ({:.4f})", fscore),
    ("Gradient Boost CV score is: {:.4f} ({:.4f})", gscore),
    ("Lasso CV score is:          {:.4f} ({:.4f})", lscore),
    ("Ridge CV score is:          {:.4f} ({:.4f})", rscore),
]
for template, fold_scores in report_lines:
    print(template.format(fold_scores.mean(), fold_scores.std()))

# preds = forest.predict(X_valid)
# print(mean_squared_error(y_valid, preds, squared=False))


RandomForest CV score is:   0.3228 (0.0364)
Gradient Boost CV score is: 0.2765 (0.0419)
Lasso CV score is:          0.2672 (0.0380)
Ridge CV score is:          0.2684 (0.0360)


In [21]:
from src.preprocess import pipe_apply

# Apply the pipelines returned by `preprocess` on the training data to the
# test features.
# NOTE(review): presumably 'forward' means scale/transform with the already
# fitted pipelines -- confirm in src.preprocess.pipe_apply.
pipe_test = pipe_apply(final_test, pipelines, direction='forward')


In [22]:
# Predict with the fitted Lasso model and store the result as a new column.
# Caution: this cell is not re-runnable as-is -- after one run `pipe_test`
# already carries 'SalePrice', so a second `predict` would receive an extra
# column.
pipe_test['SalePrice'] = lasso.predict(pipe_test)
# Run the pipelines in the 'inverse' direction on the frame.
# NOTE(review): presumably this maps the scaled/transformed SalePrice back to
# the original price units -- confirm in src.preprocess.pipe_apply.
submission = pipe_apply(pipe_test, pipelines, direction='inverse')

0       118377.148024
1       159210.974128
2       182074.019254
3       199957.366287
4       196580.820379
            ...      
1454     87389.384217
1455     78493.130023
1456    165185.947923
1457    121307.371383
1458    218074.459894
Name: SalePrice, Length: 1459, dtype: float64

In [29]:
# Display the predicted prices rounded down to whole numbers; `submission`
# itself is not modified.
np.floor(submission.SalePrice)

0       118377.0
1       159210.0
2       182074.0
3       199957.0
4       196580.0
          ...   
1454     87389.0
1455     78493.0
1456    165185.0
1457    121307.0
1458    218074.0
Name: SalePrice, Length: 1459, dtype: float64