In [216]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from termcolor import colored
import missingno as msno

%matplotlib inline

COLOR = 'white'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_rows', 500)

#  Basic Settings


In [217]:
def encode_cat_vars(x):
    # turning categorical values to numerical ones so that the machine can understand
    x = pd.get_dummies(
        x,
        columns=x.select_dtypes(
            include=["object", "category"]).columns.tolist(),
        drop_first=True,
    )
    return x

# turning certain skewed categorical values to their log values for less skewness and better scaling
def perform_log_transform(df, col_log):
    """#Perform Log Transformation of dataframe , and list of columns """
    for colname in col_log:
        df[colname + '_log'] = np.log(df[colname])
    df.drop(col_log, axis=1, inplace=True)
    return df

# Cleaning function for the data
def clean(df):

    df["MasVnrType"] = df["MasVnrType"].fillna('None')
    df["MasVnrArea"] = df["MasVnrArea"].fillna(0.0)
    df["Alley"] = df["Alley"].fillna('None')
    df["PoolQC"] = df["PoolQC"].fillna('None')
    df["Fence"] = df["Fence"].fillna('None')
    df["MiscFeature"] = df["MiscFeature"].fillna('None')
    df["FireplaceQu"] = df["FireplaceQu"].fillna('None')
    upperlimit = np.percentile(df.TotalBsmtSF.values, 99.5)
    df['TotalBsmtSF'].loc[df['TotalBsmtSF'] > upperlimit] = upperlimit
    basement_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure',
                     'BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2']
    for col in basement_cols:
        if 'FinSF' not in col:
            df[col] = df[col].fillna('None')
    # GarageArea has got some outliers lets remove them.
    upperlimit = np.percentile(df.GarageArea.values, 99.5)
    df['GarageArea'].loc[df['GarageArea'] > upperlimit] = upperlimit

    garage_cols = ['GarageType', 'GarageQual', 'GarageCond',
                   'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea']
    for col in garage_cols:
        if df[col].dtype == np.object:
            df[col] = df[col].fillna('None')
        else:
            df[col] = df[col].fillna(0)
    print(colored("All the colors have been cleaned", 'green'))
    # dealing with the LotArea SQRT
    df['SqrtLotArea'] = np.sqrt(df['LotArea'])
    df.drop(['LotArea'], axis='columns', inplace=True)

    filter = df['LotFrontage'].isnull()
    df.LotFrontage[filter] = df.SqrtLotArea[filter]

    #  making a list of categorical columns
    catCol = []
    for i in df.columns.to_list():
        dataTypeObj = df.dtypes[i]
        if (dataTypeObj == "object"):
            catCol.append(i)

    #  hitting them with the log tranformation
    log_col = ['LotFrontage', 'MasVnrArea', 'BsmtUnfSF',
               '1stFlrSF', 'OpenPorchSF', 'MiscVal', 'SqrtLotArea']
    perform_log_transform(df, log_col)
    print(colored("Log Transformation Complete", 'green'))
    df.tail(10)
    df = df.replace([np.NINF], 0)
    df = encode_cat_vars(df)
    print(colored("All values encoded", 'green'))
    return df


In [218]:
#  Reading the csv file
house = pd.read_csv("train.csv")
test_df = pd.read_csv('test.csv')


In [219]:
# here the house has the saleprice with it, which we will have to turn into SalePrice_log for an accurate model
house


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.00,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.00,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.00,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.00,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.00,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.00,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.00,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.00,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.00,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.00,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.00,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.00,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.00,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.00,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.00,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.00,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.00,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.00,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.00,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.00,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.00,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.00,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.00,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.00,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.00,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.00,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.00,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [220]:
# here the test_df does not have a saleprice column but has all the other necessary training columns identitcal to the last one
test_df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.00,TA,TA,CBlock,TA,TA,No,Rec,468.00,LwQ,144.00,270.00,882.00,GasA,TA,Y,SBrkr,896,0,0,896,0.00,0.00,1,0,2,1,TA,5,Typ,0,,Attchd,1961.00,Unf,1.00,730.00,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.00,TA,TA,CBlock,TA,TA,No,ALQ,923.00,Unf,0.00,406.00,1329.00,GasA,TA,Y,SBrkr,1329,0,0,1329,0.00,0.00,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.00,Unf,1.00,312.00,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.00,TA,TA,PConc,Gd,TA,No,GLQ,791.00,Unf,0.00,137.00,928.00,GasA,Gd,Y,SBrkr,928,701,0,1629,0.00,0.00,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.00,Fin,2.00,482.00,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.00,TA,TA,PConc,TA,TA,No,GLQ,602.00,Unf,0.00,324.00,926.00,GasA,Ex,Y,SBrkr,926,678,0,1604,0.00,0.00,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.00,Fin,2.00,470.00,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.00,Gd,TA,PConc,Gd,TA,No,ALQ,263.00,Unf,0.00,1017.00,1280.00,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.00,0.00,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.00,RFn,2.00,506.00,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.00,TA,TA,CBlock,TA,TA,No,Unf,0.00,Unf,0.00,546.00,546.00,GasA,Gd,Y,SBrkr,546,546,0,1092,0.00,0.00,1,1,3,1,TA,5,Typ,0,,,,,0.00,0.00,,,Y,0,0,0,0,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.00,TA,TA,CBlock,TA,TA,No,Rec,252.00,Unf,0.00,294.00,546.00,GasA,TA,Y,SBrkr,546,546,0,1092,0.00,0.00,1,1,3,1,TA,6,Typ,0,,CarPort,1970.00,Unf,1.00,286.00,TA,TA,Y,0,24,0,0,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1960,1996,Gable,CompShg,VinylSd,VinylSd,,0.00,TA,TA,CBlock,TA,TA,No,ALQ,1224.00,Unf,0.00,0.00,1224.00,GasA,Ex,Y,SBrkr,1224,0,0,1224,1.00,0.00,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.00,Unf,2.00,576.00,TA,TA,Y,474,0,0,0,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1992,1992,Gable,CompShg,HdBoard,Wd Shng,,0.00,TA,TA,PConc,Gd,TA,Av,GLQ,337.00,Unf,0.00,575.00,912.00,GasA,TA,Y,SBrkr,970,0,0,970,0.00,1.00,1,0,3,1,TA,6,Typ,0,,,,,0.00,0.00,,,Y,80,32,0,0,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [221]:
house = clean(house)

[32mAll the colors have been cleaned[0m
[32mLog Transformation Complete[0m
[32mAll values encoded[0m


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[col].dtype == np.object:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.LotFrontage[filter] = df.SqrtLotArea[filter]
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [222]:
test_df = clean(test_df)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if df[col].dtype == np.object:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.LotFrontage[filter] = df.SqrtLotArea[filter]


[32mAll the colors have been cleaned[0m
[32mLog Transformation Complete[0m
[32mAll values encoded[0m


In [223]:
y = house['SalePrice']
house


Unnamed: 0,Id,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SalePrice,LotFrontage_log,MasVnrArea_log,BsmtUnfSF_log,1stFlrSF_log,OpenPorchSF_log,MiscVal_log,SqrtLotArea_log,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_None,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_None,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,PoolQC_Fa,PoolQC_Gd,PoolQC_None,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,7,5,2003,2003,706,0,856.00,854,0,1710,1,0,2,1,3,1,8,0,2003.00,2,548.00,0,0,0,0,0,2,2008,208500,4.17,5.28,5.01,6.75,4.11,0.00,4.52,0,0,1,0,1,1,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,2,20,6,8,1976,1976,978,0,1262.00,0,0,1262,0,1,2,0,3,1,6,1,1976.00,2,460.00,298,0,0,0,0,5,2007,181500,4.38,0.00,5.65,7.14,0.00,0.00,4.58,0,0,1,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,3,60,7,5,2001,2002,486,0,920.00,866,0,1786,1,0,2,1,3,1,6,1,2001.00,2,608.00,0,0,0,0,0,9,2008,223500,4.22,5.09,6.07,6.82,3.74,0.00,4.66,0,0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,4,70,7,5,1915,1970,216,0,756.00,756,0,1717,1,0,1,0,3,1,7,1,1998.00,3,642.00,0,272,0,0,0,2,2006,140000,4.09,0.00,6.29,6.87,3.56,0.00,4.58,0,0,1,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,5,60,8,5,2000,2000,655,0,1145.00,1053,0,2198,1,0,2,1,4,1,9,1,2000.00,3,836.00,192,0,0,0,0,12,2008,250000,4.43,5.86,6.19,7.04,4.43,0.00,4.78,0,0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,6,5,1999,2000,0,0,953.00,694,0,1647,0,0,2,1,3,1,7,1,1999.00,2,460.00,0,0,0,0,0,8,2007,175000,4.13,0.00,6.86,6.86,3.69,0.00,4.49,0,0,1,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1456,1457,20,6,6,1978,1988,790,163,1542.00,0,0,2073,1,0,2,0,3,1,7,2,1978.00,2,500.00,349,0,0,0,0,2,2010,210000,4.44,4.78,6.38,7.64,0.00,0.00,4.74,0,0,1,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1457,1458,70,7,9,1941,2006,275,0,1152.00,1152,0,2340,0,0,2,0,4,1,9,2,1941.00,1,252.00,0,0,0,0,0,5,2010,266500,4.19,0.00,6.78,7.08,4.09,7.82,4.55,0,0,1,0,1,1,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1458,1459,20,5,6,1950,1996,49,1029,1078.00,0,0,1078,1,0,1,0,2,1,5,0,1950.00,1,240.00,366,112,0,0,0,4,2010,142125,4.22,0.00,0.00,6.98,0.00,0.00,4.59,0,0,1,0,1,1,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [224]:
test_df

Unnamed: 0,Id,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,LotFrontage_log,MasVnrArea_log,BsmtUnfSF_log,1stFlrSF_log,OpenPorchSF_log,MiscVal_log,SqrtLotArea_log,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,Alley_None,Alley_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,...,Functional_Mod,Functional_Sev,Functional_Typ,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_None,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,PoolQC_Gd,PoolQC_None,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,5,6,1961,1961,468.00,144.00,882.00,0,0,896,0.00,0.00,1,0,2,1,5,0,1961.00,1.00,730.00,140,0,0,120,0,6,2010,4.38,0.00,5.60,6.80,0.00,0.00,4.68,0,1,0,0,1,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,1462,20,6,6,1958,1958,923.00,0.00,1329.00,0,0,1329,0.00,0.00,1,1,3,1,6,0,1958.00,1.00,312.00,393,0,0,0,0,6,2010,4.39,4.68,6.01,7.19,3.58,9.43,4.78,0,0,1,0,1,1,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,1463,60,5,5,1997,1998,791.00,0.00,928.00,701,0,1629,0.00,0.00,2,1,3,1,6,1,1997.00,2.00,482.00,212,0,0,0,0,3,2010,4.30,0.00,4.92,6.83,3.53,0.00,4.77,0,0,1,0,1,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
3,1464,60,6,6,1998,1998,602.00,0.00,926.00,678,0,1604,0.00,0.00,2,1,3,1,7,1,1998.00,2.00,470.00,360,0,0,0,0,6,2010,4.36,3.00,5.78,6.83,3.58,0.00,4.60,0,0,1,0,1,1,0,0,0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,1465,120,8,5,1992,1992,263.00,0.00,1280.00,0,0,1280,0.00,0.00,2,0,2,1,5,0,1992.00,2.00,506.00,0,0,0,144,0,1,2010,3.76,0.00,6.92,7.15,4.41,0.00,4.26,0,0,1,0,1,1,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,4,7,1970,1970,0.00,0.00,546.00,546,0,1092,0.00,0.00,1,1,3,1,5,0,0.00,0.00,0.00,0,0,0,0,0,6,2006,3.04,0.00,6.30,6.30,0.00,0.00,3.78,0,0,0,1,1,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1455,2916,160,4,5,1970,1970,252.00,0.00,546.00,546,0,1092,0.00,0.00,1,1,3,1,6,0,1970.00,1.00,286.00,0,0,0,0,0,4,2006,3.04,0.00,5.68,6.30,3.18,0.00,3.77,0,0,0,1,1,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1456,2917,20,5,7,1960,1996,1224.00,0.00,1224.00,0,0,1224,1.00,0.00,1,0,4,1,7,1,1960.00,2.00,576.00,474,0,0,0,0,9,2006,5.08,0.00,0.00,7.11,0.00,0.00,4.95,0,0,1,0,1,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1457,2918,85,5,5,1992,1992,337.00,0.00,912.00,0,0,970,0.00,1.00,1,0,3,1,6,0,0.00,0.00,0.00,80,0,0,0,0,7,2006,4.13,0.00,6.35,6.88,3.47,6.55,4.63,0,0,1,0,1,1,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [225]:
house_column_list = house.columns.to_list()
test_column_list = test_df.columns.to_list()

len(test_column_list)
print(f'{len(house_column_list)} in house column list')
print(f'{len(test_column_list)} in test column list')

261 in house column list
242 in test column list


In [226]:
drop_col = []
for i in house_column_list:
    if (i not in test_column_list and i!='SalePrice'):
        print(f"{colored(i, 'green')} is not in the test_df columns")
        drop_col.append(i)
#  I am definitely making a mistake here but I can't be bothered to check what, so we take the easy way out, and just remove the columns

[32mUtilities_NoSeWa[0m is not in the test_df columns
[32mCondition2_RRAe[0m is not in the test_df columns
[32mCondition2_RRAn[0m is not in the test_df columns
[32mCondition2_RRNn[0m is not in the test_df columns
[32mHouseStyle_2.5Fin[0m is not in the test_df columns
[32mRoofMatl_CompShg[0m is not in the test_df columns
[32mRoofMatl_Membran[0m is not in the test_df columns
[32mRoofMatl_Metal[0m is not in the test_df columns
[32mRoofMatl_Roll[0m is not in the test_df columns
[32mExterior1st_ImStucc[0m is not in the test_df columns
[32mExterior1st_Stone[0m is not in the test_df columns
[32mExterior2nd_Other[0m is not in the test_df columns
[32mHeating_GasA[0m is not in the test_df columns
[32mHeating_OthW[0m is not in the test_df columns
[32mElectrical_Mix[0m is not in the test_df columns
[32mGarageQual_Fa[0m is not in the test_df columns
[32mPoolQC_Fa[0m is not in the test_df columns
[32mMiscFeature_TenC[0m is not in the test_df columns


The above are all the columns that are in the test set and not in the train set, we'll just drop them for simplicity's sake

In [227]:
house.drop(drop_col, axis='columns', inplace=True)
# drop_col

In [228]:
print(f"Number of observations in house: {colored(house.shape, 'green')}")
print(f"Number of observations in test: {colored(test_df.shape, 'green')}")

Number of observations in house: [32m(1460, 243)[0m
Number of observations in test: [32m(1459, 242)[0m


In [229]:
y = np.log(house['SalePrice'])
y = pd.DataFrame({'SalePrice_log': y})
X = house.drop(['SalePrice'], axis='columns')
y

Unnamed: 0,SalePrice_log
0,12.25
1,12.11
2,12.32
3,11.85
4,12.43
...,...
1455,12.07
1456,12.25
1457,12.49
1458,11.86


In [230]:
print(f"Number of observations in X: {colored(X.shape, 'green')}")
print(f"Number of observations in y: {colored(y.shape, 'green')}")

Number of observations in X: [32m(1460, 242)[0m
Number of observations in y: [32m(1460, 1)[0m


In [231]:
test_df = test_df.fillna(0)
np.any(np.isnan(test_df))

False

In [232]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.reset_index()
print("X_train:",X_train.shape)
print("X_test:",X_test.shape)
print("y_train:",y_train.shape)
print("y_test:",y_test.shape)

X_train: (1022, 242)
X_test: (438, 242)
y_train: (1022, 1)
y_test: (438, 1)


In [233]:
# First using a simple linear regression model with the SalePrice_log and SalePrice
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()
lreg.fit(X_train, y_train)
lreg.score(X_test, y_test)

0.8831053035288541

In [None]:
y_pred = lreg.predict(test_df)
y_pred_results = pd.DataFrame(np.exp(y_pred))
y_pred_results
y_pred_df = pd.DataFrame(y_pred_results)
y_pred_df.index = range(1461, 2920)
y_pred_df.reset_index(level=0, inplace=True)
y_pred_df.columns = ['Id', 'SalePrice']
y_pred_df.to_csv('house_prices_regression_results.csv', index= False)
y_pred_df

In [235]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0006, random_state=1))
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)
y_pred_lasso = lasso.predict(test_df)
y_pred_lasso = pd.DataFrame(y_pred_lasso)
y_pred_lasso.index = range(1461, 2920)
y_pred_lasso.reset_index(level=0, inplace=True)
y_pred_lasso.columns = ['Id', 'SalePrice']
y_pred_lasso['SalePrice'] = np.exp(y_pred_lasso['SalePrice'])
y_pred_lasso.to_csv('lasso_results.csv', index=False)
y_pred_lasso

Unnamed: 0,Id,SalePrice
0,1461,113771.73
1,1462,152301.68
2,1463,182091.87
3,1464,204046.70
4,1465,207961.18
...,...,...
1454,2915,81008.88
1455,2916,85092.22
1456,2917,181698.38
1457,2918,116924.61
