# 1. EDA and Cleaning

In [91]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score



In [92]:
# Read in Data
housing = pd.read_csv('datasets/train.csv')
housing_test = pd.read_csv('datasets/test.csv')

In [93]:
# Lot of NA values. However some actually mean 0, so need to fix this. 
# i.e. GarageQual NA means No Garage
# housing.isna().mean().sort_values(ascending=False).head(20)

In [94]:
# housing.loc[housing['Overall Cond'] == 2, ['Functional']]

### Read through data dictionary, decide on which variables to use

In [95]:
# Remove spaces in col names b/c data dictionary col names do not include spaces
housing.columns = [n.replace(" ", "") for n in housing.columns]
# housing.head()
housing.columns.sort_values()

Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning',
       'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PID',
       'PavedDrive', 'PoolAre

In [96]:
# Save var names in txt file
# Create function to get variable names into list from txt file
# https://stackoverflow.com/questions/23372086/how-would-i-read-only-the-first-word-of-each-line-of-a-text-file

def get_var_name(txt_file):
    vars = []
    with open(txt_file, 'r') as f:
        for line in f:
            vars.append(line.split(None, 1)[0][:-1])
    return vars

init_vars = get_var_name('datasets/initial_vars.txt')
housing_init = housing[init_vars]
housing_init.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,OverallCond,YearRemod/Add,TotalBsmtSF,HeatingQC,CentralAir,GrLivArea,...,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,MoSold,YrSold,Condition1,Fireplaces,PoolArea,SalePrice
0,60,RL,Sawyer,6,8,2005,725.0,Ex,Y,1479,...,0,0,0,0,3,2010,RRAe,0,0,130500
1,60,RL,SawyerW,7,5,1997,913.0,Ex,Y,2122,...,0,0,0,0,4,2009,Norm,1,0,220000
2,20,RL,NAmes,5,7,2007,1057.0,TA,Y,1057,...,0,0,0,0,1,2010,Norm,0,0,109000
3,60,RL,Timber,5,5,2007,384.0,Gd,Y,1444,...,0,0,0,0,4,2010,Norm,0,0,174000
4,50,RL,SawyerW,6,8,1993,676.0,TA,Y,1445,...,0,0,0,0,3,2010,Norm,0,0,138500


### Null values

In [97]:
# Check for NA values. 
housing_init.isna().mean().sort_values
# X_init_vars.loc[X_init_vars['MasVnrArea'].isna(), ]

<bound method Series.sort_values of MSSubClass       0.000000
MSZoning         0.000000
Neighborhood     0.000000
OverallQual      0.000000
OverallCond      0.000000
YearRemod/Add    0.000000
TotalBsmtSF      0.000488
HeatingQC        0.000000
CentralAir       0.000000
GrLivArea        0.000000
BsmtFullBath     0.000975
BsmtHalfBath     0.000975
FullBath         0.000000
HalfBath         0.000000
TotRmsAbvGrd     0.000000
GarageCars       0.000488
WoodDeckSF       0.000000
OpenPorchSF      0.000000
EnclosedPorch    0.000000
3SsnPorch        0.000000
ScreenPorch      0.000000
MiscVal          0.000000
MoSold           0.000000
YrSold           0.000000
Condition1       0.000000
Fireplaces       0.000000
PoolArea         0.000000
SalePrice        0.000000
dtype: float64>

Replace NAs with 0. They are not missing values, but 0 (i.e. basement bathroom is 0 not NA if there is no basement)

In [98]:
# https://www.geeksforgeeks.org/replace-nan-values-with-zeros-in-pandas-dataframe/
housing_init = housing_init.fillna(0)

### Variable Transformations

Add 'age' of home to be time since build or latest remodel. Assume data is from 2011

In [99]:
housing_init['YearRemod/Add'].groupby(housing_init['YearRemod/Add']).count()

YearRemod/Add
1950    262
1951     10
1952      9
1953     18
1954     17
       ... 
2006    139
2007    123
2008     55
2009     23
2010      8
Name: YearRemod/Add, Length: 61, dtype: int64

In [100]:
housing_init['Age'] = 2011 - housing_init['YearRemod/Add']

In [101]:
housing_init.drop(columns =['YearRemod/Add'], inplace= True)

In [102]:
# Ages range from 1-61 years, which makes sense 
housing_init['Age'].describe()

count    2051.000000
mean       26.809849
std        21.036250
min         1.000000
25%         7.000000
50%        18.000000
75%        46.500000
max        61.000000
Name: Age, dtype: float64

Add 'TotalBaths' as new variable, adding up basement full bathrooms, basement half bathrooms, full bathrooms above grade, and half bathrooms above grade. 

In [103]:
housing_init['TotalBaths'] = housing_init['BsmtFullBath'] + 0.5*housing_init['BsmtHalfBath'] + housing_init['FullBath'] + 0.5*housing_init['HalfBath']

In [104]:
housing_init['TotalBaths'].describe()

count    2051.000000
mean        2.221599
std         0.807294
min         1.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: TotalBaths, dtype: float64

In [105]:
housing_init.drop(columns =['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], inplace= True)

Add 'OutdoorSF' as outdoor square feet, which represents the sum of wood deck, open porch, enclosed porch, 3 screen porch and screen porch square feet. 

In [106]:
housing_init['OutdoorSF'] = housing_init['WoodDeckSF'] + housing_init['OpenPorchSF'] + housing_init['EnclosedPorch'] + housing_init['3SsnPorch'] + housing_init['ScreenPorch'] 

In [107]:
housing_init['OutdoorSF'].describe()

count    2051.000000
mean      183.065334
std       160.116258
min         0.000000
25%        49.000000
50%       164.000000
75%       268.000000
max      1424.000000
Name: OutdoorSF, dtype: float64

In [108]:
housing_init.drop(columns =['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], inplace= True)

**Deal with MiscVal ($Value of miscellaneous feature). I want to add it to the model instead of having it as a variable**

Convert month sold into calendar year quarter (Q1 for Jan-Mar, etc.)

In [109]:
# https://www.nar.realtor/blogs/economists-outlook/seasonality-in-the-housing-market
housing_init['MoSold'].describe()

count    2051.000000
mean        6.219893
std         2.744736
min         1.000000
25%         4.000000
50%         6.000000
75%         8.000000
max        12.000000
Name: MoSold, dtype: float64

In [110]:
housing_init['Quarter'] = np.nan
q = []
for month in housing_init['MoSold']:
    if month < 4:
        q.append('Q1')
    elif month < 7:
        q.append('Q2')
    elif month < 10:
        q.append('Q3')
    else:
        q.append('Q4')

housing_init['Quarter'] = q

In [111]:
housing_init['Quarter'].groupby(housing_init['Quarter']).count()

Quarter
Q1    351
Q2    817
Q3    579
Q4    304
Name: Quarter, dtype: int64

In [112]:
housing_init.drop(columns =['MoSold'], inplace=True)

Convert year to binary variable 'GFC', before and after 2008 (Great Financial Crisis) which negatively impacted real estate prices. 

In [113]:
housing_init['SalePrice'].groupby(housing_init['YrSold']).mean()

YrSold
2006    179471.609589
2007    185524.514056
2008    181750.372414
2009    182455.244395
2010    174180.059829
Name: SalePrice, dtype: float64

In [114]:
housing_init['GFC'] = np.nan
year = []
for yr in housing_init['YrSold']:
    if yr > 2009:
        year.append('0')
    else:
        year.append('1')

housing_init['GFC'] = year

In [115]:
housing_init['GFC'].groupby(housing_init['GFC']).count()

GFC
0     234
1    1817
Name: GFC, dtype: int64

In [116]:
housing_init.drop(columns =['YrSold'], inplace=True)

### Variable Types

In [117]:
housing_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    2051 non-null   int64  
 1   MSZoning      2051 non-null   object 
 2   Neighborhood  2051 non-null   object 
 3   OverallQual   2051 non-null   int64  
 4   OverallCond   2051 non-null   int64  
 5   TotalBsmtSF   2051 non-null   float64
 6   HeatingQC     2051 non-null   object 
 7   CentralAir    2051 non-null   object 
 8   GrLivArea     2051 non-null   int64  
 9   TotRmsAbvGrd  2051 non-null   int64  
 10  GarageCars    2051 non-null   float64
 11  MiscVal       2051 non-null   int64  
 12  Condition1    2051 non-null   object 
 13  Fireplaces    2051 non-null   int64  
 14  PoolArea      2051 non-null   int64  
 15  SalePrice     2051 non-null   int64  
 16  Age           2051 non-null   int64  
 17  TotalBaths    2051 non-null   float64
 18  OutdoorSF     2051 non-null 

In [119]:
# Convert variables to categorical (after TTS) 
housing_init_dummy = housing_init.drop(columns = ['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'SalePrice', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths'])

In [None]:
# Do this after TTS
# housing_init_dummy = pd.get_dummies(housing_init_dummy, drop_first = True)
# housing_init_dummy

In [120]:
# Numeric variables
housing_init_num = housing_init[['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths']]
housing_init_num

Unnamed: 0,TotalBsmtSF,GrLivArea,MiscVal,PoolArea,Age,OutdoorSF,TotRmsAbvGrd,GarageCars,Fireplaces,TotalBaths
0,725.0,1479,0,0,6,44,6,2.0,0,2.5
1,913.0,2122,0,0,14,74,8,2.0,1,3.5
2,1057.0,1057,0,0,4,52,5,1.0,0,2.0
3,384.0,1444,0,0,4,100,7,2.0,0,2.5
4,676.0,1445,0,0,18,59,6,2.0,0,2.0
...,...,...,...,...,...,...,...,...,...,...
2046,1884.0,1728,0,0,4,276,7,2.0,1,3.0
2047,861.0,861,0,0,61,158,4,2.0,0,1.0
2048,896.0,1913,0,0,61,0,9,2.0,1,1.5
2049,1200.0,1200,0,0,55,329,6,1.0,2,2.0


In [121]:
# Create combined new data set for X variables
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
housing_xvars = pd.concat([housing_init_dummy, housing_init_num], axis=1)

In [122]:
housing_xvars.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,OverallCond,HeatingQC,CentralAir,Condition1,Quarter,GFC,TotalBsmtSF,GrLivArea,MiscVal,PoolArea,Age,OutdoorSF,TotRmsAbvGrd,GarageCars,Fireplaces,TotalBaths
0,60,RL,Sawyer,6,8,Ex,Y,RRAe,Q1,0,725.0,1479,0,0,6,44,6,2.0,0,2.5
1,60,RL,SawyerW,7,5,Ex,Y,Norm,Q2,1,913.0,2122,0,0,14,74,8,2.0,1,3.5
2,20,RL,NAmes,5,7,TA,Y,Norm,Q1,0,1057.0,1057,0,0,4,52,5,1.0,0,2.0
3,60,RL,Timber,5,5,Gd,Y,Norm,Q2,0,384.0,1444,0,0,4,100,7,2.0,0,2.5
4,50,RL,SawyerW,6,8,TA,Y,Norm,Q1,0,676.0,1445,0,0,18,59,6,2.0,0,2.0


In [123]:
housing_xvars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    2051 non-null   int64  
 1   MSZoning      2051 non-null   object 
 2   Neighborhood  2051 non-null   object 
 3   OverallQual   2051 non-null   int64  
 4   OverallCond   2051 non-null   int64  
 5   HeatingQC     2051 non-null   object 
 6   CentralAir    2051 non-null   object 
 7   Condition1    2051 non-null   object 
 8   Quarter       2051 non-null   object 
 9   GFC           2051 non-null   object 
 10  TotalBsmtSF   2051 non-null   float64
 11  GrLivArea     2051 non-null   int64  
 12  MiscVal       2051 non-null   int64  
 13  PoolArea      2051 non-null   int64  
 14  Age           2051 non-null   int64  
 15  OutdoorSF     2051 non-null   int64  
 16  TotRmsAbvGrd  2051 non-null   int64  
 17  GarageCars    2051 non-null   float64
 18  Fireplaces    2051 non-null 

# Preproccesing 

In [147]:
# Convert variables to categorical with OHE
oh = OneHotEncoder(sparse = False, drop = 'first')
housing_init_dummy_oh = oh.fit_transform(housing_init_dummy)
housing_xvars_oh = pd.concat([housing_init_dummy_oh, housing_init_num], axis=1)
# housing_init_dummy = pd.get_dummies(housing_init_dummy, drop_first = True)
# housing_init_dummy
# pd.DataFrame(housing_init_dummy_oh)
# housing_init_dummy_oh.shape




TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [146]:
housing_init_dummy_oh

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [158]:
oh.get_feature_names_out()

array(['MSSubClass_30', 'MSSubClass_40', 'MSSubClass_45', 'MSSubClass_50',
       'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75', 'MSSubClass_80',
       'MSSubClass_85', 'MSSubClass_90', 'MSSubClass_120',
       'MSSubClass_150', 'MSSubClass_160', 'MSSubClass_180',
       'MSSubClass_190', 'MSZoning_C (all)', 'MSZoning_FV',
       'MSZoning_I (all)', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'Neighborhood_Blueste', 'Neighborhood_BrDale',
       'Neighborhood_BrkSide', 'Neighborhood_ClearCr',
       'Neighborhood_CollgCr', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', 'Neighborhood_Gilbert',
       'Neighborhood_Greens', 'Neighborhood_GrnHill',
       'Neighborhood_IDOTRR', 'Neighborhood_Landmrk',
       'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
       'Neighborhood_NAmes', 'Neighborhood_NPkVill',
       'Neighborhood_NWAmes', 'Neighborhood_NoRidge',
       'Neighborhood_NridgHt', 'Neighborhood_OldTown',
       'Neighborhood_SWISU', 'Neighborhood_Sawyer',
  

In [172]:
# pd.DataFrame(housing_init_dummy_oh, columns = oh.get_feature_names_out())

In [160]:
housing_xvars_oh = pd.concat([pd.DataFrame(housing_init_dummy_oh, columns = oh.get_feature_names_out()), housing_init_num], axis=1)

In [161]:
housing_xvars_oh

Unnamed: 0,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,...,TotalBsmtSF,GrLivArea,MiscVal,PoolArea,Age,OutdoorSF,TotRmsAbvGrd,GarageCars,Fireplaces,TotalBaths
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,725.0,1479,0,0,6,44,6,2.0,0,2.5
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,913.0,2122,0,0,14,74,8,2.0,1,3.5
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1057.0,1057,0,0,4,52,5,1.0,0,2.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,384.0,1444,0,0,4,100,7,2.0,0,2.5
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,676.0,1445,0,0,18,59,6,2.0,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1884.0,1728,0,0,4,276,7,2.0,1,3.0
2047,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,861.0,861,0,0,61,158,4,2.0,0,1.0
2048,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,896.0,1913,0,0,61,0,9,2.0,1,1.5
2049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1200.0,1200,0,0,55,329,6,1.0,2,2.0


In [164]:
# Define X and y variables

X = housing_xvars_oh
y = housing_init['SalePrice']

In [165]:
# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 95)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1538, 92) (1538,)
(513, 92) (513,)


# Model

In [166]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [167]:
lr.score(X_train, y_train)

0.8873364495160501

In [168]:
lr.score(X_test, y_test)

0.877606905706273

In [169]:
preds = lr.predict(X)

In [170]:
preds.shape

(2051,)

### Convert test data in same way as training data

In [185]:
housing_test0 = pd.read_csv('datasets/test-Copy1.csv')
housing_test0.columns = [n.replace(" ", "") for n in housing_test0.columns]

init_vars1 = get_var_name('datasets/initial_vars-Copy1.txt')
housing_test1 = housing_test0[init_vars1]

housing_test1 = housing_test1.fillna(0)

housing_test1['Age'] = 2011 - housing_test1['YearRemod/Add']
housing_test1.drop(columns =['YearRemod/Add'], inplace= True)

housing_test1['TotalBaths'] = housing_test1['BsmtFullBath'] + 0.5*housing_test1['BsmtHalfBath'] + housing_test1['FullBath'] + 0.5*housing_test1['HalfBath']
housing_test1.drop(columns =['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], inplace= True)

housing_test1['OutdoorSF'] = housing_test1['WoodDeckSF'] + housing_test1['OpenPorchSF'] + housing_test1['EnclosedPorch'] + housing_test1['3SsnPorch'] + housing_test1['ScreenPorch'] 
housing_test1.drop(columns =['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], inplace= True)

housing_test1['Quarter'] = np.nan
q = []
for month in housing_test1['MoSold']:
    if month < 4:
        q.append('Q1')
    elif month < 7:
        q.append('Q2')
    elif month < 10:
        q.append('Q3')
    else:
        q.append('Q4')
housing_test1['Quarter'] = q
housing_test1.drop(columns =['MoSold'], inplace=True)

housing_test1['GFC'] = np.nan
year = []
for yr in housing_test1['YrSold']:
    if yr > 2009:
        year.append('0')
    else:
        year.append('1')
housing_test1['GFC'] = year
housing_test1.drop(columns =['YrSold'], inplace=True)

# housing_test1_dum = housing_test1.drop(columns = ['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF'])
# housing_test1_dummy = pd.get_dummies(housing_test1_dum, drop_first = True)
# housing_test1_num = housing_test1[['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF']]
# housing_test1_xvars = pd.concat([housing_test1_dummy, housing_test1_num], axis=1)

housing_test1_dummy = housing_test1.drop(columns = ['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths'])
housing_test1_num = housing_test1[['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths']]
oh = OneHotEncoder(sparse = False, drop = 'first')
housing_test1_dummy_oh = oh.fit_transform(housing_test1_dummy)
housing_xvars_test1_oh = pd.concat([pd.DataFrame(housing_test1_dummy_oh, columns = oh.get_feature_names_out()), housing_test1_num], axis=1)




## Run model on testing data

In [179]:
housing_xvars_test1_oh

Unnamed: 0,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,...,TotalBsmtSF,GrLivArea,MiscVal,PoolArea,Age,OutdoorSF,TotRmsAbvGrd,GarageCars,Fireplaces,TotalBaths
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1020,1928,0,0,61,172,9,1,0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1967,1967,0,0,34,170,10,2,0,2.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,654,1496,0,0,5,124,7,2,1,3.5
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,968,968,0,0,5,184,5,2,0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1394,1394,0,0,48,261,6,2,2,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1084,1877,0,0,37,96,8,2,1,3.5
874,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1104,1988,0,0,12,230,9,2,1,2.5
875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,952,1211,0,0,43,63,5,1,1,1.0
876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,864,864,0,0,40,0,5,2,0,1.0


In [196]:
miss_cols = housing_xvars_oh.columns.difference(housing_xvars_test1_oh.columns)
miss_cols

Index(['HeatingQC_Po', 'MSSubClass_150', 'MSZoning_C (all)',
       'Neighborhood_GrnHill', 'Neighborhood_Landmrk', 'OverallQual_2'],
      dtype='object')

In [197]:
# Add missing columns to test data
housing_xvars_test1_oh[[miss_cols] = 0]

SyntaxError: invalid syntax (2677390517.py, line 2)

In [183]:
housing_xvars_oh.columns

Index(['MSSubClass_30', 'MSSubClass_40', 'MSSubClass_45', 'MSSubClass_50',
       'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75', 'MSSubClass_80',
       'MSSubClass_85', 'MSSubClass_90', 'MSSubClass_120', 'MSSubClass_150',
       'MSSubClass_160', 'MSSubClass_180', 'MSSubClass_190',
       'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_I (all)', 'MSZoning_RH',
       'MSZoning_RL', 'MSZoning_RM', 'Neighborhood_Blueste',
       'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr',
       'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards',
       'Neighborhood_Gilbert', 'Neighborhood_Greens', 'Neighborhood_GrnHill',
       'Neighborhood_IDOTRR', 'Neighborhood_Landmrk', 'Neighborhood_MeadowV',
       'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill',
       'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt',
       'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer',
       'Neighborhood_SawyerW',

In [184]:
test_preds = lr.predict(housing_test1_xvars)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MSSubClass
- OverallCond
- OverallQual
Feature names seen at fit time, yet now missing:
- HeatingQC_Po
- MSSubClass_120
- MSSubClass_150
- MSSubClass_160
- MSSubClass_180
- ...
