In [9]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [67]:
# Read in Data
housing = pd.read_csv('datasets/train.csv')
housing.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [66]:
temp = housing.isna().mean() 
temp.tail(40)

HeatingQC        0.000000
CentralAir       0.000000
Electrical       0.000000
1stFlrSF         0.000000
2ndFlrSF         0.000000
LowQualFinSF     0.000000
GrLivArea        0.000000
BsmtFullBath     0.000975
BsmtHalfBath     0.000975
FullBath         0.000000
HalfBath         0.000000
BedroomAbvGr     0.000000
KitchenAbvGr     0.000000
KitchenQual      0.000000
TotRmsAbvGrd     0.000000
Functional       0.000000
Fireplaces       0.000000
FireplaceQu      0.487567
GarageType       0.055095
GarageYrBlt      0.055583
GarageFinish     0.055583
GarageCars       0.000488
GarageArea       0.000488
GarageQual       0.055583
GarageCond       0.055583
PavedDrive       0.000000
WoodDeckSF       0.000000
OpenPorchSF      0.000000
EnclosedPorch    0.000000
3SsnPorch        0.000000
ScreenPorch      0.000000
PoolArea         0.000000
PoolQC           0.995612
Fence            0.804973
MiscFeature      0.968308
MiscVal          0.000000
MoSold           0.000000
YrSold           0.000000
SaleType    

In [80]:
# Remove spaces in col names
housing.columns = [n.replace(" ", "") for n in housing.columns]
housing.head()

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [60]:
# DUMMIFY THE VARIABLES BELOW

# MSSubClass: The building class
# MSZoning: Identifies the general zoning classification of the sale. 
# Street: Type of road access to property - binary
# Alley: Type of alley access to property 
# LotShape: General shape of property 
# LandContour: Flatness of the property 
# Utilities: Type of utilities available 
# LotConfig: Lot configuration 
# LandSlope: Slope of property 
# *Neighborhood: Physical locations within Ames city limits 
# Condition1: Proximity to main road or railroad 
# Condition2: Proximity to main road or railroad (if a second is present) 
# *BldgType: Type of dwelling 
# HouseStyle: Style of dwelling 
# *OverallQual: Overall material and finish quality 
# *OverallCond: Overall condition rating 
# RoofStyle: Type of roof
# RoofMatl: Roof material
# Exterior1st: Exterior covering on house
# Exterior2nd: Exterior covering on house (if more than one material)
# MasVnrType: Masonry veneer type
# ExterQual: Exterior material quality
# ExterCond: Present condition of the material on the exterior
# Foundation: Type of foundation
# BsmtQual: Height of the basement
# BsmtCond: General condition of the basement
# BsmtExposure: Walkout or garden level basement walls
# BsmtFinType1: Quality of basement finished area
# BsmtFinType2: Quality of second finished area (if present)
# Heating: Type of heating
# HeatingQC: Heating quality and condition
# CentralAir: Central air conditioning - binary
# Electrical: Electrical system
# Functional: Home functionality rating
# GarageType: Garage location
# GarageFinish: Interior finish of the garage
# GarageQual: Garage quality
# GarageCond: Garage condition
# PavedDrive: Paved driveway
# PoolQC: Pool quality
# Fence: Fence quality
# MiscFeature: Miscellaneous feature not covered in other categories
# SaleType: Type of sale
# FireplaceQu: Fireplace quality

# Create function to get variable names into list
# https://stackoverflow.com/questions/23372086/how-would-i-read-only-the-first-word-of-each-line-of-a-text-file

def get_var_name(txt_file):
    vars = []
    with open(txt_file, 'r') as f:
        for line in f:
            vars.append(line.split(None, 1)[0][:-1])
    return vars

dummy_vars = get_var_name('datasets/dummy_vars.txt')
X_dummy_vars = housing[dummy_vars]
X_dummy_vars.head()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType
0,60,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,...,Typ,Attchd,RFn,TA,TA,Y,,,,WD
1,60,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,...,Typ,Attchd,RFn,TA,TA,Y,,,,WD
2,20,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,Typ,Detchd,Unf,TA,TA,Y,,,,WD
3,60,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Timber,...,Typ,BuiltIn,Fin,TA,TA,Y,,,,WD
4,50,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,SawyerW,...,Typ,Detchd,Unf,TA,TA,N,,,,WD


In [110]:
# oh = OneHotEncoder(sparse=False, drop='first')
# X_dummy_vars_oh = oh.fit_transform(X_dummy_vars)
# X_dummy_vars_oh

X_dummy_vars_gd = pd.get_dummies(X_dummy_vars, drop_first=True)
X_dummy_vars_gd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Columns: 207 entries, MSSubClass to SaleType_WD 
dtypes: int64(3), uint8(204)
memory usage: 456.8 KB


In [87]:
# ENSURE VARIABLES BELOW ARE IN NUMBER FORMAT
# LotFrontage: Linear feet of street connected to property 
# *LotArea: Lot size in square feet 
# MasVnrArea: Masonry veneer area in square feet 
# BsmtFinSF1: Type 1 finished square feet
# BsmtFinSF2: Type 2 finished square feet
# BsmtUnfSF: Unfinished square feet of basement area
# *TotalBsmtSF: Total square feet of basement area
# 1stFlrSF: First Floor square feet
# 2ndFlrSF: Second floor square feet
# *LowQualFinSF: Low quality finished square feet (all floors)
# GrLivArea: Above grade (ground) living area square feet
# BsmtFullBath: Basement full bathrooms
# BsmtHalfBath: Basement half bathrooms
# FullBath: Full bathrooms above grade
# HalfBath: Half baths above grade
# Bedroom: Number of bedrooms above basement level
# Kitchen: Number of kitchens
# TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
# Fireplaces: Number of fireplaces
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# WoodDeckSF: Wood deck area in square feet
# OpenPorchSF: Open porch area in square feet
# EnclosedPorch: Enclosed porch area in square feet
# 3SsnPorch: Three season porch area in square feet
# ScreenPorch: Screen porch area in square feet
# PoolArea: Pool area in square feet
# MiscVal: USD Value of miscellaneous feature

# Get variable names for numeric variables
numeric_vars = get_var_name('datasets/numeric_vars.txt')
X_numeric_vars = housing[numeric_vars]
X_numeric_vars.head()

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,,13517,289.0,533.0,0.0,192.0,725.0,725,754,0,...,0,2.0,475.0,0,44,0,0,0,0,0
1,43.0,11492,132.0,637.0,0.0,276.0,913.0,913,1209,0,...,1,2.0,559.0,0,74,0,0,0,0,0
2,68.0,7922,0.0,731.0,0.0,326.0,1057.0,1057,0,0,...,0,1.0,246.0,0,52,0,0,0,0,0
3,73.0,9802,0.0,0.0,0.0,384.0,384.0,744,700,0,...,0,2.0,400.0,100,0,0,0,0,0,0
4,82.0,14235,0.0,0.0,0.0,676.0,676.0,831,614,0,...,0,2.0,484.0,0,59,0,0,0,0,0


In [94]:
# Convert all columns here to numeric
X_numeric_vars_to_num = X_numeric_vars.apply(pd.to_numeric)
X_numeric_vars_to_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 28 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1721 non-null   float64
 1   LotArea        2051 non-null   int64  
 2   MasVnrArea     2029 non-null   float64
 3   BsmtFinSF1     2050 non-null   float64
 4   BsmtFinSF2     2050 non-null   float64
 5   BsmtUnfSF      2050 non-null   float64
 6   TotalBsmtSF    2050 non-null   float64
 7   1stFlrSF       2051 non-null   int64  
 8   2ndFlrSF       2051 non-null   int64  
 9   LowQualFinSF   2051 non-null   int64  
 10  GrLivArea      2051 non-null   int64  
 11  BsmtFullBath   2049 non-null   float64
 12  BsmtHalfBath   2049 non-null   float64
 13  FullBath       2051 non-null   int64  
 14  HalfBath       2051 non-null   int64  
 15  BedroomAbvGr   2051 non-null   int64  
 16  KitchenAbvGr   2051 non-null   int64  
 17  TotRmsAbvGrd   2051 non-null   int64  
 18  Fireplac

In [90]:
# CONVERT THE VARIABLES BELOW TO TIME SERIES
# CAN MAKE SERIES DATA LIKE 0-5 YRS, 6-10, 10-15, 16+ ETC. MONTH TO SZN
# YearBuilt: Original construction date - convert to yr
# *YearRemodAdd: Remodel date (same as construction date if no remodeling or additions) - convert to yr
# GarageYrBlt: Year garage was built
# MoSold: Month Sold
# YrSold: Year Sold


# Get variable names for time variables
time_vars = get_var_name('datasets/time_vars.txt')
X_time_vars = housing[time_vars]
X_time_vars.head()



Unnamed: 0,YearBuilt,YearRemod/Add,GarageYrBlt,MoSold,YrSold
0,1976,2005,1976.0,3,2010
1,1996,1997,1997.0,4,2009
2,1953,2007,1953.0,1,2010
3,2006,2007,2007.0,4,2010
4,1900,1993,1957.0,3,2010


In [98]:
# Convert cols to time
X_time_vars_to_time = X_time_vars.apply(pd.to_datetime)
X_time_vars_to_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   YearBuilt      2051 non-null   datetime64[ns]
 1   YearRemod/Add  2051 non-null   datetime64[ns]
 2   GarageYrBlt    1937 non-null   datetime64[ns]
 3   MoSold         2051 non-null   datetime64[ns]
 4   YrSold         2051 non-null   datetime64[ns]
dtypes: datetime64[ns](5)
memory usage: 80.2 KB


In [111]:
# Combine new housing data set with cleaned up data
frames = [X_time_vars_to_time, X_numeric_vars_to_num, X_dummy_vars_gd]

housing_clean = pd.concat(frames)

Unnamed: 0,YearBuilt,YearRemod/Add,GarageYrBlt,MoSold,YrSold,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,1970-01-01 00:00:00.000001976,1970-01-01 00:00:00.000002005,1970-01-01 00:00:00.000001976,1970-01-01 00:00:00.000000003,1970-01-01 00:00:00.000002010,,,,,,...,,,,,,,,,,
1,1970-01-01 00:00:00.000001996,1970-01-01 00:00:00.000001997,1970-01-01 00:00:00.000001997,1970-01-01 00:00:00.000000004,1970-01-01 00:00:00.000002009,,,,,,...,,,,,,,,,,
2,1970-01-01 00:00:00.000001953,1970-01-01 00:00:00.000002007,1970-01-01 00:00:00.000001953,1970-01-01 00:00:00.000000001,1970-01-01 00:00:00.000002010,,,,,,...,,,,,,,,,,
3,1970-01-01 00:00:00.000002006,1970-01-01 00:00:00.000002007,1970-01-01 00:00:00.000002007,1970-01-01 00:00:00.000000004,1970-01-01 00:00:00.000002010,,,,,,...,,,,,,,,,,
4,1970-01-01 00:00:00.000001900,1970-01-01 00:00:00.000001993,1970-01-01 00:00:00.000001957,1970-01-01 00:00:00.000000003,1970-01-01 00:00:00.000002010,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,NaT,NaT,NaT,NaT,NaT,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2047,NaT,NaT,NaT,NaT,NaT,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2048,NaT,NaT,NaT,NaT,NaT,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2049,NaT,NaT,NaT,NaT,NaT,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [113]:
# Remove all NA values

# housing_clean.dropna(housing_clean, inplace=True) 
# housing_clean

AttributeError: module 'pandas' has no attribute 'dropna'