In [2]:
# Selected variables + LR Model with OHE only
# Score: 31k

# 1. EDA and Cleaning

In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
# Read in Data
housing = pd.read_csv('datasets/train.csv')
housing_test = pd.read_csv('datasets/test.csv')

### Read through data dictionary, decide on which variables to use

In [5]:
# Remove spaces in col names b/c data dictionary col names do not include spaces
housing.columns = [n.replace(" ", "") for n in housing.columns]

# Save var names in txt file
# Create function to get variable names into list from txt file
# https://stackoverflow.com/questions/23372086/how-would-i-read-only-the-first-word-of-each-line-of-a-text-file
def get_var_name(txt_file):
    vars = []
    with open(txt_file, 'r') as f:
        for line in f:
            vars.append(line.split(None, 1)[0][:-1])
    return vars

init_vars = get_var_name('datasets/initial_vars.txt')
housing_init = housing[init_vars]

### Null values

In [6]:
# Check for NA values. 
# housing_init.isna().mean()
# Replace NAs with 0. They are not missing values, but 0 (i.e. basement bathroom is 0 not NA if there is no basement)
# https://www.geeksforgeeks.org/replace-nan-values-with-zeros-in-pandas-dataframe/
housing_init = housing_init.fillna(0)

### Variable Transformations

#### Add 'age' of home to be time since build or latest remodel. Assume data is from 2011

In [7]:
# Calculate age as 2011 - year of latest remodel or build
# housing_init['YearRemod/Add'].groupby(housing_init['YearRemod/Add']).count()
housing_init['Age'] = 2011 - housing_init['YearRemod/Add']
housing_init.drop(columns =['YearRemod/Add'], inplace= True)

In [8]:
# Ages range from 1-61 years, which makes sense 
housing_init['Age'].describe()

count    2051.000000
mean       26.809849
std        21.036250
min         1.000000
25%         7.000000
50%        18.000000
75%        46.500000
max        61.000000
Name: Age, dtype: float64

#### Add 'TotalBaths' as new variable, adding up basement full bathrooms, basement half bathrooms, full bathrooms above grade, and half bathrooms above grade. 

In [9]:
# Create new column in dataset for total number of bathrooms
housing_init['TotalBaths'] = housing_init['BsmtFullBath'] + 0.5*housing_init['BsmtHalfBath'] + housing_init['FullBath'] + 0.5*housing_init['HalfBath']
housing_init['TotalBaths'].describe()
housing_init.drop(columns =['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], inplace= True)

#### Add 'OutdoorSF' as outdoor square feet, which represents the sum of wood deck, open porch, enclosed porch, 3 screen porch and screen porch square feet. 

In [10]:
# Add new column for outdoor square footage
housing_init['OutdoorSF'] = housing_init['WoodDeckSF'] + housing_init['OpenPorchSF'] + housing_init['EnclosedPorch'] + housing_init['3SsnPorch'] + housing_init['ScreenPorch'] 
housing_init['OutdoorSF'].describe()
housing_init.drop(columns =['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], inplace= True)

#### Convert month sold into calendar year quarter (Q1 for Jan-Mar, etc.)

In [11]:
# https://www.nar.realtor/blogs/economists-outlook/seasonality-in-the-housing-market
housing_init['MoSold'].describe()

# Cast quarter to new column from month 
housing_init['Quarter'] = np.nan
q = []
for month in housing_init['MoSold']:
    if month < 4:
        q.append('Q1')
    elif month < 7:
        q.append('Q2')
    elif month < 10:
        q.append('Q3')
    else:
        q.append('Q4')
housing_init['Quarter'] = q

housing_init.drop(columns =['MoSold'], inplace=True)
housing_init['Quarter'].groupby(housing_init['Quarter']).count()

Quarter
Q1    351
Q2    817
Q3    579
Q4    304
Name: Quarter, dtype: int64

#### Convert year to binary variable 'GFC', before and after 2008 (Great Financial Crisis) which negatively impacted real estate prices. 

In [12]:
housing_init['SalePrice'].groupby(housing_init['YrSold']).mean()

# Cast yr sold as before/after GFC
housing_init['GFC'] = np.nan
year = []
for yr in housing_init['YrSold']:
    if yr > 2009:
        year.append('0')
    else:
        year.append('1')
housing_init['GFC'] = year

housing_init.drop(columns =['YrSold'], inplace=True)
housing_init['GFC'].groupby(housing_init['GFC']).count()

GFC
0     234
1    1817
Name: GFC, dtype: int64

### Variable Types

In [13]:
# housing_init.info()
# Break out dummy vars. Convert variables to categorical (do this after TTS) 
housing_init_dummy = housing_init.drop(columns = ['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'SalePrice', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths'])
# Break out numeric variables. Ensure in right variable type (float/int)
housing_init_num = housing_init[['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths']]
# housing_init_num

In [22]:
# Create combined new data set for X variables
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
housing_xvars = pd.concat([housing_init_dummy, housing_init_num], axis=1)
# housing_xvars.head()
# housing_xvars.info()

# Preproccesing 

In [15]:
# Define X and y variables
X = housing_xvars
y = housing_init['SalePrice']

# TTS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 95)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1538, 20) (1538,)
(513, 20) (513,)


In [31]:
# Convert variables to categorical with OHE
oh = OneHotEncoder(sparse = False, drop = 'first')
X_train_oh = oh.fit_transform(X_train)
X_test_oh = oh.fit_transform(X_test)
# X_oh = oh.fit_transform(X)
X_oh = pd.concat([pd.DataFrame(X_test_oh, columns = oh.get_feature_names_out()), X_train_oh], axis=1)
# housing_init_dummy_oh = oh.fit_transform(housing_init_dummy)
# housing_xvars_oh = pd.concat([pd.DataFrame(housing_init_dummy_oh, columns = oh.get_feature_names_out()), housing_init_num], axis=1)
X_oh



TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

# Modeling

In [30]:
lr = LinearRegression()
lr.fit(X_train_oh, y_train)
print(lr.score(X_train_oh, y_train))
print(lr.score(X_test_oh, y_test))
preds = lr.predict(X_oh)
# preds.shape
# 0.8873364495160501
# 0.877606905706273

0.9999700300845832


ValueError: X has 1184 features, but LinearRegression is expecting 2272 features as input.

### Convert test data in same way as training data

In [None]:
housing_test0 = pd.read_csv('datasets/test-Copy1.csv')
housing_test0.columns = [n.replace(" ", "") for n in housing_test0.columns]

init_vars1 = get_var_name('datasets/initial_vars-Copy1.txt')
housing_test1 = housing_test0[init_vars1]

housing_test1 = housing_test1.fillna(0)

housing_test1['Age'] = 2011 - housing_test1['YearRemod/Add']
housing_test1.drop(columns =['YearRemod/Add'], inplace= True)

housing_test1['TotalBaths'] = housing_test1['BsmtFullBath'] + 0.5*housing_test1['BsmtHalfBath'] + housing_test1['FullBath'] + 0.5*housing_test1['HalfBath']
housing_test1.drop(columns =['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], inplace= True)

housing_test1['OutdoorSF'] = housing_test1['WoodDeckSF'] + housing_test1['OpenPorchSF'] + housing_test1['EnclosedPorch'] + housing_test1['3SsnPorch'] + housing_test1['ScreenPorch'] 
housing_test1.drop(columns =['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], inplace= True)

housing_test1['Quarter'] = np.nan
q = []
for month in housing_test1['MoSold']:
    if month < 4:
        q.append('Q1')
    elif month < 7:
        q.append('Q2')
    elif month < 10:
        q.append('Q3')
    else:
        q.append('Q4')
housing_test1['Quarter'] = q
housing_test1.drop(columns =['MoSold'], inplace=True)

housing_test1['GFC'] = np.nan
year = []
for yr in housing_test1['YrSold']:
    if yr > 2009:
        year.append('0')
    else:
        year.append('1')
housing_test1['GFC'] = year
housing_test1.drop(columns =['YrSold'], inplace=True)

housing_test1_dummy = housing_test1.drop(columns = ['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths'])
housing_test1_num = housing_test1[['TotalBsmtSF', 'GrLivArea', 'MiscVal', 'PoolArea', 'Age', 'OutdoorSF', 'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'TotalBaths']]
oh = OneHotEncoder(sparse = False, drop = 'first')
housing_test1_dummy_oh = oh.fit_transform(housing_test1_dummy)
housing_xvars_test1_oh = pd.concat([pd.DataFrame(housing_test1_dummy_oh, columns = oh.get_feature_names_out()), housing_test1_num], axis=1)

### Run model on testing data

In [None]:
# Need same columns present in test data and train data. 
# Identify missing columns bw test and training data then add as 0s where appropriate
# housing_xvars_test1_oh
miss_cols = housing_xvars_oh.columns.difference(housing_xvars_test1_oh.columns)

# Add missing columns to test data, fill with 0s
housing_xvars_test1_oh[miss_cols] = 0
# housing_xvars_oh.columns

# Sort columns in test data so in same order as training data
# housing_xvars_test1_oh_sorted = pd.DataFrame(housing_xvars_test1_oh, columns = housing_xvars_oh.columns)
housing_xvars_test1_oh = housing_xvars_test1_oh[housing_xvars_oh.columns]

In [None]:
housing_xvars_test1_oh

In [None]:
# Get predictions on test data 
test_preds = lr.predict(housing_xvars_test1_oh)

# Fit test data with model
housing_test0['SalePrice'] = test_preds
# housing_test0

In [None]:
# Save excel for submission
submission4 = housing_test0[['Id', 'SalePrice']]
submission4.set_index('Id', inplace = True)
submission4.to_csv('submission4.csv')
# submission4