I began with a baseline that predicts the mean training-set sale price for every house; as expected, this model performs very poorly. A linear regression model on the numeric columns achieves an R-squared score of 0.78. I also tried a Ridge regression model to manage the multicollinearity between my features, but it did not perform any better. In the future I will try to use more features and possibly encode some of the discrete (categorical) features.

In [1]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

import eli5
from eli5.sklearn import PermutationImportance

ModuleNotFoundError: No module named 'eli5'

In [None]:
# Load the raw data from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

In [None]:
# Summary statistics of the raw data.
df.describe()

Feature engineering

In [None]:
def address_nas(test):
    """Engineer features, fill missing values and drop unused columns.

    The input frame is not modified; all work happens on a filtered copy.

    Args:
        test: Raw housing DataFrame. It must contain every column referenced
            in the body (``SaleType``, ``Electrical``, ``Alley``,
            ``PoolArea``, ``Fence``, ``Fireplaces``, the garage / basement /
            year columns, ...); a missing column raises an error.

    Returns:
        A new DataFrame without rows whose ``SaleType`` is ``'New'`` or
        ``'COD'`` or whose ``Electrical`` is missing, with the engineered
        ``Has*`` indicator, ratio and age columns added, selected missing
        values filled, and the source / redundant columns dropped.
    """
    ## Drop observations with non-used homes
    # Both row filters are applied together and followed by a single copy so
    # the column assignments below write to an independent frame.
    keep_rows = ~test.SaleType.isin(['New', 'COD']) & test.Electrical.notna()
    df = test[keep_rows].copy()

    ## Create new variables

    # Fix alley variable.  na=False is required: without it missing alleys
    # come back as NaN, which np.where treats as truthy, flagging every home.
    df['HasAlley'] = np.where(df.Alley.str.contains("Grvl|Pave", na=False), 1, 0)

    # Fix pool variable
    df['HasPool'] = np.where(df.PoolArea > 0, 1, 0)

    # Fix fence variable
    df['HasFence'] = np.where(df.Fence.notna(), 1, 0)

    # Create Has Fire Place
    df['HasFirePlace'] = np.where(df.Fireplaces > 0, 1, 0)

    # Has porch: 1 when any of the porch columns is non-zero
    porch_vars = [col for col in df.columns if 'Porch' in col]
    df['HasPorch'] = np.where(df[porch_vars].any(axis='columns'), 1, 0)

    # Has basement
    df['HasBasement'] = np.where(df.BsmtExposure.notna(), 1, 0)

    # Create Has Garage
    df['HasGarage'] = np.where(df.GarageArea > 0, 1, 0)

    # Create ratio of space per car (0/0 for homes without a garage -> 0)
    df['GarageAreaPerCar'] = (df.GarageArea / df.GarageCars).fillna(0)

    # Create ratio of space per room
    df['AverageRoomSize'] = df.GrLivArea / df.TotRmsAbvGrd

    # Create hasfinishedbsmt
    df['HasFinishedBsmt'] = np.where(df.BsmtFinSF1 > 0, 1, 0)

    # create % basement finished (0/0 for homes without a basement -> 0)
    df['BsmtPerFinished'] = (
        (df.BsmtFinSF1 + df.BsmtFinSF2) / df.TotalBsmtSF
    ).fillna(0)

    df['HasCentralAir'] = np.where(df.CentralAir == 'Y', 1, 0)
    df['GasAirHeat'] = np.where(df.Heating == 'GasA', 1, 0)
    df['SBboxElectric'] = np.where(df.Electrical == 'SBrkr', 1, 0)
    df['HasDeck'] = np.where(df.WoodDeckSF > 0, 1, 0)
    # A remodel is recorded when the remodel year differs from the build
    # year (same test as RemodFiveYrs below).
    df['HasRemod'] = np.where(df.YearBuilt != df.YearRemodAdd, 1, 0)

    df['HouseAge'] = df.YrSold - df.YearBuilt
    df['TimeSinceRemodel'] = df.YrSold - df.YearRemodAdd

    df['RemodFiveYrs'] = np.where(
        (df.YearBuilt != df.YearRemodAdd)
        & (df.HouseAge > 5)
        & (df.TimeSinceRemodel <= 5),
        1,
        0,
    )
    df['GaragebuiltWHouse'] = np.where(df.YearBuilt == df.GarageYrBlt, 1, 0)

    ## Fill nas
    # Assign the filled column back instead of using inplace=True on an
    # attribute access, which does not reliably update the parent frame.
    df['LotFrontage'] = df['LotFrontage'].fillna(0)
    df['GarageFinish'] = df['GarageFinish'].fillna('Unf')
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

    ## Drop columns
    # Source columns that the engineered features above replace, plus
    # identifiers and misc columns.  GarageArea is kept.
    df = df.drop(columns=[
        'Id',
        'Alley',
        'GarageType',
        'MiscFeature', 'MiscVal',
        'Fence',
        'Fireplaces', 'FireplaceQu',
        'BsmtExposure',
        'CentralAir', 'Electrical', 'Heating', 'WoodDeckSF',
        'YearBuilt', 'GarageYrBlt', 'YearRemodAdd',
        'PoolArea', 'PoolQC',
        'BsmtFinSF1', 'BsmtFinSF2',
    ])

    # Quality / condition columns other than the Overall* ratings and
    # SaleCondition.
    redundant = [
        col for col in df.columns
        if ('Qual' in col or 'Cond' in col)
        and 'Overall' not in col
        and col != 'SaleCondition'
    ]
    df = df.drop(columns=redundant)

    # Remaining "...Type" columns
    type_col = [col for col in df.columns if 'Type' in col]
    df = df.drop(columns=type_col)

    # Raw porch measurements (summarised by HasPorch)
    df = df.drop(columns=porch_vars)

    return df


In [None]:
# Apply the feature engineering / cleaning step to the raw data.
df = address_nas(df)
# Keep only the non-object (numeric) columns for plotting and modelling.
df_final = df.select_dtypes(exclude='object')

Explore Target Variable

In [None]:
# Box plot showing how the target variable (SalePrice) is distributed
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x=df_final['SalePrice'], ax=ax)
ax.set_title("Boxplot for Target Variable")
ax.set_xlabel("Price in Dollars")
plt.show()

In [None]:
# Correlation of every numeric column with SalePrice.  df_final is used
# because df still holds object (string) columns, on which DataFrame.corr()
# raises in pandas >= 2.0.
df_final.corr()['SalePrice'].values

In [None]:
# One-column heatmap: correlation of each numeric feature with SalePrice
fig, ax = plt.subplots(figsize=(11, 14))
target_corr = df_final.corr()[['SalePrice']]
sns.heatmap(target_corr, xticklabels=True, yticklabels=True, square=True, ax=ax)
ax.set_title(
    "Correlation of Each Independent Variable with the Target Variable",
    wrap=True,
    pad=20,
)
plt.tight_layout()
plt.show()

Explore relationship between sale price and highest correlated variables

In [None]:
from itertools import islice
import matplotlib.image as mpimg
import os
import tempfile

# Get correlation with Sale Price.
# Filter variables that have over abs(.4) correlation
# Remove Sale Price from list of variables
test = df_final.corr()[['SalePrice']].sort_values(by='SalePrice', ascending=False)
keep = test.loc[abs(test.SalePrice) > .4].index.to_list()
keep.remove('SalePrice')

# Split the variable list into rows of at most `vars_per_row` variables,
# stored in one list named Output.  The number of rows follows the number of
# selected variables, so none are dropped and no empty grid is created.
vars_per_row = 5
Inputt = iter(keep)
Output = list(iter(lambda: list(islice(Inputt, vars_per_row)), []))

if Output:
    # Each row is rendered as its own PairGrid, saved as an image, and the
    # images are then stacked into one figure.  The temporary directory is
    # removed automatically, even if plotting fails part-way through.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_paths = []
        for row_number, row_vars in enumerate(Output):
            grid = sns.PairGrid(df_final, y_vars=["SalePrice"], x_vars=row_vars, height=5)
            grid.map(sns.scatterplot, color=".3")
            image_path = os.path.join(tmp_dir, f'g{row_number}.png')
            grid.savefig(image_path)
            plt.close(grid.fig)
            image_paths.append(image_path)

        # Create figure with one subplot per saved row; squeeze=False keeps
        # axarr an array even when there is only a single row.
        n_rows = len(image_paths)
        f, axarr = plt.subplots(n_rows, 1, figsize=(25, 16 * n_rows / 3), squeeze=False)
        axarr = axarr.ravel()

        # Manually construct the image by stacking the saved images, with the
        # x and y axes of the master figure turned off
        for ax, image_path in zip(axarr, image_paths):
            ax.imshow(mpimg.imread(image_path))
            ax.set_axis_off()
        axarr[0].set_title("Display of Independent Variables Highly Correlated with Sales Price\n")

    plt.tight_layout()
    plt.show()


Examine the relationship between Sale Price, Overall Quality Rating, and Square Footage of the Home

In [None]:
# Sale price against above-ground living area, coloured by overall quality
fig, ax = plt.subplots(figsize=(16, 10), dpi=80)
chart = sns.scatterplot(
    data=df_final,
    x='GrLivArea',
    y='SalePrice',
    hue='OverallQual',
    alpha=.7,
    ax=ax,
)
fig.suptitle("The Positive Correlations between Home Sale Price,\n Sq Footage and Quality Rating", fontsize=26)
plt.tight_layout()
plt.show()

Prepping data to model:

In [None]:
# Preview the numeric-only frame that will be modelled.
df_final.head()

In [None]:
def summarize_dataframe(df):
    """Return a per-column summary of ``df``.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``Name`` (column label), ``dtypes`` (column dtype), ``Missing``
        (number of null values) and ``Uniques`` (number of distinct
        non-null values).
    """
    # Built directly from the column labels rather than via reset_index(),
    # so it also works when df.columns carries a name (reset_index would
    # then not produce an 'index' column).
    summary = pd.DataFrame({
        'Name': df.columns.to_list(),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    return summary

In [None]:
# Check dtypes, missing-value counts and cardinality before modelling.
summarize_dataframe(df_final)

In [None]:
# Splitting my data into X and Y
target = ['SalePrice']

X = df_final.drop(columns=target)
y = df_final[target]

# Creating a split in my data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, 
                                                    random_state=42)

Modeling:

In [None]:
# Model-less baseline
y_mean_train = y_train.mean()

baseline_y_pred_train = ([y_mean_train]*len(y_train))
baseline_y_pred_test = ([y_mean_train]*len(y_test))

In [None]:
# R2 and mean absolute error of the mean-only baseline on both splits.
print(f"Training R2: {r2_score(y_train, baseline_y_pred_train)}")
print(f"Testing R2: {r2_score(y_test, baseline_y_pred_test)}")
print("---")
print(f"Training MAE: {mean_absolute_error(y_train, baseline_y_pred_train)}")
print(f"Testing MAE: {mean_absolute_error(y_test, baseline_y_pred_test)}")

In [None]:
# Ordinary least-squares linear regression with default settings.
lr = LinearRegression()

In [None]:
# Fit on the training split only.
lr.fit(X_train, y_train)

In [None]:
# Predictions on both splits, used to compare training and testing scores.
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

In [None]:
# Sanity check: number of test-set predictions.
len(y_pred_test)

In [None]:
# R2 and mean absolute error of the linear regression on both splits.
print(f"Training R2: {r2_score(y_train, y_pred_train)}")
print(f"Testing R2: {r2_score(y_test, y_pred_test)}")
print("---")
print(f"Training MAE: {mean_absolute_error(y_train, y_pred_train)}")
print(f"Testing MAE: {mean_absolute_error(y_test, y_pred_test)}")

In [None]:
# Pairwise correlation between the training features
fig, ax = plt.subplots(figsize=(12, 10))
feature_corr = X_train.corr()
sns.heatmap(feature_corr, xticklabels=True, yticklabels=True, ax=ax)
ax.set_title("Exploring Correlation Between Features", pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Ridge regression with the default regularisation strength (alpha is not set).
# NOTE(review): random_state presumably only matters for stochastic solvers — confirm.
ridge = Ridge(random_state=42)

In [None]:
# Fit on the training split.  The features are passed in unscaled here;
# RobustScaler is imported at the top but not applied in the visible cells.
ridge.fit(X_train, y_train)

# Predictions on both splits for the train/test comparison.
ridge_y_pred_train = ridge.predict(X_train)
ridge_y_pred_test = ridge.predict(X_test)

In [None]:
# R2 and mean absolute error of the ridge model on both splits.
print(f"Training R2: {r2_score(y_train, ridge_y_pred_train)}")
print(f"Testing R2: {r2_score(y_test, ridge_y_pred_test)}")
print("---")
print(f"Training MAE: {mean_absolute_error(y_train, ridge_y_pred_train)}")
print(f"Testing MAE: {mean_absolute_error(y_test, ridge_y_pred_test)}")

Checking Feature Importance

In [None]:
# Permutation importance of the fitted ridge model, computed on the test split.
# NOTE(review): requires the eli5 package; the import cell's recorded output
# shows "No module named 'eli5'", so it must be installed before this runs.
perm = PermutationImportance(ridge, random_state=1).fit(X_test, y_test)
# Show the weight of every feature (top=None), labelled with the column names.
eli5.show_weights(perm, feature_names = X_train.columns.tolist(), top=None)

Next steps: remove some features, perhaps by using ElasticNet (whose L1 penalty can shrink coefficients to exactly zero) instead of only Ridge.