# Predicting House Sale Prices using Linear Regression

In this project we will be looking at house sales data from Ames, Iowa. We will review the features that have been provided and determine the best set of features to use for linear regression modeling of the dataset to predict future house sale prices.

In [1]:
# Import all modules/classes need for this project
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [2]:
# Read in the AmesHousing.tsv file for analysis
data = pd.read_csv('AmesHousing.tsv', delimiter='\t')
data.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [9]:
# Function to transform features to a format this useful for modeling
def transform_features(data):
    num_missing = data.isnull().sum()
    missing_cols_to_drop = num_missing[(num_missing > len(data)/20)].sort_values()
    data = data.drop(missing_cols_to_drop.index, axis=1)
    
    text_missing_vals = data.select_dtypes(include=['object']).isnull().sum()
    missing_cols_to_drop2 = text_missing_vals[text_missing_vals > 0]
    data = data.drop(missing_cols_to_drop2.index, axis=1)
    
    num_missing = data.select_dtypes(include=['int', 'float']).isnull().sum()
    numeric_cols_to_fix = num_missing[(num_missing < len(data)/20) &
                                 (num_missing > 0)]
    values_to_use = data[numeric_cols_to_fix.index].mode().to_dict(orient='records')[0]
    data = data.fillna(values_to_use)
    
    data['years_until_sold'] = data['Yr Sold'] - data['Year Built']
    data['years_since_remod'] = data['Yr Sold'] - data['Year Remod/Add']
    data = data[(data['years_until_sold'] >= 0) & (data['years_since_remod'] >= 0)]
    data = data.drop(['Year Built', 'Year Remod/Add'], axis=1)
    
    data = data.drop(['PID','Order','Mo Sold','Sale Condition','Sale Type','Yr Sold'], axis=1)
    
    return data

# Function to select the features we want to use for our modeling
def select_features(data, coeff_threshold=0.4, cat_threshold=10):
    numerical_cols = data.select_dtypes(include=['int','float'])
    numerical_coeffs = numerical_cols.corr()['SalePrice'].abs()
    data = data.drop(numerical_coeffs[numerical_coeffs < coeff_threshold].index, axis=1)
    
    nominal_cols = ["PID","MS SubClass","MS Zoning","Street","Alley",
                "Land Contour","Lot Config","Neighborhood","Condition 1",
                "Condition 2","Bldg Type","House Style","Roof Style",
                "Roof Matl","Exterior 1st","Exterior 2nd","Mas Vnr Type",
                "Foundation","Heating","Central Air","Garage Type",
                "Misc Feature","Sale Type","Sale Condition"]
    
    remaining_cat_cols = []
    for col in nominal_cols:
        if col in data.columns:
            remaining_cat_cols.append(col)
            
    col_cat_count = data[remaining_cat_cols].apply(lambda col: len(col.value_counts()))
    data = data.drop(col_cat_count[col_cat_count > cat_threshold].index, axis=1)
    
    text_cols = data.select_dtypes(include=['object'])
    for col in text_cols:
        data[col] = data[col].astype('category')
    
    data = pd.concat([data,pd.get_dummies(data.select_dtypes(
        include=['category']))], axis=1)
    data = data.drop(text_cols, axis=1)
    
    return data

# Function to perform our training of a model and analysis of results using RMSE
def train_and_test(data, k=0):
    numerical_cols = data.select_dtypes(include=['int','float'])
    train_cols = numerical_cols.columns.drop('SalePrice')
    lr = LinearRegression()
    
    if k == 0:
        train = data[0:1460]
        test = data[1460:]
    
        lr.fit(train[train_cols], train['SalePrice'])
        test_predictions = lr.predict(test[train_cols])
        mse = mean_squared_error(test_predictions, test['SalePrice'])
        rmse = np.sqrt(mse)
        return rmse
    
    if k == 1:
        shuffled_data = data.sample(frac=1,)
        fold_one = data[0:1460]
        fold_two = data[1460:]
        
        lr.fit(fold_one[train_cols], fold_one['SalePrice'])
        predictions_one = lr.predict(fold_two[train_cols])
        mse_one = mean_squared_error(predictions_one, fold_two['SalePrice'])
        rmse_one = np.sqrt(mse_one)
        
        lr.fit(fold_two[train_cols], fold_two['SalePrice'])
        predictions_two = lr.predict(fold_one[train_cols])
        mse_two = mean_squared_error(predictions_two, fold_two['SalePrice'])
        rmse_two = np.sqrt(mse_two)
        
        return np.mean([rmse_one, rmse_two])
    else:
        kf = KFold(n_splits=k, shuffle=True)
        rmses = []
        for train_index, test_index in kf.split(data):
            train = data.iloc[train_index]
            test = data.iloc[test_index]
            lr.fit(train[train_cols], train['SalePrice'])
            predictions = lr.predict(test[train_cols])
            mse = mean_squared_error(test['SalePrice'],predictions)
            rmse = np.sqrt(mse)
            rmses.append(rmse)
        return np.mean(rmses)

transform_data = transform_features(data)
selected_data = select_features(transform_data)
rmse = train_and_test(selected_data, k=4)
rmse

33138.29684594521

## Feature Engineering

We will first look at transforming the features that we use for our model to be in a format that is helpful in determining our predictions.

Let's begin by removing any column that is missing more than 5% of the values.

In [None]:
num_missing = data.isnull().sum()

In [None]:
missing_cols_to_drop = num_missing[(num_missing > len(data)/20)].sort_values()

data = data.drop(missing_cols_to_drop.index, axis=1)

Next, let's drop any text columns that are missing any data at all.

In [None]:
text_missing_vals = data.select_dtypes(include=['object']).isnull().sum()

missing_cols_to_drop2 = text_missing_vals[text_missing_vals > 0]

data = data.drop(missing_cols_to_drop2.index, axis=1)

Finally, let's fill in any remaining null values within the numerical columns with the most common value for that row.

In [None]:
num_missing = data.select_dtypes(include=['int', 'float']).isnull().sum()
numeric_cols_to_fix = num_missing[(num_missing < len(data)/20) &
                                 (num_missing > 0)]
values_to_use = data[numeric_cols_to_fix.index].mode().to_dict(orient='records')[0]

In [None]:
data = data.fillna(values_to_use)
data.isnull().sum().value_counts()

While the year a house was sold, the year it was built and the year it was remodeled or had an addition all don't really tell us useful information in modeling. Together they can provide us useful information such as years from when it was built to when it was sold or years since it has been remodeled.

In [None]:
data['years_until_sold'] = data['Yr Sold'] - data['Year Built']
data['years_since_remod'] = data['Yr Sold'] - data['Year Remod/Add']

# No value from the above columns should be negative.
# This wouldn't make sense as the Yr Sold should be the largest year
data = data[(data['years_until_sold'] >= 0) & (data['years_since_remod'] >= 0)]

# We'll also remove the columns that were used for conversions
data = data.drop(['Year Built', 'Year Remod/Add'], axis=1)

Finally, let's look at which columns just aren't useful for machine learning or columns that just leak data about the final sale

In [None]:
# PID and Orer don't tell us useful information for this analysis
data = data.drop(['PID','Order'], axis=1)

# Mo Sold, Sale Condition, Sale Type and Yr Sold tell us info about the sale
data = data.drop(['Mo Sold','Sale Condition','Sale Type','Yr Sold'], axis=1)

At this point, we'll update our function for transform_features() to incorporate these changes.

## Feature Selection

Now, we'll look at which features give us the best correlation to the predictions we want to make.

In [None]:
numerical_cols = transform_data.select_dtypes(include=['int','float'])
numerical_coeffs = numerical_cols.corr()['SalePrice'].abs()
numerical_coeffs.sort_values()

In [None]:
sns.heatmap(numerical_cols.corr().abs())

Gr Living Area and Overal Qual seem to have the best correlation to our target, SalePrice. Further, there are a number of features that don't correlate well at all. Let's remove some of those.

In [None]:
# Let's remove any features that have a correlation coefficient below 0.4
transform_data = transform_data.drop(numerical_coeffs[numerical_coeffs < 0.4].index, axis=1)

In [None]:
numerical_cols = transform_data.select_dtypes(include=['int','float'])
numerical_coeffs = numerical_cols.corr()['SalePrice'].abs()
numerical_coeffs.sort_values()

There are a number of columns that should be categorical. Let's separate those.

In [None]:
# List of columns from documentation that should be catgorical
nominal_cols = ["PID","MS SubClass","MS Zoning","Street","Alley",
                "Land Contour","Lot Config","Neighborhood","Condition 1",
                "Condition 2","Bldg Type","House Style","Roof Style",
                "Roof Matl","Exterior 1st","Exterior 2nd","Mas Vnr Type",
                "Foundation","Heating","Central Air","Garage Type",
                "Misc Feature","Sale Type","Sale Condition"]

In [None]:
# Of the listed nominal cols, let's see what we have left to use
remaining_cat_cols = []
for col in nominal_cols:
    if col in transform_data.columns:
        remaining_cat_cols.append(col)
        
# Let's get rid of the columns that seem to have too many options to really
# make them useful for our model
col_cat_count = transform_data[remaining_cat_cols].apply(lambda col: len(col.value_counts()))
transform_data = transform_data.drop(col_cat_count[col_cat_count > 10].index, axis=1)

Let's finally convert the remaining categorical columns to a categorical type.

In [None]:
text_cols = transform_data.select_dtypes(include=['object'])
for col in text_cols:
    transform_data[col] = transform_data[col].astype('category')
    
transform_data = pd.concat([transform_data,
                           pd.get_dummies(transform_data.select_dtypes(include=['category']))],
                           axis=1)
transform_data = transform_data.drop(text_cols, axis=1)

At this point, we'll update the select_features function with the calculations we made.