# Ames Housing Price Prediction

## Regression

In this notebook, I will explore ordinary (unregularised) and regularised regression models to determine an effective model for predicting house prices in Ames, Iowa.

## Import and clean

In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [110]:
# Read the training and test CSV files from the input/ directory
train, test = pd.read_csv('input/train.csv'), pd.read_csv('input/test.csv')

In [111]:
# Total number of missing cells in each split
print('Num null values in training set:', train.isnull().sum().sum())
print('Num null values in test set:', test.isnull().sum().sum())

# Fraction of training rows missing, for the five columns with the most gaps
train.isnull().sum().sort_values(ascending=False).div(len(train)).head(5)

Num null values in training set: 6965
Num null values in test set: 7000


PoolQC         0.995205
MiscFeature    0.963014
Alley          0.937671
Fence          0.807534
FireplaceQu    0.472603
dtype: float64

In [112]:
# Fraction of test rows missing, for the five columns with the most gaps
test.isnull().sum().sort_values(ascending=False).div(len(test)).head(5)

PoolQC         0.997944
MiscFeature    0.965045
Alley          0.926662
Fence          0.801234
FireplaceQu    0.500343
dtype: float64

In [113]:
def cols_startswith(columns, value, exclude=()):
    """Return the names in ``columns`` that start with ``value``, minus exclusions.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names (for example ``df.columns``).
    value : str or tuple of str
        Prefix (or prefixes) handed to ``str.startswith``.
    exclude : collection of str, optional
        Names to leave out of the result even when they match the prefix.
        The default is an empty tuple so no mutable object is shared
        between calls.

    Returns
    -------
    list of str
        The matching names, in the order they appear in ``columns``.
    """
    return [name for name in columns if name.startswith(value) and name not in exclude]

def clean(df):
    """Fill every missing value in a housing DataFrame, in place.

    Imputation rules, applied in this order:

    * ``PoolQC``, ``MiscFeature``, ``Alley``, ``Fence``, ``FireplaceQu``:
      NaN becomes the string ``'None'``.
    * ``LotFrontage``: NaN becomes the column mean.
    * Columns whose name starts with ``Garage``: numeric columns get the
      column mean, every other column gets ``'None'``.
    * Columns whose name starts with ``Bsmt``: numeric columns get ``0``,
      every other column gets ``'None'``.
    * ``MasVnrType`` gets ``'None'``; ``MasVnrArea`` gets ``0``.
    * Any column that still contains NaN: numeric columns get the column
      mean, every other column gets its most frequent value.

    Numeric versus non-numeric is decided from each column's dtype, not
    from a hand-maintained list of names, so a numeric ``Garage*`` or
    ``Bsmt*`` column can never be filled with the string ``'None'``.

    All means and modes are computed from ``df`` itself, so two frames
    cleaned separately are imputed with their own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It must contain the columns named explicitly above.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.

    Raises
    ------
    KeyError
        If one of the explicitly named columns is absent from ``df``.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Categorical features where a missing entry is recorded as 'None'
    # (all but FireplaceQu are >80% missing in the data explored above).
    absent_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
    df[absent_cols] = df[absent_cols].fillna('None')

    # LotFrontage: fill gaps with the column mean
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())

    # Garage: numeric columns -> mean, categorical columns -> 'None'.
    # Splitting by dtype keeps numeric columns outside the old hard-coded
    # list (presumably e.g. GarageYrBlt -- confirm against the data
    # dictionary) from being turned into mixed str/float object columns.
    for col in cols_startswith(df.columns, 'Garage'):
        if is_numeric(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna('None')

    # Basement: numeric columns -> 0, categorical columns -> 'None'.
    # Same dtype-based split (presumably covers e.g. BsmtUnfSF -- confirm).
    for col in cols_startswith(df.columns, 'Bsmt'):
        df[col] = df[col].fillna(0 if is_numeric(df[col]) else 'None')

    # Masonry veneer
    df['MasVnrType'] = df['MasVnrType'].fillna('None')
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

    # Remaining columns: numeric -> mean, anything else -> most common value
    for col in df.columns[df.isnull().any()]:
        if is_numeric(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            modes = df[col].mode()
            # mode() is empty when the column has no non-null values;
            # there is nothing sensible to fill with in that case.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    return df

In [108]:
# Impute both splits; each call fills the frame it is given and returns it
train = clean(train)
test = clean(test)

# Sanity check: no missing cells should remain in either split
print('Num null values in training set:', train.isnull().sum().sum())
print('Num null values in test set:', test.isnull().sum().sum())

Num null values in training set: 0
Num null values in test set: 0
