# Predicting House Sale Prices
## Introduction

In [1]:
import pandas as pd
pd.options.display.max_columns=999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold

In [2]:
house=pd.read_csv('AmesHousing.tsv',delimiter='\t')
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [17]:
def transform_features(df):
    return df

def select_features(df):
    return df[['Gr Liv Area','SalePrice']]

def train_and_test(df):
    train = df[:1460]
    test=df[1460:]
    ## Use pd.DataFrame.select_dtypes()` to specify column types
    ## and return only those columns as a data frame.
    numeric_train=train.select_dtypes(include=['integer','float'])
    numeric_test=test.select_dtypes(include=['integer','float'])
    
    features=numeric_train.columns.drop('SalePrice')
    lr=linear_model.LinearRegression()
    lr.fit(train[features],train['SalePrice'])
    predictions=lr.predict(test[features])
    mse=mean_squared_error(test['SalePrice'],predictions)
    rmse=np.sqrt(mse)
    return rmse


In [19]:
transform_house=transform_features(house)
filtered_house=select_features(transform_house)
rmse=train_and_test(filtered_house)
rmse

57088.25161263909

## Feature Engeneering

* Handle missing values:
** All columns:
*** Drop any with 5% or more missing values
** Text columns:
*** Drop any with 1 or more missing values
** Numerical columns:
*** Fill in with the most common value

1.All columns

In [36]:
num_missing=house.isnull().sum()
num_missing

Order             0
PID               0
MS SubClass       0
MS Zoning         0
Lot Area          0
                 ..
Mo Sold           0
Yr Sold           0
Sale Type         0
Sale Condition    0
SalePrice         0
Length: 71, dtype: int64

In [35]:
# drop the columns containing more than 5% missing values
drop_missing_cols=num_missing[(num_missing>len(house)/20)].sort_values()
drop_missing_cols
house=house.drop(drop_missing_cols.index,axis=1)

2.Text columns

In [38]:
text_missing_cnt=house.select_dtypes(include=['object']).isnull().sum().sort_values()

drop_missing_cols_2=text_missing_cnt[text_missing_cnt>0]
house=house.drop(drop_missing_cols_2.index,axis=1)

3.Numerical columns

In [40]:
num_missing=house.select_dtypes(include=['int','float']).isnull().sum()
fixable_numeric_cols=num_missing[(num_missing<len(house)/20)&(num_missing>0)].sort_values()
fixable_numeric_cols

BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Garage Cars        1
Garage Area        1
Bsmt Full Bath     2
Bsmt Half Bath     2
Mas Vnr Area      23
dtype: int64

In [58]:
rpm_value_dict=house[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
rpm_value_dict

{'BsmtFin SF 1': 0.0,
 'BsmtFin SF 2': 0.0,
 'Bsmt Unf SF': 0.0,
 'Total Bsmt SF': 0.0,
 'Garage Cars': 2.0,
 'Garage Area': 0.0,
 'Bsmt Full Bath': 0.0,
 'Bsmt Half Bath': 0.0,
 'Mas Vnr Area': 0.0}

In [42]:
house=house.fillna(rpm_value_dict)

In [44]:
house.isnull().sum().value_counts()

0    64
dtype: int64

Create new features to better capure the information

In [45]:
year_sold=house['Yr Sold']-house['Year Built']
year_sold[year_sold<0]

2180   -1
dtype: int64

In [47]:
year_remod=house['Yr Sold']-house['Year Remod/Add']
year_remod[year_remod<0]

1702   -1
2180   -2
2181   -1
dtype: int64

In [48]:
# Create new columns
house['Years Before Sale']=year_sold
house['Years Before Remod']=year_remod
# Drop rows with negative values
house=house.drop([1702,2180,2181],axis=0)
# No longer need original year columns
house=house.drop(['Year Built','Year Remod/Add'],axis=1)

In [53]:
# Drop columns that are not useful for machine learning
house=house.drop(['PID', 'Order'], axis=1)
# Drop colunms that leak information about the final sale
house=house.drop(['Mo Sold','Sale Condition','Sale Type','Yr Sold'],axis=1)

KeyError: "['PID' 'Order'] not found in axis"

In [54]:
house.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2927 entries, 0 to 2929
Data columns (total 58 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS SubClass         2927 non-null   int64  
 1   MS Zoning           2927 non-null   object 
 2   Lot Area            2927 non-null   int64  
 3   Street              2927 non-null   object 
 4   Lot Shape           2927 non-null   object 
 5   Land Contour        2927 non-null   object 
 6   Utilities           2927 non-null   object 
 7   Lot Config          2927 non-null   object 
 8   Land Slope          2927 non-null   object 
 9   Neighborhood        2927 non-null   object 
 10  Condition 1         2927 non-null   object 
 11  Condition 2         2927 non-null   object 
 12  Bldg Type           2927 non-null   object 
 13  House Style         2927 non-null   object 
 14  Overall Qual        2927 non-null   int64  
 15  Overall Cond        2927 non-null   int64  
 16  Roof S

In [63]:
def transform_features(df):
    num_missing=df.isnull().sum()
    drop_missing_cols=num_missing[(num_missing>len(df)/20)].sort_values()
    df=df.drop(drop_missing_cols.index,axis=1)
    
    text_missing_cnt=df.select_dtypes(include=['object']).isnull().sum().sort_values()
    drop_missing_cols_2=text_missing_cnt[(text_missing_cnt<0)]
    df=df.drop(drop_missing_cols_2.index, axis=1)
    
    num_missing=df.select_dtypes(include=['int','float']).isnull().sum()
    fixable_numeric_cols=num_missing[(num_missing<len(df)/20)&(num_missing>0)].sort_values()
    rpm_value_dict=df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    df=df.fillna(rpm_value_dict)
    
    year_sold=df['Yr Sold']-df['Year Built']
    year_remod=df['Yr Sold']-df['Year Remod/Add']
    df['Years Before Sale']=year_sold
    df['Years Before Remod']=year_remod
    df=df.drop([1702,2180,2181],axis=0)
    
    df=df.drop(['PID','Order','Mo Sold','Sale Condition','Sale Type','Yr Sold'],axis=1)
    return df
    

In [64]:
def select_features(df):
    return df[["Gr Liv Area", "SalePrice"]]

def train_and_test(df):
    train=df[:1460]
    test=df[1460:]
    numeric_train=train.select_dtypes(include=['int','float'])
    numeric_test=test.select_dtypes(include=['int','float'])
    
    features=numeric_train.columns.drop('SalePrice')
    lr=linear_model.LinearRegression()
    lr.fit(train[features],train['SalePrice'])
    predictions=lr.predict(test[features])
    mse=mean_squared_error(test['SalePrice'],predictions)
    rmse=np.sqrt(mse)
    return rmse

In [65]:
houses=pd.read_csv('AmesHousing.tsv',delimiter='\t')
transform_houses=transform_features(houses)
filtered_houses=select_features(transform_houses)
rmse=train_and_test(filtered_houses)

rmse

55275.367312413066

## Feature Selection

In [66]:
numerical_houses=transform_houses.select_dtypes(include=['int','float'])
numerical_houses.head(5)

Unnamed: 0,MS SubClass,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,SalePrice,Years Before Sale,Years Before Remod
0,20,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,1080.0,1656,0,0,1656,1.0,0.0,1,0,3,1,7,2,2.0,528.0,210,62,0,0,0,0,0,215000,50,50
1,20,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,882.0,896,0,0,896,0.0,0.0,1,0,2,1,5,0,1.0,730.0,140,0,0,0,120,0,0,105000,49,49
2,20,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1.0,312.0,393,36,0,0,0,0,12500,172000,52,52
3,20,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,2110.0,2110,0,0,2110,1.0,0.0,2,1,3,1,8,2,2.0,522.0,0,0,0,0,0,0,0,244000,42,42
4,60,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,2.0,482.0,212,34,0,0,0,0,0,189900,13,12


In [67]:
abs_corr_coeffs=numerical_houses.corr()['SalePrice'].abs().sort_values()
abs_corr_coeffs

BsmtFin SF 2          0.006127
Misc Val              0.019273
3Ssn Porch            0.032268
Bsmt Half Bath        0.035875
Low Qual Fin SF       0.037629
Pool Area             0.068438
MS SubClass           0.085128
Overall Cond          0.101540
Screen Porch          0.112280
Kitchen AbvGr         0.119760
Enclosed Porch        0.128685
Bedroom AbvGr         0.143916
Bsmt Unf SF           0.182751
Lot Area              0.267520
2nd Flr SF            0.269601
Bsmt Full Bath        0.276258
Half Bath             0.284871
Open Porch SF         0.316262
Wood Deck SF          0.328183
BsmtFin SF 1          0.439284
Fireplaces            0.474831
TotRms AbvGrd         0.498574
Mas Vnr Area          0.506983
Year Remod/Add        0.533007
Years Before Remod    0.534985
Full Bath             0.546118
Year Built            0.558490
Years Before Sale     0.558979
1st Flr SF            0.635185
Garage Area           0.641425
Total Bsmt SF         0.644012
Garage Cars           0.648361
Gr Liv A

In [68]:
abs_corr_coeffs[abs_corr_coeffs>0.4]

BsmtFin SF 1          0.439284
Fireplaces            0.474831
TotRms AbvGrd         0.498574
Mas Vnr Area          0.506983
Year Remod/Add        0.533007
Years Before Remod    0.534985
Full Bath             0.546118
Year Built            0.558490
Years Before Sale     0.558979
1st Flr SF            0.635185
Garage Area           0.641425
Total Bsmt SF         0.644012
Garage Cars           0.648361
Gr Liv Area           0.717596
Overall Qual          0.801206
SalePrice             1.000000
Name: SalePrice, dtype: float64

In [69]:
transform_houses=transform_houses.drop(abs_corr_coeffs[abs_corr_coeffs<0.4].index,axis=1)

Figure out which categorical columns should be kept

In [70]:
# Create a list of column names from documentation that are meant to be categorical
nominal_features = ['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 
                    'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 
                    'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 
                    'Misc Feature', 'Sale Type', 'Sale Condition']

In [72]:
tranform_cat_cols=[]
for col in nominal_features:
    if col in transform_houses.columns:
        tranform_cat_cols.append(col)
uniqueness_counts=transform_houses[tranform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
drop_nonuniq_cols=uniqueness_counts[uniqueness_counts>10].index
transform_houses=transform_houses.drop(drop_nonuniq_cols,axis=1)

In [74]:
# Select just the remaining text columns and convert to categorical
text_cols=transform_houses.select_dtypes(include=['object'])
for col in text_cols:
    transform_houses[col]=transform_houses[col].astype('category')
# Create dummy columns and add back to the df
transform_houses=pd.concat([
    transform_houses,
    pd.get_dummies(transform_houses.select_dtypes(include=['category']))],axis=1).drop(text_cols,axis=1)

In [75]:
# Update the feature functions
def transform_features(df):
    num_missing=df.isnull().sum()
    drop_missing_cols=num_missing[num_missing>len(df)/20].sort_values()
    df=df.drop(drop_missing_cols.index,axis=1)
    
    text_missing_cnt=df.select_dtypes(include=['object']).isnull().sum().sort_values(ascending=False)
    drop_missing_cols_2=text_missing_cnt[text_missing_cnt>0]
    df=df.drop(drop_missing_cols_2.index,axis=1)
    
    num_missing=df.select_dtypes(include=['int','float']).isnull().sum()
    fixable_numeric_cols=num_missing[(num_missing<len(df)/20)&(num_missing>0)].sort_values()
    rpm_value_dict=df[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    df=df.fillna(rpm_value_dict)
    
    year_sold=df['Yr Sold']-df['Year Built']
    year_remod=df['Yr Sold']-df['Year Remod/Add']
    df['Years Before Sale']=year_sold
    df['Years Since Remod']=year_remod
    df=df.drop([1702,2180,2181],axis=0)
    
    df=df.drop(['PID','Order','Mo Sold','Sale Condition','Sale Type','Yr Sold'],axis=1)
    return df


In [83]:
def select_features(df,coeff_threshold=0.4,uniq_threshold=10):
    numerical_df=df.select_dtypes(include=['int','float'])
    abs_corr_coeffs=numerical_df.corr()['SalePrice'].abs().sort_values()
    df=df.drop(abs_corr_coeffs[abs_corr_coeffs<coeff_threshold].index,axis=1)
    
    nominal_features=['PID', 'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 
                    'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 
                    'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 
                    'Misc Feature', 'Sale Type', 'Sale Condition']
    
    transform_cat_cols=[]
    for col in nominal_features:
        if col in df.columns:
            transform_cat_cols.append(col)
    uniqueness_counts=df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
    drop_nonuniq_cols=uniqueness_counts[uniqueness_counts>uniq_threshold].index
    df=df.drop(drop_nonuniq_cols,axis=1)
    
    text_cols=df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col]=df[col].astype('category')
    df=pd.concat([
        df,
        pd.get_dummies(df.select_dtypes(include=['category']))
    ],axis=1).drop(text_cols,axis=1)
    return df

In [87]:
def train_and_test(df,k=0):
    numerical_df=df.select_dtypes(include=['int','float'])
    features=numerical_df.columns.drop('SalePrice')
    lr=linear_model.LinearRegression()
    
    if k==0:
        train=df[:1460]
        test=df[1460:]
        
        lr.fit(train[features],train['SalePrice'])
        predictions=lr.predict(test[features])
        mse=mean_squared_error(test['SalePrice'],predictions)
        rmse=np.sqrt(mse)
        return rmse
    if k==1:
        shuffled_df=df.sample(frac=1)
        train=df[:1460]
        test=df[1460:]
        
        lr.fit(train[features],train['SalePrice'])
        predictions_one=lr.predict(test[features])
        
        mse_one=mean_squared_error(test['SalePrice'],predictions_one)
        rmse_one=np.sqrt(mse_one)
        
        lr.fit(test[features],test['SalePrice'])
        predictions_two=lr.predict(traina[features])
        mse_two=mean_squared_error(trani['SalePrice'],predictions_two)
        rmse_two=np.sqrt(mse_two)
        
        avg_rmse=np.mean([rmse_one,rmse_two])
        print(rmse_one)
        print(rmse_two)
        return avg_rmse
    else:
        kf=KFold(n_splits=k,shuffle=True)
        rmse_values=[]
        for train_index,test_index in kf.split(df):
            train=df.iloc[train_index]
            test=df.iloc[test_index]
            lr.fit(train[features],train['SalePrice'])
            predictions=lr.predict(test[features])
            mse=mean_squared_error(test['SalePrice'],predictions)
            rmse=np.sqrt(mse)
            rmse_values.append(rmse)
        print(rmse_values)
        avg_rmse=np.mean(rmse_values)
        return avg_rmse
    

In [91]:
house_data=pd.read_csv('AmesHousing.tsv',delimiter='\t')
transform_house_df=transform_features(house_data)
filtered_house_df=select_features(transform_house_df)
rmse=train_and_test(filtered_house_df,k=4)

rmse

[36256.0824073573, 28288.963648243323, 39284.21156690873, 29176.065327439468]


33251.3307374872