# Predicting House Sale Price


## Introduction

We'll work with housing data for the city of Ames, Iowa, United States from 2006 to 2010. You can read more about why the data was collected [here](https://doi.org/10.1080/10691898.2011.11889627). You can also read about the different columns in the data [here](https://s3.amazonaws.com/dq-content/307/data_description.txt).

Let's start by setting up a pipeline of functions that will let us quickly iterate on different models.

<img src="pipeline.svg" width="200">


In [26]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)  

df = pd.read_csv('AmesHousing.tsv',delimiter="\t")
df.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,BrkFace,Plywood,Stone,112.0,TA,TA,CBlock,TA,Gd,Gd,BLQ,639.0,Unf,0.0,441.0,1080.0,GasA,Fa,Y,SBrkr,1656,0,0,1656,1.0,0.0,1,0,3,1,TA,7,Typ,2,Gd,Attchd,1960.0,Fin,2.0,528.0,TA,TA,P,210,62,0,0,0,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,7,5,1968,1968,Hip,CompShg,BrkFace,BrkFace,,0.0,Gd,TA,CBlock,TA,TA,No,ALQ,1065.0,Unf,0.0,1045.0,2110.0,GasA,Ex,Y,SBrkr,2110,0,0,2110,1.0,0.0,2,1,3,1,Ex,8,Typ,2,TA,Attchd,1968.0,Fin,2.0,522.0,TA,TA,Y,0,0,0,0,0,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [27]:
df.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [29]:
df.describe(include='all')

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
count,2930.0,2930.0,2930.0,2930,2440.0,2930.0,2930,198,2930,2930,2930,2930,2930,2930,2930,2930,2930,2930,2930.0,2930.0,2930.0,2930.0,2930,2930,2930,2930,2907.0,2907.0,2930,2930,2930,2850,2850,2847,2850,2929.0,2849,2929.0,2929.0,2929.0,2930,2930,2930,2929,2930.0,2930.0,2930.0,2930.0,2928.0,2928.0,2930.0,2930.0,2930.0,2930.0,2930,2930.0,2930,2930.0,1508,2773,2771.0,2771,2929.0,2929.0,2771,2771,2930,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,13,572,106,2930.0,2930.0,2930.0,2930,2930,2930.0
unique,,,,7,,,2,2,4,4,3,5,3,28,9,8,5,8,,,,,6,8,16,17,5.0,,4,5,6,5,5,4,6,,6,,,,6,5,2,5,,,,,,,,,,,5,,8,,5,6,,3,,,5,5,3,,,,,,,4,4,5,,,,10,6,
top,,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,,,,,Gable,CompShg,VinylSd,VinylSd,,,TA,TA,PConc,TA,TA,No,GLQ,,Unf,,,,GasA,Ex,Y,SBrkr,,,,,,,,,,,TA,,Typ,,Gd,Attchd,,Unf,,,TA,TA,Y,,,,,,,Ex,MnPrv,Shed,,,,WD,Normal,
freq,,,,2273,,,2918,120,1859,2633,2927,2140,2789,443,2522,2900,2425,1481,,,,,2321,2887,1026,1015,1752.0,,1799,2549,1310,1283,2616,1906,859,,2499,,,,2885,1495,2734,2682,,,,,,,,,,,1494,,2728,,744,1731,,1231,,,2615,2665,2652,,,,,,,4,330,95,,,,2536,2413,
mean,1465.5,714464500.0,57.387372,,69.22459,10147.921843,,,,,,,,,,,,,6.094881,5.56314,1971.356314,1984.266553,,,,,,101.896801,,,,,,,,442.629566,,49.722431,559.262547,1051.614544,,,,,1159.557679,335.455973,4.676792,1499.690444,0.431352,0.061134,1.566553,0.379522,2.854266,1.044369,,6.443003,,0.599317,,,1978.132443,,1.766815,472.819734,,,,93.751877,47.533447,23.011604,2.592491,16.002048,2.243345,,,,50.635154,6.216041,2007.790444,,,180796.060068
std,845.96247,188730800.0,42.638025,,23.365335,7880.017759,,,,,,,,,,,,,1.411026,1.111537,30.245361,20.860286,,,,,,179.112611,,,,,,,,455.590839,,169.168476,439.494153,440.615067,,,,,391.890885,428.395715,46.31051,505.508887,0.52482,0.245254,0.552941,0.502629,0.827731,0.214076,,1.572964,,0.647921,,,25.528411,,0.760566,215.046549,,,,126.361562,67.4834,64.139059,25.141331,56.08737,35.597181,,,,566.344288,2.714492,1.316613,,,79886.692357
min,1.0,526301100.0,20.0,,21.0,1300.0,,,,,,,,,,,,,1.0,1.0,1872.0,1950.0,,,,,,0.0,,,,,,,,0.0,,0.0,0.0,0.0,,,,,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,,2.0,,0.0,,,1895.0,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,1.0,2006.0,,,12789.0
25%,733.25,528477000.0,20.0,,58.0,7440.25,,,,,,,,,,,,,5.0,5.0,1954.0,1965.0,,,,,,0.0,,,,,,,,0.0,,0.0,219.0,793.0,,,,,876.25,0.0,0.0,1126.0,0.0,0.0,1.0,0.0,2.0,1.0,,5.0,,0.0,,,1960.0,,1.0,320.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,4.0,2007.0,,,129500.0
50%,1465.5,535453600.0,50.0,,68.0,9436.5,,,,,,,,,,,,,6.0,5.0,1973.0,1993.0,,,,,,0.0,,,,,,,,370.0,,0.0,466.0,990.0,,,,,1084.0,0.0,0.0,1442.0,0.0,0.0,2.0,0.0,3.0,1.0,,6.0,,1.0,,,1979.0,,2.0,480.0,,,,0.0,27.0,0.0,0.0,0.0,0.0,,,,0.0,6.0,2008.0,,,160000.0
75%,2197.75,907181100.0,70.0,,80.0,11555.25,,,,,,,,,,,,,7.0,6.0,2001.0,2004.0,,,,,,164.0,,,,,,,,734.0,,0.0,802.0,1302.0,,,,,1384.0,703.75,0.0,1742.75,1.0,0.0,2.0,1.0,3.0,1.0,,7.0,,1.0,,,2002.0,,2.0,576.0,,,,168.0,70.0,0.0,0.0,0.0,0.0,,,,0.0,8.0,2009.0,,,213500.0


In [30]:
def  transform_features(df):
    return df
    
def select_features(df):
    return   df[['Gr Liv Area','SalePrice']]

def train_and_test(df):
    
    data = select_features(df)
    data = data.select_dtypes(include=['float64','int64'])
    train = data[0:1460]
    test  = data[1460:] 

    features = data.columns.drop('SalePrice')

    lr = linear_model.LinearRegression()
    lr.fit(train[features],train['SalePrice'])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], predictions)
    rmse = mse ** (1/2)
    return rmse

transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse    

57088.25161263909

## Feature Engineering

1. remove features that we don't want to use in the model, just based on the number of missing values or data leakage

transform features into the proper format (numerical to categorical, scaling numerical, filling in missing values, etc)

create new features by combining other features


Handle missing values:
* All columns:
    * Drop any with 5% or more missing values for now.
* Text columns:
    * Drop any with 1 or more missing values for now.
* Numerical columns:
    * For columns with missing values, fill in with the mode common value in that column

Drop columns that:
* that aren't useful for ML
* leak data about the final sale, read more about columns here    

In [31]:
# 1: All columns: Drop any with 5% or more missing values for now.

null_counts = df.isnull().sum()
length = len(df) * 0.05
print(df.shape)
df = df[ null_counts[null_counts < length].index]
print(df.shape)

(2930, 82)
(2930, 71)


In [32]:
# 2.Text columns: Drop any with 1 or more missing values for now.
text_null_counts =df.select_dtypes(include=['object']).isnull().sum()
text_null_cols = text_null_counts[text_null_counts > 0 ].index
df = df.drop(columns=text_null_cols)
print(df.shape)

(2930, 64)


In [33]:
# 3.Numerical columns: For columns with missing values, fill in with the most common value in that column
    
num_null_counts =df.select_dtypes(include=['float64',"int64"]).isnull().sum()
num_null_cols = num_null_counts[num_null_counts > 0].index.tolist()
print(num_null_cols)

for col in num_null_cols:
    col_mode = df[col].mode()
    df[col] = df[col].fillna(col_mode[0])

['Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Cars', 'Garage Area']


In [34]:
# 4. Create new columns
df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']

## Drop rows with negative values for both of these new features
df = df [ (df['Years Before Sale']>0)  & (df['Years Since Remod']>0)]

## No longer need original year columns
df = df.drop(["Year Built", "Year Remod/Add"], axis = 1)

In [35]:
## Drop columns that aren't useful for ML
df = df.drop(["PID", "Order"], axis=1)

## Drop columns that leak info about the final sale
df = df.drop(["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"], axis=1)

Let's update transform_features() and read in data again

In [36]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")

def  transform_features(df):
    # 1: All columns: Drop any with 5% or more missing values for now.
    null_counts = df.isnull().sum()
    length = len(df) * 0.05
    df = df[ null_counts[null_counts < length].index]

    # 2.Text columns: Drop any with 1 or more missing values for now.
    text_null_counts =df.select_dtypes(include=['object']).isnull().sum()
    text_null_cols = text_null_counts[text_null_counts > 0 ].index
    df = df.drop(columns=text_null_cols)    
    
    # 3.Numerical columns: For columns with missing values, fill in with the most common value in that column
    num_null_counts =df.select_dtypes(include=['float64',"int64"]).isnull().sum()
    num_null_cols = num_null_counts[num_null_counts > 0].index.tolist()
    for col in num_null_cols:
        col_mode = df[col].mode()
        df[col] = df[col].fillna(col_mode[0])    

    # 4. Create new columns
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']
    df = df [(df['Years Before Sale']>0)  & (df['Years Since Remod']>0)]
    df = df.drop(["Year Built", "Year Remod/Add"], axis = 1)        

    # 5. Drop columns that aren't useful or leak info about the final sale
    df = df.drop(["PID", "Order","Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"], axis=1)
    
    return df

def select_features(df):
    return   df[['Gr Liv Area','SalePrice']]

def train_and_test(df):
    
    data = select_features(df)
    data = data.select_dtypes(include=['float64','int64'])
    train = data[:1460]
    test  = data[1460:] 

    features = data.columns.drop('SalePrice')

    lr = linear_model.LinearRegression()
    lr.fit(train[features],train['SalePrice'])
    predictions = lr.predict(test[features])
    mse = mean_squared_error(test['SalePrice'], predictions)
    rmse = mse ** (1/2)
    return rmse

transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse    

47125.95445367698

## Feature Selection


In [37]:
num_df =transform_df.select_dtypes(include=['float64','int64'])
corrmat =transform_df.corr()['SalePrice']
corrmat[corrmat >0.4]

Overall Qual     0.785137
Mas Vnr Area     0.469301
BsmtFin SF 1     0.450672
Total Bsmt SF    0.632745
1st Flr SF       0.623102
Gr Liv Area      0.724094
Full Bath        0.541690
TotRms AbvGrd    0.476159
Fireplaces       0.488419
Garage Cars      0.622464
Garage Area      0.608802
SalePrice        1.000000
Name: SalePrice, dtype: float64

In [38]:
## Drop columns with less than 0.4 correlation with SalePrice
transform_df = transform_df.drop(corrmat[corrmat < 0.4].index, axis=1)
transform_df.columns

Index(['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Area',
       'Exter Qual', 'Exter Cond', 'Foundation', 'BsmtFin SF 1',
       'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', '1st Flr SF',
       'Gr Liv Area', 'Full Bath', 'Kitchen Qual', 'TotRms AbvGrd',
       'Functional', 'Fireplaces', 'Garage Cars', 'Garage Area', 'Paved Drive',
       'SalePrice'],
      dtype='object')

* Which columns in the data frame should be converted to the categorical data type? 
  * We will drop categorical column has more than 10 of unique values
  * Which categorical columns have a few unique values but more than 95% of the values in the column belong to a specific category? This would be similar to a low variance numerical feature (no variability in the data for the model to capture).
* Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?

In [39]:
## Create a list of column names from documentation that are *meant* to be categorical
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

In [40]:
###  Find and drop categorical column has more than 10 of unique values
transform_cat_cols =[]
for col in nominal_features:
    if col in transform_df.columns:
        transform_cat_cols.append(col)
uniqueness_counts = transform_df[transform_cat_cols].apply(lambda x : len(x.value_counts()))

## Aribtrary cutoff of 10 unique values (worth experimenting)
drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index
transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)


In [41]:
## Select just the remaining text columns and convert to categorical
text_cols = transform_df.select_dtypes(include=['object'])

for col in text_cols: 
    transform_df[col] = transform_df[col].astype('category')

## Create dummy columns and add back to the dataframe!
# transform_df = pd.concat([
#     transform_df, 
#     pd.get_dummies(transform_df.select_dtypes(include=['category']))
# ], axis=1).drop(text_cols,axis=1)    
    
## Create dummy columns and add back to the dataframe!
transform_df = pd.concat([
    transform_df,
    pd.get_dummies(transform_df.select_dtypes(include=['category']))
], axis =1).drop(text_cols,axis=1)


* Update select_features() using the above code

* Update train_and_test(), implement k-fold cross validation using k folds:



In [42]:
def  transform_features(df):  # feature engineering
    # 1: All columns: Drop any with 5% or more missing values for now.
    null_counts = df.isnull().sum()
    length = len(df) * 0.05
    df = df[ null_counts[null_counts < length].index]

    # 2.Text columns: Drop any with 1 or more missing values for now.
    text_null_counts =df.select_dtypes(include=['object']).isnull().sum()
    text_null_cols = text_null_counts[text_null_counts > 0 ].index
    df = df.drop(columns=text_null_cols)    
    
    # 3.Numerical columns: For columns with missing values, fill in with the most common value in that column
    num_null_counts =df.select_dtypes(include=['float64',"int64"]).isnull().sum()
    num_null_cols = num_null_counts[num_null_counts > 0].index.tolist()
    for col in num_null_cols:
        col_mode = df[col].mode()
        df[col] = df[col].fillna(col_mode[0])    

    # 4. Create new columns
    df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']
    df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']
    df = df [(df['Years Before Sale']>0)  & (df['Years Since Remod']>0)]
    df = df.drop(["Year Built", "Year Remod/Add"], axis = 1)        

    # 5. Drop columns that aren't useful or leak info about the final sale
    df = df.drop(["PID", "Order","Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"], axis=1)
    
    return df

def select_features(df, coeff_threshold=0.4, uniq_threshold=10): # feature selection
    num_df = df.select_dtypes(include=['float64','int64'])
    corrmat =num_df.corr()['SalePrice']
    
    ## Drop columns with less than 'coeff_threshold' correlation with SalePrice
    df = df.drop(corrmat[corrmat < coeff_threshold].index, axis=1)
    
    ## Create a list of column names from documentation that are *meant* to be categorical
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                        "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                        "Misc Feature", "Sale Type", "Sale Condition"]
    
    ###  Find and drop categorical column has more than 'uniq_threshold' of unique values
    transform_cat_cols =[]
    for col in nominal_features:
        if col in df.columns:
            transform_cat_cols.append(col)
    uniqueness_counts = df[transform_cat_cols].apply(lambda x : len(x.value_counts()))
    
    ## Aribtrary cutoff of 10 unique values (worth experimenting)
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > uniq_threshold].index
    df = df.drop(drop_nonuniq_cols, axis=1)
    
    ## Select just the remaining text columns and convert to categorical
    text_cols = df.select_dtypes(include=['object'])
    
    for col in text_cols: 
        df[col] = df[col].astype('category')

    ## Create dummy columns and add back to the dataframe!
    df = pd.concat([
        df,
        pd.get_dummies(df.select_dtypes(include=['category']))
    ], axis =1).drop(text_cols,axis=1)
    
    return df

def train_and_test(df, k=0):
    
    data = df.select_dtypes(include=['float64','int64'])
    features = data.columns.drop('SalePrice')    
    lr = linear_model.LinearRegression()   
    
    if k==0:
        train = data[:1460]
        test  = data[1460:] 
        lr.fit(train[features],train['SalePrice'])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], predictions)
        rmse = mse ** (1/2)
        return rmse
    if  k==1:
        
        datta = data.sample(frac=1)
        train = data[:1460]
        test  = data[1460:] 
        
        lr.fit(train[features],train['SalePrice'])
        predictions = lr.predict(test[features])
        mse = mean_squared_error(test['SalePrice'], predictions)
        rmse1 = mse ** (1/2)

        lr.fit(test[features],test['SalePrice'])
        predictions = lr.predict(train[features])
        mse = mean_squared_error(train['SalePrice'], predictions)
        rmse2 = mse ** (1/2)
        print (rmse1)
        print (rmse2)
        return (rmse1+ rmse2)/2   
    else:
        
        kf = KFold(k, shuffle=True, random_state=1)
        rmse_values = []
        
        for train_index, test_index in kf.split(df):
            train = data.iloc[train_index]
            test  = data.iloc[test_index] 

            lr.fit(train[features],train['SalePrice'])
            predictions = lr.predict(test[features])
            mse = mean_squared_error(test['SalePrice'], predictions)
            rmse_values.append (mse ** (1/2))
        print(rmse_values)
        avg_rmse = np.mean(rmse_values)
        return avg_rmse
       
    
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df,5)

rmse 

[28559.253979995625, 27617.41549010686, 34087.46202251305, 26972.30664851595, 30659.84840825993]


29579.25730987828