In [87]:
# Import dependencies
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import requests
from prophet import Prophet
import seaborn as sns


### Data Preprocessing


In [88]:
# Load the Ames housing CSV from the local Resources folder into a DataFrame
ames_df = pd.read_csv(filepath_or_buffer='Resources/ames_housing.csv')

In [89]:
# Preview the first five rows of the freshly loaded frame
ames_df.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [90]:
# Count the missing entries in every column
missing_values = ames_df.isna().sum()

# Display the per-column counts
missing_values

Order               0
PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 82, dtype: int64

In [91]:
# Express the per-column missing counts as a percentage of the total row count
percent_missing = missing_values.div(ames_df.shape[0]).mul(100)

# Display the percentages
percent_missing

Order              0.000000
PID                0.000000
MS SubClass        0.000000
MS Zoning          0.000000
Lot Frontage      16.723549
                    ...    
Mo Sold            0.000000
Yr Sold            0.000000
Sale Type          0.000000
Sale Condition     0.000000
SalePrice          0.000000
Length: 82, dtype: float64

In [92]:
# Impute missing values for numerical features using the column mean
numerical_columns = ames_df.select_dtypes(include=['int64', 'float64']).columns

# One mean per numerical column, indexed by column name.
numerical_means = ames_df[numerical_columns].mean()

# Fill through the DataFrame and rebind the result. The previous form,
# `ames_df[column].fillna(..., inplace=True)`, is chained in-place assignment:
# it emits a FutureWarning on pandas 2.x and silently leaves `ames_df`
# unchanged once Copy-on-Write is enabled. Passing a Series fills each column
# with the value stored under its own label; columns not listed are untouched.
ames_df = ames_df.fillna(numerical_means)

In [93]:
# Impute missing values for categorical features using the mode
categorical_columns = ames_df.select_dtypes(include=['object']).columns

# NOTE(review): the head() preview shows NaN in columns such as Alley, Pool QC
# and Fence; in this dataset NA there presumably means "feature not present"
# rather than "unknown" -- confirm that mode imputation is what is wanted.

# mode() can return several values on a tie; [0] keeps the first one.
categorical_modes = {column: ames_df[column].mode()[0] for column in categorical_columns}

# Fill through the DataFrame and rebind the result. The previous form,
# `ames_df[column].fillna(..., inplace=True)`, is chained in-place assignment:
# it emits a FutureWarning on pandas 2.x and silently leaves `ames_df`
# unchanged once Copy-on-Write is enabled. The dict maps column -> fill value.
ames_df = ames_df.fillna(categorical_modes)

In [94]:
# Verify that there are no missing values
missing_values = ames_df.isnull().sum()

# The displayed Series is truncated for wide frames, so eyeballing it cannot
# confirm every column; check all of them programmatically and fail loudly.
assert not missing_values.any(), (
    f"Columns still contain missing values: {missing_values[missing_values > 0].to_dict()}"
)

missing_values

Order             0
PID               0
MS SubClass       0
MS Zoning         0
Lot Frontage      0
                 ..
Mo Sold           0
Yr Sold           0
Sale Type         0
Sale Condition    0
SalePrice         0
Length: 82, dtype: int64

In [100]:
# Preview the first five rows of the updated dataframe.
# The one-hot columns in the displayed output (e.g. `Sale Type_New`) come from
# the encoding cells further down, so those must have been run beforehand.
ames_df.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Street,Alley,Overall Qual,Overall Cond,Year Built,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,1,526301100,20,141.0,31770,1,0,6,5,1960,...,False,False,False,True,False,False,False,False,True,False
1,2,526350040,20,80.0,11622,1,0,5,6,1961,...,False,False,False,True,False,False,False,False,True,False
2,3,526351010,20,81.0,14267,1,0,6,6,1958,...,False,False,False,True,False,False,False,False,True,False
3,4,526353030,20,93.0,11160,1,0,7,5,1968,...,False,False,False,True,False,False,False,False,True,False
4,5,527105010,60,74.0,13830,1,0,5,5,1997,...,False,False,False,True,False,False,False,False,True,False


In [101]:
# Preview the updated dataframe's last five rows
ames_df.tail(n=5)

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Street,Alley,Overall Qual,Overall Cond,Year Built,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
2925,2926,923275080,80,37.0,7937,1,0,6,6,1984,...,False,False,False,True,False,False,False,False,True,False
2926,2927,923276100,20,69.22459,8885,1,0,5,5,1983,...,False,False,False,True,False,False,False,False,True,False
2927,2928,923400125,85,62.0,10441,1,0,5,5,1992,...,False,False,False,True,False,False,False,False,True,False
2928,2929,924100070,20,77.0,10010,1,0,5,5,1974,...,False,False,False,True,False,False,False,False,True,False
2929,2930,924151050,60,74.0,9627,1,0,7,5,1993,...,False,False,False,True,False,False,False,False,True,False


In [96]:
# Label-encode the categorical features that take exactly two distinct values.
# Only columns from `categorical_columns` that are still present in the frame
# are considered. Scanning every column of `ames_df` would also re-label
# two-valued numeric columns and, if this cell is re-run after one-hot
# encoding, every bool dummy column.
binary_columns = [
    column
    for column in categorical_columns
    if column in ames_df.columns and ames_df[column].nunique() == 2
]

# Keep one fitted encoder per column so the integer codes can be mapped back
# to the original labels later (via `classes_` / `inverse_transform`).
label_encoders = {}

for column in binary_columns:
    label_encoders[column] = LabelEncoder()
    ames_df[column] = label_encoders[column].fit_transform(ames_df[column])

In [97]:
# Expand every categorical feature that was not label-encoded as binary
# into one indicator column per category
ames_df = pd.get_dummies(
    ames_df,
    columns=[name for name in categorical_columns if name not in binary_columns],
)

In [98]:
# Inspect the dtype of every column in the dataframe
ames_df.dtypes

Order                       int64
PID                         int64
MS SubClass                 int64
Lot Frontage              float64
Lot Area                    int64
                           ...   
Sale Condition_AdjLand       bool
Sale Condition_Alloca        bool
Sale Condition_Family        bool
Sale Condition_Normal        bool
Sale Condition_Partial       bool
Length: 303, dtype: object