Pricing homes

In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn import model_selection, pipeline, compose, preprocessing, impute, linear_model, metrics

Set global variables

Define functions

Define data science problem

In [2]:
# Business problem: Predict the price of a home
# Data science problem: Given a set of features of a home, predict its price

Gather data

In [3]:
# Load the dataset from a public URL
url = "https://jse.amstat.org/v19n3/decock/AmesHousing.txt"
df = pd.read_csv(url, sep="\t")  # Tab-separated values
df.head()  # Display the first few rows of the dataset

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


Explore data

In [4]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


Process and transform data

In [5]:
# Separate the target variable from the feature set
target = "SalePrice"
X = df.drop(columns=[target])  # Features
y = df[target]  # Target variable

In [6]:
# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

In [7]:
# Define preprocessing steps for numeric features
numeric_transformer = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="median")),  # Fill missing values with median
    ("scaler", preprocessing.StandardScaler())  # Standardize numeric values
])

# Define preprocessing steps for categorical features
categorical_transformer = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="most_frequent")),  # Fill missing values with most frequent value
    ("onehot", preprocessing.OneHotEncoder(handle_unknown="ignore"))  # Convert categories to one-hot encoding
])

# Combine preprocessing steps into a single transformer
preprocessor = compose.ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

Model and evaluate data

In [8]:
# Create a pipeline that includes preprocessing and model training
model = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", linear_model.LinearRegression())
])

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Train the model on the training data
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
# Generate predictions for both training and testing sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate performance metrics for training and testing sets
train_rmse = metrics.mean_squared_error(y_train, y_train_pred)
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_rmse = metrics.mean_squared_error(y_test, y_test_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

# Display model performance results
print("=== Model Performance ===")
print(f"Train RMSE: {train_rmse:,.2f}")
print(f"Train R²:   {train_r2:.3f}")
print(f"Test RMSE:  {test_rmse:,.2f}")
print(f"Test R²:    {test_r2:.3f}")

=== Model Performance ===
Train RMSE: 359,961,068.03
Train R²:   0.939
Test RMSE:  878,258,783.28
Test R²:    0.890


Save model artifacts

In [12]:
# Save the trained model to a file
joblib.dump(model, "house_price_model.pkl")
print("Model saved to 'house_price_model.pkl'")

Model saved to 'house_price_model.pkl'


Prediction using trained model

In [13]:
# Load the saved model from disk
loaded_model = joblib.load("house_price_model.pkl")

# Example: create a small sample DataFrame with the same columns as X
# (Here we just take the first row from X_test for demonstration)
sample_data = X_test.iloc[[0]]
sample_data.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
1357,1358,903427090,70,RM,,5100,Pave,Grvl,Reg,Lvl,...,0,0,,MnPrv,,0,6,2008,WD,Normal


In [14]:
# Predict the sale price for the sample input
sample_pred = loaded_model.predict(sample_data)
print(f"Predicted SalePrice: {sample_pred[0]:,.2f}")

Predicted SalePrice: 155,881.51


Answer data science problem

In [15]:
# Given a set of features of a home, we can predict its price