# Ames housing price prediction

## Initialization

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Step 1: Import MLflow
# ...

## Configuration

In [2]:
# Location of the Ames housing CSV that this notebook reads.
DATA_SET_URL = (
    "https://raw.githubusercontent.com/janwillemkl/mlops-hands-on"
    "/main/data/ames_housing.csv"
)

# Model input columns, grouped by the kind of preprocessing each group gets.
FEATURES = dict(
    nominal=["ms_zoning", "lot_shape", "land_contour"],
    ordinal=["land_slope", "overall_qual", "overall_cond"],
    numerical=["lot_frontage", "lot_area", "mas_vnr_area"],
)

# Column the model is trained to predict.
TARGET = "sale_price"

# Single seed reused wherever the notebook needs a random_state.
RANDOM_STATE = 42

# Step 2: Define MLflow tracking server and experiment
# ...

## Data set (ingestion & preprocessing)

In [3]:
# Read the CSV at DATA_SET_URL into a DataFrame and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the CSV — confirm whether index_col=0 is wanted.
raw_data = pd.read_csv(filepath_or_buffer=DATA_SET_URL)
raw_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,order,pid,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,sale_price
0,0,1,526301100,20,RL,141.0,31770,Pave,,IR1,...,0,,,,0,5,2010,WD,Normal,215000
1,1,2,526350040,20,RH,80.0,11622,Pave,,Reg,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,2,3,526351010,20,RL,81.0,14267,Pave,,IR1,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,3,4,526353030,20,RL,93.0,11160,Pave,,Reg,...,0,,,,0,4,2010,WD,Normal,244000
4,4,5,527105010,60,RL,74.0,13830,Pave,,IR1,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
# Columns kept for modelling: every feature group, followed by the target.
columns = [
    *FEATURES["nominal"],
    *FEATURES["ordinal"],
    *FEATURES["numerical"],
    TARGET,
]

# Despite its name, `features` still carries the TARGET column; the target is
# separated from the inputs only after the train/test split.
features = raw_data[columns]
features.head()

Unnamed: 0,ms_zoning,lot_shape,land_contour,land_slope,overall_qual,overall_cond,lot_frontage,lot_area,mas_vnr_area,sale_price
0,RL,IR1,Lvl,Gtl,6,5,141.0,31770,112.0,215000
1,RH,Reg,Lvl,Gtl,5,6,80.0,11622,0.0,105000
2,RL,IR1,Lvl,Gtl,6,6,81.0,14267,108.0,172000
3,RL,Reg,Lvl,Gtl,7,5,93.0,11160,0.0,244000
4,RL,IR1,Lvl,Gtl,5,5,74.0,13830,0.0,189900


## Model training

In [5]:
# Hold out a test set. test_size=0.25 is scikit-learn's default when neither
# test_size nor train_size is given; it is spelled out here for readability.
train_data, test_data = train_test_split(
    features, test_size=0.25, random_state=RANDOM_STATE
)

# Separate the model inputs from the target column for each split.
train_input, train_output = train_data.drop(columns=[TARGET]), train_data[TARGET]
test_input, test_output = test_data.drop(columns=[TARGET]), test_data[TARGET]

In [6]:
# Ordinal pipeline: fill gaps with the most frequent value, then map every
# category to an integer code.
# Categories are learned from the training data in sorted order; pass
# `categories=` explicitly if a column's domain order differs from that.
# Values first seen at score()/predict() time are encoded as -1 instead of
# raising ValueError (OrdinalEncoder's default is handle_unknown="error"),
# mirroring the handle_unknown="ignore" used for the nominal features below.
ordinal_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "encoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

# Nominal pipeline: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit become an all-zero row.
nominal_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical pipeline: fill gaps with the column mean, then standardize.
# The step is named "encoder" although it is a scaler; the name is kept so
# that parameter paths such as `...__encoder__with_mean` stay stable.
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("encoder", StandardScaler()),
    ]
)

# Preprocessing: route each feature group to its own sub-pipeline. Columns not
# listed in FEATURES are dropped (ColumnTransformer's default remainder).
preprocessing_pipeline = ColumnTransformer(
    [
        ("ordinal_preprocessor", ordinal_pipeline, FEATURES["ordinal"]),
        ("nominal_preprocessor", nominal_pipeline, FEATURES["nominal"]),
        ("numerical_preprocessor", numerical_pipeline, FEATURES["numerical"]),
    ]
)

# Estimator: preprocessing followed by a seeded gradient-boosting regressor,
# so that fit/score/predict accept the raw feature columns directly.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessing_pipeline),
        ("estimator", GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ]
)

In [3]:
# Step 3: MLflow configuration
# ...

In [4]:
# Step 4: Enable autologging
# ...

In [12]:
# Step 5: Wrap fit() and score()

# Train the whole pipeline (preprocessing + estimator) on the training split.
pipeline.fit(X=train_input, y=train_output)

# Cell output: the regressor's default score (R^2) on the held-out test split.
pipeline.score(X=test_input, y=test_output)



## Example prediction

In [None]:
# One hand-written house, built column by column as a single-row frame.
# The values match the first row of the feature preview shown earlier.
example = pd.DataFrame(
    {
        "ms_zoning": ["RL"],
        "lot_shape": ["IR1"],
        "land_contour": ["Lvl"],
        "land_slope": ["Gtl"],
        "overall_qual": [6],
        "overall_cond": [5],
        "lot_frontage": [141.0],
        "lot_area": [31770],
        "mas_vnr_area": [112.0],
    }
)

# Cell output: the predicted sale price for that single row.
pipeline.predict(example)[0]