# Train pipeline

This notebook takes the train and test data and does the following:

- Processes the data (cleaning, feature engineering, scaling etc.) to prepare it for model training
- Trains a simple model
- Saves any artifacts that will be used during inference to disk

> This process will likely be captured in a Pipeline asset on Highwind

In [1]:
# Config: every value a reader might want to tune lives in this cell.

# Seed shared by anything stochastic so runs are reproducible
RANDOM_SEED = 42

# Input data and the column to predict
TRAIN_DATA_PATH = "../data/train.csv"
TEST_DATA_PATH = "../data/test.csv"
TARGET_COLUMN = "MedHouseVal"

# Where the fitted scaler and model are written for use at inference time
ARTIFACT_SAVE_DIR = "../saved_model/"

# Keyword arguments forwarded verbatim to the Lasso constructor
MODEL_ARGS = dict(
    alpha=0.01,
    fit_intercept=True,
    random_state=RANDOM_SEED,
)

# Hugging Face Hub settings
PUSH_TO_HF = True  # Whether to push to Hugging Face Hub or not
HF_REPO_NAME = "MelioAI/california-housing"  # For pushing model to Hugging Face Hub

In [2]:
import os
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import pandas as pd
import joblib
from skops import card, hub_utils
from tempfile import mkdtemp, mkstemp
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
# Read the training split from disk and preview its first three rows
train_df = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
train_df.head(n=3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03,1.03
1,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16,3.821
2,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48,1.726


In [4]:
# Split the training frame into the target series and the remaining feature
# columns; train_df itself is left untouched.
y_train = train_df[TARGET_COLUMN].copy()
X_train = train_df.drop(columns=[TARGET_COLUMN])

In [5]:
# Sanity check: feature matrix and target must have the same number of rows
for label, data in (("X_train", X_train), ("y_train", y_train)):
    print(f"{label}: {data.shape}")

X_train: (16512, 8)
y_train: (16512,)


## Process data

### Optional steps

Insert any optional data processing steps here

In [6]:
# (Optional) Add data cleaning here

In [7]:
# (Optional) Add feature engineering here

### Feature scaling

In [8]:
# Learn the scaling statistics from the training features only, then apply
# them to produce the scaled training matrix.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [9]:
# Save the scaler for later use.
# Create the artifact directory first so that a fresh checkout (where the
# directory does not exist yet) does not fail inside joblib.dump.
os.makedirs(ARTIFACT_SAVE_DIR, exist_ok=True)
save_scaler_path = os.path.join(ARTIFACT_SAVE_DIR, "scaler.joblib")
joblib.dump(scaler, save_scaler_path)

['../saved_model/scaler.joblib']

## Train model

In [10]:
# Define model: a Lasso regressor configured entirely from MODEL_ARGS
# (set in the config cell at the top of the notebook)
model = Lasso(**MODEL_ARGS)

In [11]:
# Train model on the scaled training features against the target column
model.fit(X_train_scaled, y_train)

In [12]:
# Check learned model weights (one coefficient per feature column of X_train,
# in column order). A coefficient of zero means the feature does not
# contribute to the prediction.
model.coef_

array([ 0.80095744,  0.12708701, -0.16275931,  0.20620745, -0.        ,
       -0.03060176, -0.79011254, -0.75567379])

In [13]:
# Save the model for later use.
# Ensure the artifact directory exists so this cell does not depend on an
# earlier cell (or a manual step) having created it.
os.makedirs(ARTIFACT_SAVE_DIR, exist_ok=True)
save_model_path = os.path.join(ARTIFACT_SAVE_DIR, "model.joblib")
joblib.dump(model, save_model_path)

['../saved_model/model.joblib']

## (Optional) Save to Hugging Face Hub

Save trained model files to the Hugging Face Hub so that they can be downloaded later. In this step, we use the useful helper functions provided by the `skops` package.

If `PUSH_TO_HF` is enabled (see top of this notebook), this section will execute. Remember to log in to Hugging Face with the CLI first by running `huggingface-cli login`; otherwise this section won't work.

In [14]:
# Local working directory that skops populates with the files to upload.
# NOTE(review): this is a fixed path, not a temp dir (the alternative would be
# mkdtemp(prefix="skops-")). Presumably hub_utils.init rejects a destination
# that already contains files, so clear it before re-running - TODO confirm.
local_repo = Path("../hf-repo")

if PUSH_TO_HF:

    # Initialise HF repo
    hub_utils.init(
        model=Path(save_model_path),
        requirements=[
            f"scikit-learn=={sklearn.__version__}",
            f"joblib=={joblib.__version__}"
        ],
        dst=local_repo,
        # The model is a Lasso regressor, so the task must be regression
        # (this also matches the "tabular-regression" tag on the model card).
        task="tabular-regression",
        data=X_train.head(),
        model_format="pickle"
    )

    # Add feature scaler to repo so inference can reuse the training scaling
    hub_utils.add_files(save_scaler_path, dst=local_repo)

In [15]:
if PUSH_TO_HF:

    # Create and populate basic model card. The metadata is derived from the
    # config.json in the local repo and passed to the card so that it ends up
    # in the README front matter instead of being discarded.
    metadata = card.metadata_from_config(local_repo / "config.json")
    model_card = card.Card(model=model, metadata=metadata)

    # Add model card detail
    limitations = (
        "This model is made for the purposes of showing how to use Highwind only."
    )
    model_description = (
        "This is a linear regression model trained on California housing dataset. This model could be"
        " used to predict median price of a house in California, given certain features. This model is very basic and"
        " should only be used as an example of how to use Highwind."
    )
    model_card_authors = "MelioAI, ruanmelio"

    # hf_hub_download stores the file in the local HF cache and returns its
    # path, so the snippet must load from that returned path rather than from
    # a bare filename in the working directory. The repo id comes from
    # HF_REPO_NAME so the snippet always matches the repo being pushed to.
    usage_code = f"""
```python
import joblib
from huggingface_hub import hf_hub_download

# Feature scaler
scaler_path = hf_hub_download("{HF_REPO_NAME}", "scaler.joblib")
scaler = joblib.load(scaler_path)

# Regression model
model_path = hf_hub_download("{HF_REPO_NAME}", "model.joblib")
model = joblib.load(model_path)
```
"""
    model_card.add(
        folded=False,
        **{
            "Model Card Authors": model_card_authors,
            "Intended uses & limitations": limitations,
            "Model description": model_description,
            "Model description/Intended uses & limitations": limitations,
            "How to Get Started with the Model": usage_code
        },
    )

    # Add tags (set explicitly so the card is labelled as a regression model)
    model_card.metadata.library_name = "sklearn"
    model_card.metadata.tags = ["sklearn", "tabular-regression"]

    # Save model card as the repo README
    model_card.save(local_repo / "README.md")

In [16]:
# Requires an authenticated Hugging Face session: run `huggingface-cli login`
# before executing this cell.
if PUSH_TO_HF:
    # Upload the contents of the local repo directory to the Hub repo
    hub_utils.push(repo_id=HF_REPO_NAME, source=local_repo)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]
[A
model.joblib: 100%|██████████| 695/695 [00:00<00:00, 1.30kB/s]
scaler.joblib: 100%|██████████| 1.22k/1.22k [00:00<00:00, 2.10kB/s]
Upload 2 LFS files: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s]


## (Optional) model evaluation

Using test set

In [17]:
# Drop the in-memory model and scaler so the evaluation below can only
# succeed by reloading both artifacts from disk.
del model, scaler

In [18]:
# Read the held-out test split and report its size
test_df = pd.read_csv(filepath_or_buffer=TEST_DATA_PATH)
print(f"test_df: {test_df.shape}")

# Split into the target series and the remaining feature columns;
# test_df itself is left untouched.
y_test = test_df[TARGET_COLUMN].copy()
X_test = test_df.drop(columns=[TARGET_COLUMN])

test_df: (4128, 9)


In [19]:
# Reload both saved artifacts from disk
scaler = joblib.load(Path(ARTIFACT_SAVE_DIR) / "scaler.joblib")
model = joblib.load(Path(ARTIFACT_SAVE_DIR) / "model.joblib")

# Apply the same preprocessing as in training: scale the test features with
# the already-fitted scaler (transform only, no refit).
X_test_scaled = scaler.transform(X_test)

In [20]:
# Make predictions on test set using the model reloaded from disk and the
# scaled test features
y_pred = model.predict(X_test_scaled)

In [21]:
# Model evaluation: error on the test set, followed by the target's mean and
# standard deviation as a yardstick for judging the error magnitude.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

for metric_name, metric_value in (("MSE", mse), ("RMSE", rmse)):
    print(f"{metric_name}: {round(metric_value, 3)}")

print("-" * 5)

for stat_name, stat_value in (("mean", y_test.mean()), ("std", y_test.std())):
    print(f"{stat_name}({TARGET_COLUMN}): {round(stat_value, 3)}")

MSE: 0.548
RMSE: 0.74
-----
mean(MedHouseVal): 2.055
std(MedHouseVal): 1.145
