# 03 – Train Model
This notebook loads prepared data, trains a model, and logs it to MLflow.

In [None]:
%pip install -e ../src
# Editable (-e) install of the local package located in ../src.
# NOTE(review): presumably this is what makes the `project.*` imports in the
# next cell resolvable — confirm against the package layout.

In [None]:
from pyspark.sql.functions import col
from project.common import get_spark_session, get_logger
from project.train_model import train
import mlflow
import pandas as pd

# Project logger used for the progress messages emitted by the cells below.
# NOTE(review): "train_model" is presumably the logger name — confirm in project.common.
logger = get_logger("train_model")
# Spark session handle; the load cell below uses it to read the prepared Delta table.
spark = get_spark_session()

In [None]:

# Read the prepared dataset (stored in Delta format) into a Spark DataFrame.
input_path = "dbfs:/tmp/nyc_taxi/prepared_data"
logger.info(f"Loading data from {input_path}")

# Configure the reader for Delta first, then point it at the table location.
delta_reader = spark.read.format("delta")
df = delta_reader.load(input_path)

In [None]:

# Convert to Pandas for sklearn training (use Spark MLlib for large data).
#
# The outlier filter is applied in Spark, *before* toPandas(), so only the rows
# that will actually be trained on are collected onto the driver. Previously
# every non-null row was collected and then filtered in pandas.
df_pd = (
    df.select("trip_distance", "passenger_count", "tip_pct")
    .dropna()  # drop rows with a missing value in any of the three columns
    .filter(col("tip_pct").between(0, 5))  # keep 0 <= tip_pct <= 5 (inclusive); removes extreme outliers
    .toPandas()
    # The label column is passed to train() as "target"; renaming in one step
    # replaces the earlier copy-column-then-drop pair and keeps the column order.
    .rename(columns={"tip_pct": "target"})
)

In [None]:
# Train model with MLflow tracking
# NOTE(review): the MLflow run/logging is assumed to happen inside
# project.train_model.train — no cell in this notebook calls mlflow directly.
logger.info("Starting training run")
# df_pd carries the feature columns (trip_distance, passenger_count) plus the
# "target" label column prepared in the previous cell.
model = train(df_pd, target_column="target")

# ✅ Training complete. View the results in the MLflow experiment UI.
You can now register this model or trigger the evaluation notebook.