# 📘 04_Model_Training_And_Registration

This notebook performs the following tasks:
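- Loads the engineered sensor features produced by the Delta Live Tables pipeline (table `arao.aerodemo.sensor_features`)
- Trains a Random Forest classifier to predict anomaly risk, using `anomaly_score > 0.5` as the positive label
- Registers the trained model with MLflow as `AircraftAnomalyPredictor` for downstream inference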


In [0]:
import pandas as pd
from pyspark.sql.functions import col
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn


In [0]:
# Load the engineered sensor features, derive the binary label, and build the
# train/test split used by the training cell.
FEATURES_TABLE = "arao.aerodemo.sensor_features"
ANOMALY_THRESHOLD = 0.5  # anomaly_score strictly above this is labelled 1
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# NOTE: toPandas() collects the whole table onto the driver.
features_raw = spark.read.table(FEATURES_TABLE).toPandas()

# Prepare data
feature_cols = ["engine_temp", "fuel_efficiency", "vibration", "altitude", "airspeed",
                "oil_pressure", "engine_rpm", "battery_voltage", "prev_anomaly"]

# A missing anomaly_score compares False against the threshold, so such rows
# would silently be labelled 0 ("normal"). Exclude them rather than train on
# labels that were never observed, and report how many were removed.
scored = features_raw.dropna(subset=["anomaly_score"])
n_unscored = len(features_raw) - len(scored)
if n_unscored:
    print(f"Excluded {n_unscored} rows with a missing anomaly_score")

# Build the labelled frame without mutating the raw extract, so re-running this
# cell always starts from the same input.
df = scored.assign(label=(scored["anomaly_score"] > ANOMALY_THRESHOLD).astype(int))
X = df[feature_cols]
y = df["label"]

# Split
# NOTE(review): if anomalies are rare, consider passing stratify=y so both
# splits keep the same class ratio — confirm the class balance first.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)


In [0]:
# Train the Random Forest, evaluate it on the hold-out split, and record the
# run (params, metrics, model) in MLflow, registering the model by name.

# Single source of truth for the hyperparameters, so the values logged to
# MLflow cannot drift from the ones actually used to build the estimator.
rf_params = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run(run_name="Aircraft_Anomaly_RF_Model"):
    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    print(classification_report(y_test, preds))

    # Log params and metrics before the model: if logging or registering the
    # model fails, the run still carries its configuration and evaluation.
    mlflow.log_params({"model_type": "RandomForest", **rf_params})

    report = classification_report(y_test, preds, output_dict=True)
    weighted = report["weighted avg"]
    mlflow.log_metrics({
        "test_accuracy": report["accuracy"],
        "test_precision_weighted": weighted["precision"],
        "test_recall_weighted": weighted["recall"],
        "test_f1_weighted": weighted["f1-score"],
    })

    # NOTE(review): consider also passing a model signature / input_example;
    # some registries require one — confirm against the target registry.
    mlflow.sklearn.log_model(model, "model", registered_model_name="AircraftAnomalyPredictor")
