# 📘 04_Model_Training_And_Registration

This notebook performs the following tasks:
- Loads the engineered features (`arao.aerodemo.sensor_features`) produced by the Delta Live Tables pipeline
- Trains a simple classification model to predict anomaly risk
- Registers the trained model with MLflow for downstream inference


In [0]:
import pandas as pd
from pyspark.sql.functions import col
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn


In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Local import: this cell's import header does not bring in Pipeline.
from sklearn.pipeline import Pipeline

# --- Load and prepare data --------------------------------------------------
# Numeric sensor columns used as model inputs.
FEATURE_COLUMNS = [
    "engine_temp", "fuel_efficiency", "vibration", "altitude", "airspeed",
    "oil_pressure", "engine_rpm", "battery_voltage", "prev_anomaly",
]

feature_df = spark.read.table("arao.aerodemo.sensor_features").toPandas()

# Only `prev_anomaly` is imputed; nulls in any other column are passed through.
# NOTE(review): presumably a missing `prev_anomaly` means "no previous
# anomaly" (e.g. first reading per aircraft) - confirm against the pipeline.
feature_df = feature_df.fillna({"prev_anomaly": 0.0})

X = feature_df[FEATURE_COLUMNS]
# NOTE(review): astype(int) truncates fractional scores and raises on nulls;
# this assumes `anomaly_score` is already a clean 0/1 label - confirm.
y = feature_df["anomaly_score"].astype(int)

# --- Split ------------------------------------------------------------------
# Stratified so both splits keep the same anomaly / non-anomaly ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Train ------------------------------------------------------------------
# Scaling and classification are bundled into one Pipeline so that the
# artifact registered below accepts *raw* feature values. Previously only the
# bare classifier was logged while the signature described unscaled inputs,
# so anyone loading the registered model would have fed it unscaled data.
scaler = StandardScaler()
model = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[("scaler", scaler), ("classifier", model)])
# With caching disabled (the default) the Pipeline fits its step objects in
# place, so `scaler` and `model` are the fitted estimators afterwards.
pipeline.fit(X_train, y_train)

# Signature: raw feature frame in, class labels out - matches what the
# logged pipeline actually consumes and produces.
signature = infer_signature(X_train, pipeline.predict(X_train))

# --- Evaluate and log with MLflow -------------------------------------------
with mlflow.start_run(run_name="Aircraft_Anomaly_RF_Model"):
    preds = pipeline.predict(X_test)
    report = classification_report(y_test, preds, output_dict=True)

    # Log the classifier's hyper-parameters (same keys as before).
    mlflow.log_params(pipeline.named_steps["classifier"].get_params())

    # classification_report keys per-class entries by the string form of the
    # label; metrics for the anomaly class are logged only when it appears.
    if "1" in report:
        mlflow.log_metrics({
            "precision": report["1"].get("precision", 0.0),
            "recall": report["1"].get("recall", 0.0),
            "f1-score": report["1"].get("f1-score", 0.0)
        })

    # Register the full pipeline (scaler + classifier), not just the forest.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        signature=signature,
        registered_model_name="AircraftAnomalyPredictor"
    )

In [0]:
# Diagnostics for the training features: where are the missing values and
# what dtype does each column have?
null_mask = X_train.isnull()

# Per-column count of missing values
print("NaN count per column in X_train:")
print(null_mask.sum())

# Column dtypes
print("\nData types:")
print(X_train.dtypes)

# First few rows containing at least one missing value (may be empty)
print("\nSample rows with NaNs:")
rows_with_nulls = X_train[null_mask.any(axis=1)]
print(rows_with_nulls.head())

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import pandas as pd

# Categorical columns that get one-hot encoded below.
CATEGORICAL_COLUMNS = ["aircraft_id", "model", "event_type"]

# This cell previously read from `df`, which no cell in this notebook defines,
# so it failed with NameError on a fresh kernel. It now starts from
# `feature_df`, the frame loaded by the earlier training cell.
# NOTE(review): assumes the same table carries the columns checked below -
# confirm this is the data `df` originally pointed at.
_required_columns = {"anomaly_score", "timestamp", *CATEGORICAL_COLUMNS}
_missing_columns = _required_columns - set(feature_df.columns)
if _missing_columns:
    raise KeyError(
        f"feature_df is missing columns required by this cell: {sorted(_missing_columns)}"
    )

# Drop rows with NaNs (in any column)
df_cleaned = feature_df.dropna()

# Separate target. Cast to int like the first training cell does, so the
# per-class keys in classification_report are "0"/"1"; a float label would
# yield "1.0" and the metric lookups below would silently log 0.0.
y = df_cleaned["anomaly_score"].astype(int)

# Drop the target and the timestamp; everything else is a feature.
X = df_cleaned.drop(columns=["anomaly_score", "timestamp"])

# Encode all categorical columns.
# NOTE(review): the encoding happens outside the logged model, so inference
# callers must reproduce exactly these dummy columns - verify downstream.
X_encoded = pd.get_dummies(X, columns=CATEGORICAL_COLUMNS, drop_first=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Train and log model
with mlflow.start_run(run_name="Aircraft_Anomaly_RF_Model"):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    report = classification_report(y_test, preds, output_dict=True)
    print(classification_report(y_test, preds))

    # Log model with signature (encoded features in, class labels out).
    # NOTE(review): this registers under the same name as the earlier
    # training cell but with a different input schema - confirm intended.
    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        registered_model_name="AircraftAnomalyPredictor",
        signature=signature
    )

    mlflow.log_params({"model_type": "RandomForest", "n_estimators": 100})

    # Metrics for the anomaly class; 0.0 is logged when class 1 is absent
    # from the report.
    anomaly_metrics = report.get("1", {})
    mlflow.log_metrics({
        "precision": anomaly_metrics.get("precision", 0.0),
        "recall": anomaly_metrics.get("recall", 0.0),
        "f1-score": anomaly_metrics.get("f1-score", 0.0)
    })

In [0]:
# Look up the real aircraft identifier for every test row. `X_test.index`
# holds row labels only, and `aircraft_id` was one-hot encoded out of X_test,
# so the values are read from `df_cleaned`, which shares those row labels.
test_aircraft_ids = df_cleaned.loc[X_test.index, "aircraft_id"].to_numpy()

# Create a DataFrame of predictions (one row per test sample)
pred_df = pd.DataFrame({
    "aircraft_id": test_aircraft_ids,
    "prediction_date": pd.Timestamp("today").date(),  # Replace with actual date column if available
    "predicted_anomaly": preds
})

# Convert to Spark DataFrame
pred_sdf = spark.createDataFrame(pred_df)

# Save predictions to Delta table. The table is fully replaced on every run;
# overwriteSchema lets the corrected `aircraft_id` column type replace the
# schema written by earlier runs, which stored row indices in that column.
(
    pred_sdf.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("arao.aerodemo.anomaly_predictions")
)

print("✅ Predictions saved to 'anomaly_predictions' Delta table")