In [3]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import joblib
import os

# Step 2: Simulate or Load Your Training Data
# You can later replace this with data from PostgreSQL or a CSV export
#
# Each metric is built from 90 "normal" samples plus 10 injected anomalies,
# giving 100 rows in total.

# Seed the global NumPy RNG so the simulated data set is reproducible.
# The values drawn depend on the order of the np.random calls below.
np.random.seed(42)

# Null percentage: normal values centred on 5, anomalies between 30 and 80.
# A Gaussian with loc=5/scale=2 can yield values below zero, which is not a
# valid percentage, so the normal samples are clipped to the range [0, 100].
null_percent = np.clip(np.random.normal(loc=5, scale=2, size=90), 0, 100)
null_anomalies = np.random.uniform(low=30, high=80, size=10)
null_percent = np.concatenate([null_percent, null_anomalies])
np.random.shuffle(null_percent)

# Volume: normal values centred on 1000, anomalies between 100 and 300.
# NOTE: this column is shuffled independently of null_percent, so an
# anomalous volume does not necessarily share a row with an anomalous null%.
volume = np.random.normal(loc=1000, scale=50, size=90)
volume_anomalies = np.random.uniform(low=100, high=300, size=10)
volume = np.concatenate([volume, volume_anomalies])
np.random.shuffle(volume)

# Schema change: mostly 0, some 1 (each row is 1 with probability 0.1)
schema_change = np.random.choice([0, 1], size=100, p=[0.9, 0.1])

# Create DataFrame: one row per simulated observation, one column per metric
df = pd.DataFrame({
    "null_percent": null_percent,
    "volume": volume,
    "schema_change": schema_change
})

# Step 3: Train Isolation Forest
# Fit on the three metric columns. random_state is fixed so repeated runs on
# the same data build the same forest.
feature_columns = ["null_percent", "volume", "schema_change"]
X = df[feature_columns]
model = IsolationForest(contamination=0.1, random_state=42).fit(X)

# Step 4: Save Model to 'models/isolation_forest_cv.pkl'
# The path is relative, so the file lands under the current working directory.
model_dir = "models"
output_path = os.path.join(model_dir, "isolation_forest_cv.pkl")

# Create the target directory on first run; do nothing if it already exists.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, output_path)

print(f"✅ Model trained and saved to: {output_path}")


✅ Model trained and saved to: models\isolation_forest_cv.pkl
