## Implementing ML Model Monitoring Pipelines

### Model Performance Drift:
**Description**: Set up a monitoring pipeline to track key performance metrics (e.g., accuracy, precision) of an ML model over time using a monitoring tool or dashboard.

In [None]:
# write your code from here
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
from datetime import datetime
import os
import json

class ModelPerformanceMonitor:
    """Record classification metrics over time in a JSON log and plot them.

    Every call to ``log_performance`` adds one record (timestamp, model
    version, accuracy, precision, recall) to ``self.log`` and rewrites the
    whole list to ``self.log_path``.
    """

    def __init__(self, log_path='model_performance_log.json'):
        """Load the log stored at *log_path*, or start with an empty one.

        Args:
            log_path: Path of the JSON file holding the list of records.

        An unusable log file (empty, invalid JSON, or JSON that is not a
        list) does not abort construction: the file is moved aside to
        ``<log_path>.corrupt`` and the monitor starts with an empty log.
        """
        self.log_path = log_path
        self.log = self._load_log(log_path)

    @staticmethod
    def _load_log(log_path):
        """Return the list of records stored at *log_path*, or ``[]``."""
        if not os.path.exists(log_path):
            return []

        try:
            with open(log_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"Could not read performance log '{log_path}': {e}")
        else:
            if isinstance(loaded, list):
                return loaded
            # log_performance() appends to the log, so it must be a list.
            print(f"Performance log '{log_path}' does not contain a JSON list.")

        # Move the unusable file aside so the next write does not destroy
        # whatever it contained.
        backup_path = f"{log_path}.corrupt"
        try:
            os.replace(log_path, backup_path)
        except OSError as e:
            print(f"Could not back up unreadable log: {e}")
        else:
            print(f"Unreadable log moved to '{backup_path}'; starting a new log.")
        return []

    def log_performance(self, y_true, y_pred, model_version='v1'):
        """Compute accuracy, precision and recall and append them to the log.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            model_version: Identifier stored with the record.

        Returns:
            The record dict that was written, or ``None`` if computing the
            metrics or writing the file failed (the error is printed). On
            failure neither the in-memory log nor the file is modified.
        """
        try:
            # float() keeps the record JSON-serializable even if a metric
            # comes back as a NumPy scalar.
            acc = float(accuracy_score(y_true, y_pred))
            prec = float(precision_score(y_true, y_pred, zero_division=0))
            rec = float(recall_score(y_true, y_pred, zero_division=0))
            timestamp = datetime.now().isoformat()

            record = {
                'timestamp': timestamp,
                'model_version': model_version,
                'accuracy': acc,
                'precision': prec,
                'recall': rec
            }

            # Serialize BEFORE opening the file: opening with 'w' truncates
            # it, so a serialization error afterwards would wipe the history.
            payload = json.dumps(self.log + [record], indent=2)
            with open(self.log_path, 'w', encoding='utf-8') as f:
                f.write(payload)

            # Only keep the record in memory once it is safely on disk.
            self.log.append(record)

            print(f"Logged performance at {timestamp}")
            return record
        except Exception as e:
            print(f"Error logging performance: {e}")
            return None

    def plot_performance(self):
        """Plot logged accuracy, precision and recall against time.

        Prints a message and returns without plotting when the log is
        empty. Any error raised while plotting is printed, not propagated.
        """
        try:
            df = pd.DataFrame(self.log)
            if df.empty:
                print("No logged performance data to plot.")
                return

            # Timestamps are stored as ISO strings; parse them so the x-axis
            # is chronological.
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values('timestamp')

            plt.figure(figsize=(10,6))
            plt.plot(df['timestamp'], df['accuracy'], label='Accuracy')
            plt.plot(df['timestamp'], df['precision'], label='Precision')
            plt.plot(df['timestamp'], df['recall'], label='Recall')
            plt.xlabel('Time')
            plt.ylabel('Metric')
            plt.title('Model Performance Over Time')
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"Error plotting performance: {e}")

# Demo: log a single evaluation and chart the accumulated history.
if __name__ == "__main__":
    # Ground-truth labels and the model's predictions for ten samples
    y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
    y_pred = [1, 0, 0, 1, 0, 1, 1, 0, 1, 0]

    monitor = ModelPerformanceMonitor()
    monitor.log_performance(y_true, y_pred, model_version='v1')

    # Run this on a schedule so the chart shows how the metrics move over time
    monitor.plot_performance()


### Feature Distribution Drift:
**Description**: Monitor the distribution of your input features in deployed models to detect any significant shifts from training data distributions.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt

class FeatureDistributionMonitor:
    """Compare incoming feature values against a training-data baseline.

    The non-null training values of each monitored column are kept as the
    baseline. ``detect_drift`` runs ``scipy.stats.ks_2samp`` per column
    between that baseline and the new data.
    """

    def __init__(self, training_data: pd.DataFrame, feature_columns: list):
        """Store the baseline values for every column in *feature_columns*.

        Args:
            training_data: Frame containing the monitored columns.
            feature_columns: Names of the columns to monitor.

        If the baseline cannot be built (for example a column is missing),
        the error is printed and the monitor is left with no monitored
        features, so later calls return empty results instead of failing on
        missing attributes.
        """
        # Give the instance a defined state before anything can fail.
        self.training_data = None
        self.feature_columns = []
        self.baseline_distributions = {}
        try:
            # Copy so later mutation of the caller's list has no effect here.
            columns = list(feature_columns)
            baseline_frame = training_data[columns]
            distributions = {
                col: baseline_frame[col].dropna().values for col in columns
            }
        except Exception as e:
            print(f"Error initializing monitor: {e}")
            return

        self.training_data = baseline_frame
        self.feature_columns = columns
        self.baseline_distributions = distributions

    def detect_drift(self, new_data: pd.DataFrame, alpha=0.05):
        """Test each monitored column of *new_data* for distribution drift.

        Args:
            new_data: Frame with the incoming feature values.
            alpha: Significance level; drift is flagged when the test's
                p-value is below it.

        Returns:
            A dict keyed by column name. Every entry has ``'drift'`` (a
            plain ``bool``) and ``'p_value'`` (a plain ``float``, or
            ``None`` when no test was run); entries without a test also
            carry a ``'reason'``. A column absent from *new_data* is
            reported as drifted. Returns ``{}`` if an unexpected error
            occurs (the error is printed).
        """
        try:
            results = {}
            for col in self.feature_columns:
                if col not in new_data.columns:
                    results[col] = {
                        'drift': True,
                        'p_value': None,
                        'reason': 'Column missing in new data',
                    }
                    continue

                new_values = new_data[col].dropna().values
                baseline = self.baseline_distributions[col]
                if len(new_values) == 0 or len(baseline) == 0:
                    results[col] = {'drift': False, 'p_value': None, 'reason': 'Insufficient data'}
                    continue

                _, p_value = ks_2samp(baseline, new_values)
                # Convert NumPy scalars to native types so results can be
                # compared with `is True` and serialized with json.
                p_value = float(p_value)
                results[col] = {'drift': bool(p_value < alpha), 'p_value': p_value}
            return results
        except Exception as e:
            print(f"Error detecting drift: {e}")
            return {}

    def plot_feature_distribution(self, new_data: pd.DataFrame, feature):
        """Overlay histograms of *feature* for training and new data.

        Prints a message and returns when *feature* is not monitored. The
        new-data histogram is drawn only if *new_data* has that column.
        Any plotting error is printed, not propagated.
        """
        try:
            if feature not in self.feature_columns:
                print(f"Feature '{feature}' not in monitored features.")
                return

            plt.figure(figsize=(8, 5))
            plt.hist(self.baseline_distributions[feature], bins=30, alpha=0.5, label='Training')
            if feature in new_data.columns:
                plt.hist(new_data[feature].dropna(), bins=30, alpha=0.5, label='New Data')
            plt.title(f'Distribution Comparison for {feature}')
            plt.legend()
            plt.show()
        except Exception as e:
            print(f"Error plotting distributions: {e}")

# Demo: build a baseline, then test a batch whose feature1 mean has shifted.
if __name__ == "__main__":
    # Baseline: feature1 ~ N(0, 1), feature2 ~ U(0, 10)
    train_df = pd.DataFrame({
        'feature1': np.random.normal(loc=0, scale=1, size=1000),
        'feature2': np.random.uniform(low=0, high=10, size=1000),
    })

    monitor = FeatureDistributionMonitor(
        train_df,
        ['feature1', 'feature2'],
    )

    # Incoming batch: only feature1 moves (mean 0 -> 1); feature2 is unchanged
    new_df = pd.DataFrame({
        'feature1': np.random.normal(loc=1, scale=1, size=1000),
        'feature2': np.random.uniform(low=0, high=10, size=1000),
    })

    drift_results = monitor.detect_drift(new_df)
    print(drift_results)

    # Visual comparison for the shifted feature
    monitor.plot_feature_distribution(new_df, 'feature1')


### Anomaly Detection in Predictions:
**Description**: Implement an anomaly detection mechanism to flag unusual model
predictions. Simulate anomalies by altering input data.

In [None]:
# write your code from here
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

class PredictionAnomalyDetector:
    """Flag unusual model predictions with an ``IsolationForest``.

    Fit the detector on a series of reference predictions, then pass new
    predictions to ``predict`` to get back the ones labelled as anomalies.
    Errors in any method are printed rather than raised.
    """

    def __init__(self, contamination=0.01):
        """Create the underlying forest (fixed ``random_state=42``).

        Args:
            contamination: Forwarded to ``IsolationForest``.
        """
        try:
            self.model = IsolationForest(
                contamination=contamination,
                random_state=42,
            )
            self.fitted = False
        except Exception as err:
            print(f"Error initializing detector: {err}")

    def fit(self, predictions: pd.Series):
        """Fit the forest on *predictions*, treated as a single feature."""
        try:
            # The estimator is given a 2-D array: one column, one row per value.
            column = predictions.values.reshape(-1, 1)
            self.model.fit(column)
            self.fitted = True
        except Exception as err:
            print(f"Error fitting model: {err}")

    def predict(self, new_predictions: pd.Series):
        """Return the entries of *new_predictions* labelled as anomalies.

        Returns:
            The subset of *new_predictions* whose model label is -1, or an
            empty float Series if the detector is not fitted or any other
            error occurs (the error is printed).
        """
        try:
            if not self.fitted:
                raise ValueError("Model not fitted yet. Call fit() first.")
            column = new_predictions.values.reshape(-1, 1)
            # The model labels anomalies -1 and normal points 1.
            is_anomaly = self.model.predict(column) == -1
            return new_predictions[is_anomaly]
        except Exception as err:
            print(f"Error predicting anomalies: {err}")
            return pd.Series(dtype=float)

# Demo: fit on clean predictions, then detect injected outliers.
if __name__ == "__main__":
    detector = PredictionAnomalyDetector(contamination=0.01)

    # Reference predictions centred on 50 with a spread of 5
    preds = pd.Series(np.random.normal(50, 5, 1000))

    # Copy the series and push ten randomly chosen entries upward by 20-50
    preds_with_anomalies = preds.copy()
    anomalies_indices = np.random.choice(preds.index, size=10, replace=False)
    preds_with_anomalies.loc[anomalies_indices] = (
        preds_with_anomalies.loc[anomalies_indices]
        + np.random.uniform(low=20, high=50, size=10)
    )

    detector.fit(preds)
    anomalies = detector.predict(preds_with_anomalies)

    print("Anomalies detected at indices:")
    print(anomalies.index.tolist())
    print(anomalies.values)
