### Implementing Adversarial Validation for Data Drift
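Description: Create and train a classifier that distinguishes between the train and test datasets, and use the classifier’s performance to infer data drift: an ROC-AUC near 0.5 means the two datasets are indistinguishable (no drift), while a clearly higher ROC-AUC indicates drift.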

In [1]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

def adversarial_validation(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    sample_size: int = 5000,
    *,
    auc_threshold: float = 0.7,
    random_state: int = 42,
):
    """
    Performs adversarial validation to detect data drift.

    A RandomForestClassifier is trained to tell rows of ``train_df`` apart
    from rows of ``test_df`` using only the columns they share. The ROC-AUC
    on a held-out 30% split measures how separable the two datasets are:
    a score near 0.5 means the classifier cannot tell them apart, while a
    score above ``auc_threshold`` is reported as drift.

    Parameters:
        train_df (pd.DataFrame): Training data (reference).
        test_df (pd.DataFrame): Test/inference data to compare.
        sample_size (int): Maximum number of complete (NaN-free) rows to
            sample from each dataset. A dataset with fewer complete rows
            contributes all of them.
        auc_threshold (float): An AUC strictly above this value is reported
            as "Drift Detected". Defaults to 0.7.
        random_state (int): Seed used for sampling, the validation split and
            the classifier, so repeated calls on the same data agree.

    Returns:
        float: ROC-AUC score (rounded to 4 decimals) of the classifier
            distinguishing between datasets.
        str: Drift status ("Drift Detected" or "No Drift")

    Raises:
        ValueError: If ``sample_size`` is less than 1, the frames share no
            columns, or either frame has no complete rows left after
            dropping NaNs.

    Note:
        The shared columns are handed to the classifier unchanged, so they
        are expected to be numeric already.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be at least 1, got {sample_size}.")

    # Only columns present in both frames can be compared.
    common_cols = train_df.columns.intersection(test_df.columns)
    if common_cols.empty:
        raise ValueError("train_df and test_df share no columns to compare.")

    # Drop incomplete rows *before* sizing the sample: sizing from the raw
    # row count asks for more rows than remain whenever NaN rows are removed.
    train_clean = train_df[common_cols].dropna()
    test_clean = test_df[common_cols].dropna()
    for name, frame in (("train_df", train_clean), ("test_df", test_clean)):
        if frame.empty:
            raise ValueError(f"{name} has no complete rows in the shared columns.")

    train_sample = train_clean.sample(
        n=min(sample_size, len(train_clean)), random_state=random_state
    )
    test_sample = test_clean.sample(
        n=min(sample_size, len(test_clean)), random_state=random_state
    )

    # Source labels (0 = train, 1 = test) are kept outside the feature frame
    # so a real column that happens to be named "is_test" is never clobbered.
    X = pd.concat([train_sample, test_sample], axis=0, ignore_index=True)
    y = np.concatenate([
        np.zeros(len(train_sample), dtype=int),
        np.ones(len(test_sample), dtype=int),
    ])

    # train_test_split shuffles by default; stratifying guarantees both
    # sources appear in the fit and validation parts even when one dataset
    # is much smaller than the other.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y
    )

    # Train classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf.fit(X_train, y_train)

    # Evaluate: column 1 of predict_proba is the probability of label 1 (test).
    test_proba = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, test_proba)

    status = "Drift Detected" if auc > auc_threshold else "No Drift"
    return round(auc, 4), status
# Simulate train and test data
# Seeded generator so the simulated data, and therefore the reported AUC, is
# the same on every run.
rng = np.random.default_rng(42)
N_ROWS = 1000

# Reference data: two independent, normally distributed features.
train_df = pd.DataFrame({
    'feature1': rng.normal(0, 1, N_ROWS),
    'feature2': rng.normal(5, 1, N_ROWS),
})

# Simulated drift in test data
test_df = pd.DataFrame({
    'feature1': rng.normal(0, 1, N_ROWS),       # No drift
    'feature2': rng.normal(8, 1, N_ROWS),       # Drift introduced: mean 5 -> 8
})

auc_score, status = adversarial_validation(train_df, test_df)
print(f"AUC: {auc_score}, Status: {status}")



AUC: 0.9799, Status: Drift Detected
