# In [1]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt

def _is_empty(data):
    """Return True when *data* is None or contains no observations."""
    if data is None:
        return True
    try:
        # len() works for lists, tuples, NumPy arrays and pandas Series alike,
        # whereas truth-testing a multi-element array raises ValueError.
        return len(data) == 0
    except TypeError:
        # Unsized objects: fall back to plain truthiness.
        return not data


def detect_data_drift(data_period1, data_period2, alpha=0.05, *, plot=True):
    """Detect drift in a numeric attribute between two time periods.

    Runs a two-sample Kolmogorov-Smirnov test on the two samples and flags
    drift when the resulting p-value is below ``alpha``. Optionally overlays
    density-normalised histograms of both samples for visual comparison.

    Args:
        data_period1: Numeric observations from the first period (list,
            tuple, NumPy array, pandas Series, ...).
        data_period2: Numeric observations from the second period.
        alpha: Significance threshold compared against the p-value.
        plot: When True (the default), draw and show the comparison
            histograms. Pass False for non-interactive use.

    Returns:
        The string ``"One or both data periods are empty."`` if either input
        is None or empty; otherwise a dict with the keys ``'ks_statistic'``,
        ``'p_value'`` and ``'data_drift_detected'`` (a built-in ``bool``).
    """
    if _is_empty(data_period1) or _is_empty(data_period2):
        return "One or both data periods are empty."

    # Perform Kolmogorov-Smirnov test
    statistic, p_value = ks_2samp(data_period1, data_period2)

    if plot:
        # Overlay both distributions; density=True makes samples of different
        # sizes comparable on the same axis.
        plt.hist(data_period1, bins=20, alpha=0.5, label='Period 1', density=True)
        plt.hist(data_period2, bins=20, alpha=0.5, label='Period 2', density=True)
        plt.title('Data Distribution Comparison')
        plt.xlabel('Value')
        plt.ylabel('Density')
        plt.legend()
        plt.grid(True)
        plt.show()

    # Convert from numpy.bool_ so the flag is a plain, JSON-serialisable bool.
    drift_detected = bool(p_value < alpha)

    return {
        'ks_statistic': statistic,
        'p_value': p_value,
        'data_drift_detected': drift_detected
    }
