### Data Drift: Detection Using Statistical Tests
**Question**: Simulate two datasets with a shift in distribution, and apply the
Kolmogorov-Smirnov test to determine whether data drift has occurred.

In [2]:
import numpy as np
from scipy.stats import ks_2samp
import unittest

def simulate_data(loc, scale, size, seed=None):
    """Draw ``size`` samples from a normal distribution.

    Args:
        loc: Mean of the distribution.
        scale: Standard deviation of the distribution; must be strictly
            positive.
        size: Number of samples to draw; a positive Python or NumPy integer.
        seed: Optional seed (anything accepted by
            ``numpy.random.default_rng``) for reproducible draws. When
            omitted, the global ``np.random`` state is used, so callers that
            rely on ``np.random.seed`` keep working.

    Returns:
        A 1-D ``numpy.ndarray`` holding ``size`` samples.

    Raises:
        ValueError: If ``size`` is not a positive integer or ``scale`` is not
            strictly positive.
    """
    # bool is a subclass of int, so it has to be excluded explicitly; NumPy
    # integer scalars (e.g. the result of an array computation) are accepted.
    is_integer = isinstance(size, (int, np.integer)) and not isinstance(size, bool)
    if not is_integer or size <= 0:
        raise ValueError("size must be a positive integer")
    # "not scale > 0" (rather than "scale <= 0") also rejects NaN.
    if not scale > 0:
        raise ValueError("scale must be positive")
    if seed is None:
        return np.random.normal(loc=loc, scale=scale, size=size)
    return np.random.default_rng(seed).normal(loc=loc, scale=scale, size=size)

def detect_drift(data1, data2, alpha=0.05):
    """Run a two-sample Kolmogorov-Smirnov test and flag drift.

    Args:
        data1: Reference sample (array-like of numbers).
        data2: Sample to compare against the reference (array-like of
            numbers).
        alpha: Significance level; drift is reported when the p-value is
            below it. Must lie strictly between 0 and 1.

    Returns:
        A tuple ``(ks_stat, p_value, drift)`` where ``ks_stat`` and
        ``p_value`` come from ``scipy.stats.ks_2samp`` and ``drift`` is a
        plain ``bool``.

    Raises:
        ValueError: If either dataset is empty or contains NaN, or if
            ``alpha`` is outside the open interval (0, 1).
    """
    # Converting up front lets any array-like (lists, tuples, arrays) be
    # checked uniformly for emptiness and NaN.
    sample1 = np.asarray(data1, dtype=float)
    sample2 = np.asarray(data2, dtype=float)
    if sample1.size == 0 or sample2.size == 0:
        raise ValueError("Input datasets must not be empty")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    # A NaN p-value would compare as "not below alpha" and silently be
    # reported as "no drift", so NaN inputs are rejected instead.
    if np.isnan(sample1).any() or np.isnan(sample2).any():
        raise ValueError("Input datasets must not contain NaN values")
    ks_stat, p_value = ks_2samp(sample1, sample2)
    return ks_stat, p_value, bool(p_value < alpha)

# Demo: compare a baseline sample against one whose mean is shifted by 0.5.
if __name__ == "__main__":
    data1 = simulate_data(loc=0, scale=1, size=1000)
    data2 = simulate_data(loc=0.5, scale=1, size=1000)

    ks_stat, p_value, drift = detect_drift(data1, data2)

    print(f"KS Statistic: {ks_stat:.4f}, P-value: {p_value:.4f}")
    if drift:
        print("Data drift detected.")
    else:
        print("No significant data drift detected.")

# Unit Tests
class TestDataDriftDetection(unittest.TestCase):
    """Unit tests for ``simulate_data`` and ``detect_drift``."""

    def setUp(self):
        # The samples are random, so an unseeded run of test_no_drift fails
        # about alpha (5%) of the time by construction. Seed the global RNG
        # for determinism and restore the previous state afterwards so other
        # code is unaffected.
        self.addCleanup(np.random.set_state, np.random.get_state())
        np.random.seed(12345)

    def test_no_drift(self):
        d1 = simulate_data(0, 1, 1000)
        d2 = simulate_data(0, 1, 1000)
        # Under the null hypothesis the p-value is uniformly distributed, so
        # alpha is the false-alarm rate; a strict alpha together with the
        # fixed seed keeps this check stable.
        _, _, drift = detect_drift(d1, d2, alpha=0.001)
        self.assertFalse(drift, "Should not detect drift for identical distributions")

    def test_with_drift(self):
        d1 = simulate_data(0, 1, 1000)
        d2 = simulate_data(1, 1, 1000)
        _, _, drift = detect_drift(d1, d2)
        self.assertTrue(drift, "Should detect drift for different distributions")

    def test_invalid_size(self):
        with self.assertRaises(ValueError):
            simulate_data(0, 1, -5)

    def test_invalid_scale(self):
        with self.assertRaises(ValueError):
            simulate_data(0, 0, 100)

    def test_empty_data(self):
        with self.assertRaises(ValueError):
            detect_drift(np.array([]), np.array([1, 2, 3]))

# Run the tests in-process: a placeholder argv keeps unittest from parsing the
# host's command-line arguments, and exit=False prevents it from calling
# sys.exit() when the run finishes.
unittest.main(
    argv=["first-arg-is-ignored"],
    exit=False,
)


.....
----------------------------------------------------------------------
Ran 5 tests in 0.011s

OK


KS Statistic: 0.2000, P-value: 0.0000
Data drift detected.


<unittest.main.TestProgram at 0x77a57544d0c0>