<center> Data Drifts

In [1]:
%reload_ext autoreload
%autoreload 2
import os
import sys

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# PROJECTPATH is presumably the repository root that contains the `src`
# package imported below -- confirm against the project's .env file.
project = os.getenv('PROJECTPATH')

# Only register the path once: this cell is typically re-run while developing
# (autoreload is enabled), and every re-run used to append a duplicate entry.
if project and project not in sys.path:
    sys.path.append(project)

In [2]:
import numpy as np
from scipy.stats import skewnorm, entropy
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from ipywidgets import interact, FloatSlider, IntSlider, Layout, Dropdown
from typing import Literal, Callable


In [3]:
from src.data_drift_metrics import PSI, Wasserstein, KS, JansenShannon, FeatureGeneratorWhithBins

## I. Population Stability Index
$$
\text{PSI} = \sum_{i=1}^{n} \left( \text{Expected}_i - \text{Actual}_i \right) \cdot \ln \left( \frac{\text{Expected}_i}{\text{Actual}_i} \right)
$$

$$
\text{PSI} \in [0,\ \infty)
$$

where:
- $Expected_i$ — proportion of train observations falling into bin $i$
- $Actual_i$ — proportion of current observations falling into bin $i$
- $n$ — number of bins

Logic:
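- Quantifies the distribution shift between two samples by comparing the proportions of observations that fall into predefined bins.
- Each bin contributes the difference in proportions multiplied by the logarithm of their ratio, so both the absolute and the relative size of the change matter; every term is non-negative, so PSI does not distinguish the direction of the shift.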
- Binning is required, and the quality of the metric can depend on the number and boundaries of the bins.
- PSI is particularly sensitive to small proportions

Application in Evidently:
 - tabular data numerical and categorical

## II. Wasserstein distance (Earth Mover's Distance)

$$
W_1^{\text{norm}}(P, Q) = \frac{1}{b - a} \int_0^1 \left| F_P^{-1}(u) - F_Q^{-1}(u) \right| \, du
$$

$$
W_1^{\text{norm}}(P, Q) \in [0,\ 1]
$$

where:
- $F_P^{-1}(u)$ — quantile function (inverse CDF) of the training dataset  
- $F_Q^{-1}(u)$ — quantile function (inverse CDF) of the current dataset  
- $u \in [0, 1]$ — probability level  
- $[a, b]$ — support range: from min to max across both distributions  
- $W_1^{\text{norm}}(P, Q)$ — normalized Wasserstein distance representing the average shift in quantiles, scaled by the support range

Logic:
- Measures the geometric distance between quantile functions (inverse CDFs) of two distributions
- The normalized version divides the Wasserstein distance by the support range $b - a$, which makes the result scale-free
- Unlike PSI it does not require binning — it operates directly on continuous distributions via their CDFs.

Application in Evidently:
 - only numerical
 - default method for numerical data, if > 1000 objects

## III. Kolmogorov–Smirnov
$$
D_{n,m} = \sup_x \left| F_n(x) - F_m(x) \right|
$$

$$
D_{n,m} \in [0,\ 1]
$$

where:
- $F_n(x)$ — empirical cumulative distribution function (ECDF) of the **training** dataset  
- $F_m(x)$ — ECDF of the **current** dataset (e.g., production data)  
- $x$ — any value within the combined domain of both datasets
- $D_{n,m}$ — the maximum vertical distance between the two ECDFs

## IV. Jensen–Shannon distance
1. **Entropy**: 
    $$ H(p) = -\sum_i p_i \log_2(p_i) $$
    
    where:
    - $p_i$ is the probability of event $i$
    
    **intuition**: 
    - entropy is the expected value of information
#

2. **Cross Entropy**:
    $$ H(p, q) = -\sum_i p_i \log_2(q_i) $$
    
    where:
    - $p$ is the true distribution  
    - $q$ is the estimated (model) distribution  
    
    **intuition**
    - How many bits are needed to encode events from $p$ using $q$ 
#

3. **KLD Kullback–Leibler Divergence**:
    $$ D_{\mathrm{KL}}(p \parallel q) = \sum_i p_i \log_2\left(\frac{p_i}{q_i}\right)$$
    $$ = \sum_i p_i \log_2(p_i) - \sum_i p_i \log_2(q_i)$$
    $$ = H(p,q) - H(p)$$
    
    **intuition**:
    - Kullback–Leibler Divergence measures the difference between Cross Entropy and Entropy. 
    - It equals zero only when the two distributions are identical, and is strictly positive otherwise.
#

4. **Mixture Distribution**:
    $$ \vec{m} = \frac{\vec{p} + \vec{q}}{2} $$
    
    **intuition**:
    - $\vec{m}$ is the average (mixture) distribution  
    - acts as a symmetric reference between $\vec{p}$ and $\vec{q}$ 
#
5. **Jensen–Shannon Divergence**:
    $$D_{\mathrm{JS}}(\vec{p} \parallel \vec{q}) = \frac{D_{\mathrm{KL}}(\vec{p} \parallel \vec{m}) + D_{\mathrm{KL}}(\vec{q} \parallel \vec{m})}{2}$$
    where:
    - $\vec{m}$ is the mixture distribution  
    
    **intuition**: 
    - symmetric, smoothed version of KL Divergence 
    - avoids infinite values and handles zero probabilities gracefully
#
6. **Jensen–Shannon Distance**:
    $$ \text{Jensen–Shannon distance} = \sqrt{D_{\mathrm{JS}}(\vec{p} \parallel \vec{q})} $$

    **Properties**: 
    - The metric lies within the range [0, 1] and is symmetric
    
    **Intuition**: 
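    - With base-2 logarithms the Jensen–Shannon divergence already lies in [0, 1], but for small differences between $\vec{p}$ and $\vec{q}$ it behaves like a squared distance.
    - Taking the square root undoes this "squaring": the result stays in [0, 1] and becomes a true metric (it satisfies the triangle inequality).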

# Visualization

In [4]:

class Visualizer:
    """Draw four data-drift metrics side by side for one train/current scenario.

    The constructor forwards the same keyword arguments to ``PSI``,
    ``Wasserstein``, ``KS`` and ``JansenShannon``; ``visualize`` then renders a
    4x2 grid with one row per metric: the metric value on the left and a
    supporting plot (bin shares or (inverse) ECDFs) on the right.

    NOTE(review): each metric object is built independently from ``kwargs``.
    If the metric classes sample their data randomly, the four rows are
    presumably computed on different draws -- confirm in
    ``src.data_drift_metrics``.
    """

    def __init__(self, **kwargs):
        """Build the four metric objects, passing ``kwargs`` through unchanged."""
        self.psi = PSI(**kwargs)
        self.wass = Wasserstein(**kwargs)
        self.ks = KS(**kwargs)
        self.jsd = JansenShannon(**kwargs)

    @staticmethod
    def _plot_bin_shares(ax: Axes, metric, alpha: float):
        """Draw grouped Train/Current bars per bin and return the bin centres.

        ``metric`` must expose ``bins`` (bin edges), ``train_ratio`` and
        ``current_ratio``.
        """
        bin_centers = (metric.bins[:-1] + metric.bins[1:]) / 2
        # Bar width is derived from the first bin only, i.e. this assumes
        # equal-width bins -- TODO confirm against the metric implementation.
        width = (metric.bins[1] - metric.bins[0]) * 0.4
        ax.bar(bin_centers - width / 2, metric.train_ratio, width=width, label='Train', alpha=alpha, color='skyblue')
        ax.bar(bin_centers + width / 2, metric.current_ratio, width=width, label='Current', alpha=alpha, color='orange')
        return bin_centers

    def plot_psi(self, ax: Axes):
        """Single bar with the PSI value and the 0.25 drift threshold."""
        ax.bar(["PSI"], [self.psi.value], color='skyblue')
        ax.axhline(0.25, color='red', linestyle='--', label='PSI = 0.25 - порог значительного дрифта')
        ax.text(0, self.psi.value + 0.2, f'{self.psi.value:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold', color='black')
        ax.set_title("PSI (Population Stability Index)")
        # Fixed axis so the bar height is comparable while moving the sliders;
        # the bar is cut off at the top of the axis for PSI values above 20.
        ax.set_ylim(0, 20)
        ax.legend()

    def plot_psi_bins(self, ax: Axes):
        """Per-bin train/current shares with each bin's PSI contribution."""
        bin_centers = self._plot_bin_shares(ax, self.psi, alpha=0.7)
        ax.plot(bin_centers, self.psi.bins_contribution, label='PSI Contribution', color='purple', linewidth=2.5, marker='o', markersize=6)
        ax.set_title("Доли по бинам (для PSI)")
        ax.set_xlabel("Значения признака (по бинам)")
        ax.set_ylabel("Доля наблюдений")
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

    def plot_js(self, ax: Axes):
        """Single bar with the Jensen-Shannon distance and the 0.1 threshold."""
        ax.bar(["JSD"], [self.jsd.value], color='lightgreen')
        ax.axhline(0, color='gray', linestyle='--')
        ax.axhline(0.1, color='red', linestyle='--', label='JSD = 0.1 - порог статистической значимости')
        ax.text(0, self.jsd.value + 0.02, f'{self.jsd.value:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold', color='black')
        ax.set_title("Jensen-Shannon Distance")
        ax.set_ylim(0, 1)
        ax.legend()

    def plot_jsd_bins(self, ax: Axes):
        """Per-bin train/current shares with each bin's JS-divergence contribution."""
        bin_centers = self._plot_bin_shares(ax, self.jsd, alpha=0.5)
        ax.plot(bin_centers, self.jsd.jsd_contrib, label='JSD Contribution', color='purple', linewidth=2.5, marker='o', markersize=6)
        ax.set_title("Вклад каждого бина в Jensen–Shannon Divergence")
        ax.set_xlabel("Значения признака (по бинам)")
        ax.set_ylabel("Вклад в дивергенцию")
        ax.grid(True, linestyle='--', alpha=0.3)
        ax.legend()

    def plot_wasserstein(self, ax: Axes):
        """Single bar with the Wasserstein distance and the 0.1 threshold."""
        ax.bar(["W1"], [self.wass.value], color='lightgreen')
        ax.axhline(0, color='gray', linestyle='--')
        ax.axhline(0.1, color='red', linestyle='--', label='EMD = 0.1 - порог значительного дрифта')
        ax.text(0, self.wass.value + 0.02, f'{self.wass.value:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold', color='black')
        ax.set_title("W1 (Wasserstein distance) / EMD (Earth Mover's Distance)")
        ax.set_ylim(0, 1)
        ax.legend()

    def plot_wasserstein_geom(self, ax: Axes):
        """Train vs. current inverse ECDFs with the area between them shaded."""
        ax.plot(self.wass.common_x, self.wass.train_iecdf, label='Train IECDF', color='blue')
        ax.plot(self.wass.common_x, self.wass.current_iecdf, label='Current IECDF', color='orange')
        ax.fill_between(self.wass.common_x, self.wass.train_iecdf, self.wass.current_iecdf, color='lightcoral', alpha=0.4, label='|Train - Current|')
        ax.set_title("Разность IECDF двух выборок")
        ax.set_xlabel("Probability (quantiles)")
        ax.set_ylabel("Feature value")
        ax.grid(True)
        ax.legend()

    def plot_ks(self, ax: Axes):
        """KS statistic as a bar, its p-value as a marker, and the 0.05 line."""
        ax.bar(["KS"], [self.ks.value], color='lightgreen')
        ax.plot(["KS"], [self.ks.p_value], marker='o', color='purple', markersize=8, label='p-value')
        ax.axhline(0, color='gray', linestyle='--')
        ax.axhline(0.05, color='red', linestyle='--', label='P-value = 0.05 - порог статистической значимости')
        ax.text(0, self.ks.value + 0.02, f'{self.ks.value:.4f}', ha='center', va='bottom', fontsize=12, fontweight='bold', color='black')
        ax.text(0, float(self.ks.p_value) + 0.02, f'{self.ks.p_value:.2e}', ha='center', va='bottom', fontsize=9, fontweight='bold', color='purple')
        ax.set_title("KS Kolmogorov-Smirnov")
        ax.set_ylim(0, 1)
        ax.legend()

    def plot_ks_geom(self, ax: Axes):
        """Train vs. current ECDFs with the area between them shaded."""
        ax.plot(self.ks.all_uniques, self.ks.train_ecfd, label='Train ECDF', color='blue')
        ax.plot(self.ks.all_uniques, self.ks.current_ecfd, label='Current ECDF', color='orange')
        ax.fill_between(self.ks.all_uniques, self.ks.train_ecfd, self.ks.current_ecfd, color='lightcoral', alpha=0.4, label='|Train - Current|')
        ax.set_title("Разность ECDF двух выборок")
        ax.set_xlabel("Значения фичи (нормализованные)")
        ax.set_ylabel("Probability")
        ax.grid(True)
        ax.legend()

    def visualize(self, **kwargs):
        """Render the 4x2 figure (one row per metric) and show it.

        ``kwargs`` is accepted so callers can pass the same arguments they
        gave the constructor; it is not used here.
        """
        _, axs = plt.subplots(
            nrows=4,
            ncols=2,
            figsize=(12, 15),
            gridspec_kw={"width_ratios": [1, 1.5]},
            constrained_layout=True,
        )

        # One (value plot, detail plot) pair per grid row, top to bottom:
        # PSI, Jensen-Shannon, Wasserstein, Kolmogorov-Smirnov.
        row_plotters = (
            (self.plot_psi, self.plot_psi_bins),
            (self.plot_js, self.plot_jsd_bins),
            (self.plot_wasserstein, self.plot_wasserstein_geom),
            (self.plot_ks, self.plot_ks_geom),
        )
        for (ax_value, ax_detail), (draw_value, draw_detail) in zip(axs, row_plotters):
            draw_value(ax_value)
            draw_detail(ax_detail)

        plt.show()


In [None]:
# Layout passed to every slider below: each one is rendered 1000px wide.
l = Layout(width='1000px')
# Widget style passed to every slider: 200px reserved for the description
# label (presumably so the long captions fit on one line).
style = {'description_width': '200px'}

def run_visualizer(**kwargs):
    """Build a ``Visualizer`` from ``kwargs`` and draw its figure.

    The same keyword arguments go to both the constructor and ``visualize``;
    the return value is whatever ``visualize`` returns.
    """
    return Visualizer(**kwargs).visualize(**kwargs)

# Presentation options shared by every slider (the dropdown keeps its defaults).
_slider_look = {'layout': l, 'style': style}

# Each keyword below becomes a widget whose current value is passed to
# run_visualizer under the same name.
interact(
    run_visualizer,
    # Shape of the distribution
    distribution=Dropdown(
        options=['normal', 'skewed'],
        value='skewed',
        description='Распределение:',
    ),
    skew=IntSlider(value=10, min=-10, max=10, step=1, description='Смещение распределения', **_slider_look),
    # Number of bins
    n_batches=IntSlider(value=50, min=5, max=100, step=1, description='ЧИСЛО БИНОВ', **_slider_look),
    # Sample sizes
    train_size=IntSlider(value=20000, min=1000, max=100000, step=100, description='TRAIN: размер выборки', **_slider_look),
    current_size=IntSlider(value=3500, min=1000, max=100000, step=100, description='CURRENT: размер выборки', **_slider_look),
    # Means
    train_mean=FloatSlider(value=35.0, min=0, max=100.0, step=1.0, description='TRAIN: среднее', **_slider_look),
    current_mean=FloatSlider(value=30.0, min=0, max=100.0, step=1.0, description='CURRENT: среднее', **_slider_look),
    # Standard deviations
    train_std=FloatSlider(value=5.0, min=0.01, max=10.0, step=0.1, description='TRAIN: стандартное отклонение', **_slider_look),
    current_std=FloatSlider(value=5.2, min=0.01, max=10.0, step=0.1, description='CURRENT: стандартное отклонение', **_slider_look),
)


interactive(children=(Dropdown(description='Распределение:', index=1, options=('normal', 'skewed'), value='ske…

<function __main__.run_visualizer(**kwargs)>

# DEV

In [6]:
# Scratch scenario: train and current share the same size, mean and std.
kwargs = dict(
    distribution="normal",
    n_batches=50,
    train_size=10000,
    current_size=10000,
    train_mean=5,
    current_mean=5,
    train_std=5,
    current_std=5,
)
data = FeatureGeneratorWhithBins(**kwargs)


In [7]:
import math

In [8]:
# Base-2 logarithms of a few sample probabilities, displayed as a tuple.
tuple(math.log(prob, 2) for prob in (0.25, 0.2, 0.1))

(-2.0, -2.321928094887362, -3.321928094887362)

In [9]:
# Two-outcome distributions: p = (a, b) is a tiny perturbation of q = (c, d).
a, b = 0.500001, 0.499999
c, d = 0.5, 0.5

# Reject inputs whose total probability mass exceeds one.
if any(total > 1 for total in (a + b, c + d)):
    raise ValueError('Сумма вероятностей не может превышать 1')

# Base-2 logarithm of every probability.
a_log, b_log, c_log, d_log = (math.log(prob, 2) for prob in (a, b, c, d))

# Entropy of p, shown together with the terms it is built from.
print(
    f"энтропия: -(({a} * {a_log}) + ({b} * {b_log})) = "
    f"{-(a * a_log + b * b_log)}"
)
# Cross-entropy of p relative to q.
print(
    f"кросс-энтропия: -(({a} * {c_log}) + ({b} * {d_log})) = "
    f"{-(a * c_log + b * d_log)}"
)
# Gap between the two log-probabilities of q.
print(f"дельта после логарифмирования: {abs(c_log - d_log)}")

# NOTE(review): both lines below start from a_log and carry the same label;
# because c == d they print the same number. The second one may have been
# meant to be b_log - d_log -- confirm the intent before relying on it.
print(f"дельта: {a_log - c_log}")
print(f"дельта: {a_log - d_log}")

энтропия: -((0.500001 * -0.9999971146128036) + (0.499999 * -1.0000028853929672)) = 0.9999999999971146
кросс-энтропия: -((0.500001 * -1.0) + (0.499999 * -1.0)) = 1.0
дельта после логарифмирования: 0.0
дельта: 2.8853871963940847e-06
дельта: 2.8853871963940847e-06
