# Data Drift

In [1]:
import pandas as pd
from scipy.stats import ks_2samp

In [2]:
# Load the processed training data (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='../data/processed/train_data.csv',
)

Detect drift in transaction amount (`amt`) and fraud flag (`is_fraud`) between monthly windows using the two-sample Kolmogorov–Smirnov (KS) test.

In [3]:
# Monthly drift check on 'amt' and 'is_fraud'.
# Each month is compared with the month k positions earlier (k = 1, 2, 3) using
# the two-sample Kolmogorov-Smirnov test; pairs with p < ALPHA are reported.
# NOTE(review): many pairwise tests are run without any multiple-comparison
# correction, and the KS test targets continuous distributions, so hits on the
# binary 'is_fraud' flag should be interpreted with care.

ALPHA = 0.05                     # significance level used to flag drift
LAGS = [1, 2, 3]                 # how many months back the reference window lies
FEATURES = ['amt', 'is_fraud']   # columns checked for drift

# Convert transaction time to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Bucket each transaction into its calendar month
df['month'] = df['trans_date_trans_time'].dt.to_period('M')

# Ensure no missing values for the columns being checked
df = df.dropna(subset=FEATURES)

# Months in chronological order. unique() keeps order of appearance, so sort
# first; otherwise "k months earlier" is only right for pre-sorted data.
# Rows without a usable timestamp (NaT month) are left out of the comparison.
months = df['month'].dropna().sort_values().unique()

# Slice each month's feature columns once instead of re-filtering for every pair
windows = {month: df.loc[df['month'] == month, FEATURES] for month in months}

# Compare each month to the one k months before it
for k in LAGS:
    # Start at k so that months[i - k] never becomes a negative index, which
    # would silently wrap around and pair early months with the latest ones.
    for i in range(k, len(months)):
        month_1 = months[i - k]
        month_2 = months[i]

        # Data for the two months being compared
        window_1 = windows[month_1]
        window_2 = windows[month_2]

        # Check for data drift on each feature
        for feature in FEATURES:
            ks_stat, p_value = ks_2samp(window_1[feature], window_2[feature])

            # Report only the pairs whose distributions differ significantly
            if p_value < ALPHA:
                print(f"Data drift detected for {feature}!")
                # Print the KS statistic and p-value
                print(f"Comparing {feature} between {month_1} and {month_2}:")
                print(f"KS Statistic: {ks_stat}, p-value: {p_value}")
                print("-" * 50)

Data drift detected for amt!
Comparing amt between 2019-02 and 2019-03:
KS Statistic: 0.010315094233342925, p-value: 0.003906521775580153
--------------------------------------------------
Data drift detected for amt!
Comparing amt between 2019-03 and 2019-04:
KS Statistic: 0.0077989317713660355, p-value: 0.029068772650483576
--------------------------------------------------
Data drift detected for amt!
Comparing amt between 2019-09 and 2019-10:
KS Statistic: 0.007864127903544182, p-value: 0.026723893693126
--------------------------------------------------
Data drift detected for amt!
Comparing amt between 2020-04 and 2020-05:
KS Statistic: 0.0074011766704468185, p-value: 0.04203534196892533
--------------------------------------------------
Data drift detected for amt!
Comparing amt between 2020-05 and 2020-06:
KS Statistic: 0.008237496853547799, p-value: 0.024157033803896624
--------------------------------------------------
Data drift detected for amt!
Comparing amt between 2019-0