In [None]:
# 03_leakage_detection.ipynb
import pandas as pd
from pathlib import Path
# Input CSVs are read from data_dir; generated files are read from and written to out_dir.
data_dir = Path('/mnt/data')
out_dir = data_dir.joinpath('outputs')


In [None]:
# Raw inputs, with their event-time columns parsed as datetimes on load.
payments = pd.read_csv(
    data_dir.joinpath('payments_system.csv'),
    parse_dates=['timestamp'],
)
refunds = pd.read_csv(
    data_dir.joinpath('refunds.csv'),
    parse_dates=['refund_timestamp'],
)
# Produced by the 02 notebook; loaded here without date parsing.
refunded = pd.read_csv(out_dir.joinpath('refunded_detailed.csv'))


In [None]:
# A transaction_id carried by more than one refund row is treated as a duplicate refund.
# Count refund rows per transaction, keep only the repeated ones, and persist that summary.
dup_refund_counts = (
    refunds
    .groupby('transaction_id')
    .size()
    .reset_index(name='refund_count')
)
has_multiple_refunds = dup_refund_counts['refund_count'] > 1
duplicate_refunds = dup_refund_counts.loc[has_multiple_refunds].copy()
duplicate_refunds.to_csv(out_dir / 'duplicate_refunds_summary.csv', index=False)
# Number of transactions with more than one refund row.
len(duplicate_refunds)


In [None]:
# Attach the customer recorded on the payment to each refund so the two customer_ids
# can be compared side by side (suffixes: _refund vs _payment).
payments_small = payments.loc[:, ['transaction_id', 'customer_id']].drop_duplicates()
refunds_joined = pd.merge(
    refunds,
    payments_small,
    how='left',
    on='transaction_id',
    suffixes=('_refund', '_payment'),
)
# NOTE(review): with a left join, refunds that have no payment row get a missing
# customer_id_payment, and a missing value compares as "not equal" — so those refunds
# are written out here as well. Confirm that is intended.
customer_differs = refunds_joined['customer_id_refund'].ne(refunds_joined['customer_id_payment'])
misallocated = refunds_joined[customer_differs]
misallocated.to_csv(out_dir / 'refunds_misallocated.csv', index=False)
len(misallocated)


In [None]:
# Rule: a refund is suspicious when refund_amount exceeds OVER_REFUND_FACTOR times the
# original payment amount. Refunds with no original payment end up with a missing
# amount after the left join; the comparison is False for them, so they are never flagged.
OVER_REFUND_FACTOR = 1.1

# De-duplicate the payment lookup before joining, the same way payments_small is
# de-duplicated for the customer_id join. Without this, repeated payment rows for one
# transaction_id would repeat each refund row and inflate the flagged count.
# NOTE(review): a transaction_id that carries several *different* amounts still yields
# one joined row per distinct amount — decide which amount is authoritative if that occurs.
payment_amounts = payments[['transaction_id', 'amount']].drop_duplicates()
joined = refunds_joined.merge(payment_amounts, on='transaction_id', how='left')

joined['over_refund_rule'] = joined['refund_amount'] > OVER_REFUND_FACTOR * joined['amount']
# The rule column is already boolean, so it can be used directly as the row mask.
suspicious_amounts = joined[joined['over_refund_rule']]
suspicious_amounts.to_csv(out_dir / 'suspicious_refund_amounts.csv', index=False)
len(suspicious_amounts)


In [None]:
# Headline row counts of the three result frames, saved as a single 'value' column
# indexed by metric name.
summary = {
    metric: len(frame)
    for metric, frame in (
        ('duplicate_refund_txns', duplicate_refunds),
        ('misallocated_refunds', misallocated),
        ('suspicious_amount_refunds', suspicious_amounts),
    )
}
summary_table = pd.Series(summary).to_frame('value')
summary_table.to_csv(out_dir / 'leakage_detection_summary.csv')
summary


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Same locations as the start of the file: inputs under data_dir, generated files under out_dir.
data_dir = Path('/mnt/data')
out_dir = Path(data_dir, 'outputs')

# Load the refunded detail file with both timestamp columns parsed as datetimes.
refunded = pd.read_csv(
    out_dir.joinpath('refunded_detailed.csv'),
    parse_dates=['refund_timestamp', 'timestamp'],
)


In [None]:
# Build the numeric feature columns used for anomaly detection.

# Hours elapsed between the 'timestamp' column and the refund timestamp.
refund_time = pd.to_datetime(refunded['refund_timestamp'])
txn_time = pd.to_datetime(refunded['timestamp'])
elapsed = refund_time - txn_time
refunded['time_to_refund_hours'] = elapsed.dt.total_seconds() / 3600.0

# Refund as a fraction of the original amount; a zero amount is swapped for NaN first
# so the ratio comes out missing instead of dividing by zero.
nonzero_amount = refunded['amount'].replace(0, np.nan)
refunded['refund_ratio'] = refunded['refund_amount'] / nonzero_amount

# Integer code per distinct channel value.
refunded['channel_code'] = refunded['channel'].astype('category').cat.codes

# Any remaining missing feature values are filled with 0 before modelling.
feature_columns = ['amount', 'refund_amount', 'time_to_refund_hours', 'refund_ratio', 'channel_code']
features = refunded[feature_columns].fillna(0)


In [None]:
# Standardise the feature matrix, fit an isolation forest on it, then score the same rows.
scaler = StandardScaler()
X = scaler.fit_transform(features)

model = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42,
).fit(X)

# decision_function: lower scores are more abnormal; predict returns -1 for outliers.
scores = model.decision_function(X)
labels = model.predict(X)
refunded['anomaly_score'] = scores
refunded['anomaly_flag'] = labels == -1

# Ascending sort puts the lowest (most abnormal) scores first; keep the first 20 rows.
most_anomalous = refunded.sort_values('anomaly_score').head(20)
most_anomalous.to_csv(out_dir / 'anomaly_top20.csv', index=False)
# Count of rows flagged as anomalies.
refunded['anomaly_flag'].sum()


In [None]:
# Save anomaly-labelled dataset
refunded.to_csv(out_dir / 'refunded_with_anomalies.csv', index=False)

# basic distribution checks
# Kept as the final expression of the cell: a notebook only renders the value of the
# last expression, so placing describe() before the to_csv call computed the table
# and then silently discarded it.
refunded.groupby('anomaly_flag')[['amount','refund_amount']].describe()
