In [None]:
import pandas as pd
from pathlib import Path
# Project folder for this analysis; files written by the notebooks go in its 'outputs' subfolder.
data_dir = Path('C:/Revenue Leakage')
out_dir = data_dir.joinpath('outputs')


In [3]:
# Raw ledgers: payments and refunds, each with its timestamp column parsed as datetimes.
payments_path = data_dir / 'payments_system.xlsx'
refunds_path = data_dir / 'refunds.xlsx'
payments = pd.read_excel(payments_path, parse_dates=['timestamp'])
refunds = pd.read_excel(refunds_path, parse_dates=['refund_timestamp'])

# Refund detail table produced by the 02 notebook (no date parsing requested on this load).
refunded = pd.read_excel(out_dir / 'refunded_detailed.xlsx')


In [4]:
# A transaction_id that appears on more than one refund row is treated as a duplicate refund.
refund_rows_per_txn = refunds.groupby('transaction_id').size()
dup_refund_counts = refund_rows_per_txn.reset_index(name='refund_count')
has_multiple_refunds = dup_refund_counts['refund_count'].gt(1)
duplicate_refunds = dup_refund_counts.loc[has_multiple_refunds].copy()
print(duplicate_refunds)

# Persist the per-transaction summary, then show how many transactions were flagged.
duplicate_refunds.to_csv(out_dir / 'duplicate_refunds_summary.csv', index=False)
len(duplicate_refunds)


                            transaction_id  refund_count
29    056a5d20-c2f7-4289-8320-3745e02767b7             2
104   17572151-7cfa-44c0-9d54-d6992ac7a84e             2
111   1804a41c-2c44-4059-868f-a90c7ad6ff5b             2
116   19a20913-94ae-4dc1-840c-fdfa3679b0a1             2
137   1e6f0be8-240f-4f0a-b202-fc66524db1fe             2
224   349c8e7f-57e8-474a-9d07-689b836492c7             2
297   42ff29df-95d2-45ce-806a-312601d0a55e             2
317   4784e067-1b80-478e-a540-a49026d640f6             2
353   4e40b37e-c743-467c-8875-7f0db03f622a             2
428   5c2408f2-1251-4b2f-b48c-975ba092d606             2
471   64ff1fe2-ee66-4c64-90ac-91a2851ae9dd             2
489   67d16390-aa26-4825-975a-1a2229359838             2
499   6a0b7891-0a16-41ea-a11d-e47f83082f81             2
534   70b5a0e9-c186-4fe7-9aca-18ac6caac532             2
738   a029e5c5-2684-47d8-8363-22588d1700b4             2
769   a7da6cb4-e817-47d8-801b-cc8273644a7a             2
974   ce52bbb0-f85d-4e4f-9dcd-2

24

In [6]:
# Attach the paying customer to every refund so the two customer_ids can be compared.
payments_small = payments.loc[:, ['transaction_id', 'customer_id']].drop_duplicates()
refunds_joined = pd.merge(
    refunds,
    payments_small,
    how='left',
    on='transaction_id',
    suffixes=('_refund', '_payment'),
)

# Flag rows where the refund's customer differs from the payment's customer.
# Because this is a left join, a refund with no matching payment row has a missing
# customer_id_payment, which compares as "not equal" and is therefore flagged as well.
customer_differs = refunds_joined['customer_id_refund'].ne(refunds_joined['customer_id_payment'])
misallocated = refunds_joined[customer_differs]
misallocated.to_csv(out_dir / 'refunds_misallocated.csv', index=False)
len(misallocated)


24

In [7]:
# Rule: a refund is suspicious when refund_amount > 1.1 * original payment amount.
# Refunds with no matching payment get a missing amount from the left join; the
# comparison is then False, so they are not flagged by this rule.
OVER_REFUND_TOLERANCE = 1.1

# De-duplicate the amount lookup before joining (as is done for the customer_id lookup)
# so that repeated payment rows cannot fan out refund rows and inflate the count.
payment_amounts = payments[['transaction_id', 'amount']].drop_duplicates()
joined = refunds_joined.merge(payment_amounts, on='transaction_id', how='left')

joined['over_refund_rule'] = joined['refund_amount'] > OVER_REFUND_TOLERANCE * joined['amount']
suspicious_amounts = joined[joined['over_refund_rule']]
suspicious_amounts.to_csv(out_dir / 'suspicious_refund_amounts.csv', index=False)
len(suspicious_amounts)


80

In [8]:
# Headline row counts from the three checks, keyed by the labels written to the summary CSV.
summary = dict(
    duplicate_refund_txns=int(len(duplicate_refunds)),
    misallocated_refunds=int(len(misallocated)),
    suspicious_amount_refunds=int(len(suspicious_amounts)),
)

# One row per check, with the count in a single 'value' column (labels become the index).
summary_frame = pd.Series(summary).to_frame('value')
summary_frame.to_csv(out_dir / 'leakage_detection_summary.csv')
summary


{'duplicate_refund_txns': 24,
 'misallocated_refunds': 24,
 'suspicious_amount_refunds': 80}

In [13]:
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Re-declares the same project locations used at the top of the notebook.
data_dir = Path('C:/Revenue Leakage')
out_dir = data_dir.joinpath('outputs')

# Reload the refund detail table, this time parsing both timestamp columns as datetimes.
refunded_path = out_dir / 'refunded_detailed.xlsx'
refunded = pd.read_excel(refunded_path, parse_dates=['refund_timestamp', 'timestamp'])


In [14]:
# Derive the numeric inputs used for anomaly detection.

# Hours elapsed between the payment timestamp and the refund timestamp.
refund_time = pd.to_datetime(refunded['refund_timestamp'])
payment_time = pd.to_datetime(refunded['timestamp'])
refunded['time_to_refund_hours'] = (refund_time - payment_time).dt.total_seconds() / 3600.0

# Zero payment amounts are turned into NaN first, so the ratio is missing instead of
# being a division by zero.
nonzero_amount = refunded['amount'].replace(0, np.nan)
refunded['refund_ratio'] = refunded['refund_amount'] / nonzero_amount

# Integer code for each distinct channel value.
refunded['channel_code'] = refunded['channel'].astype('category').cat.codes

# Feature matrix; any missing value in these columns is filled with 0.
feature_columns = ['amount', 'refund_amount', 'time_to_refund_hours', 'refund_ratio', 'channel_code']
features = refunded[feature_columns].fillna(0)


In [15]:
# Standardise the features, then fit an Isolation Forest with a fixed seed and a
# contamination setting of 0.05.
scaler = StandardScaler()
X = scaler.fit_transform(features)

model = IsolationForest(n_estimators=200, contamination=0.05, random_state=42).fit(X)

# Lower decision_function values mean "more anomalous"; predict() labels outliers as -1.
refunded['anomaly_score'] = model.decision_function(X)
predicted_labels = model.predict(X)
refunded['anomaly_flag'] = predicted_labels == -1

# Export the 20 rows with the lowest scores, then show how many rows were flagged overall.
most_anomalous = refunded.sort_values('anomaly_score').head(20)
most_anomalous.to_csv(out_dir / 'anomaly_top20.csv', index=False)
refunded['anomaly_flag'].sum()


np.int64(60)

In [16]:
# Save anomaly-labelled dataset
refunded.to_csv(out_dir / 'refunded_with_anomalies.csv', index=False)

# Basic distribution checks: summary statistics of amount and refund_amount, split by
# anomaly_flag. Kept as the final expression of the cell so the notebook actually
# renders the table; placed before the save, its result was computed and discarded.
refunded.groupby('anomaly_flag')[['amount','refund_amount']].describe()
