In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, glob, pprint, hashlib

In [None]:
# Mount Google Drive inside the Colab runtime, forcing a fresh mount on every run.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Folder layout: processed parquet files live in "<DATA_DIR>/processed".
# NOTE(review): DATA_DIR is an empty string here, so PROC_DIR resolves to '/processed';
# presumably it should point at the project folder inside the mounted Drive — confirm.
DATA_DIR = ''
PROC_DIR = f'{DATA_DIR}/processed'

In [None]:
# Sanity check: does the processed folder exist, and which parquet files (first ten) are in it?
print("PROC_DIR exists?", os.path.exists(PROC_DIR))
parquet_files = sorted(glob.glob(PROC_DIR + "/*.parquet"))
print(parquet_files[:10])


def _read_processed(filename):
    """Read one parquet file located in PROC_DIR and return it as a DataFrame."""
    return pd.read_parquet(os.path.join(PROC_DIR, filename))


# Cleaned datasets: the full "principal" set plus its train/validation/test files.
principal = _read_processed('principal_clean.parquet')
train_pr = _read_processed('train_clean.parquet')
valid_pr = _read_processed('valid_clean.parquet')
test_pr = _read_processed('test_clean.parquet')

### **Visualização dos dados**



In [None]:
# Distribution of the paid amounts on a log1p scale.
# `assign` returns a new frame, so `principal` is rebound to a copy carrying the extra column.
principal = principal.assign(
    amount_log=np.log1p(principal['amount_paid']).astype('float32')
)

ax = principal['amount_log'].plot.hist(bins=120)
ax.set(
    title='Principal: distribuição de log1p(amount_paid)',
    xlabel='log1p(amount_paid)',
)
plt.show()

In [None]:
# Daily volume: number of transactions and total amount paid per calendar day.
amount_by_time = principal.set_index('timestamp')['amount_paid']
daily = amount_by_time.resample('D').agg(['count', 'sum'])

# One figure per aggregate, each rendered before the next one is drawn.
for daily_column, plot_title in (
    ('count', 'Principal: transações por dia'),
    ('sum', 'Principal: soma de valores por dia'),
):
    daily[[daily_column]].plot(title=plot_title)
    plt.show()

In [None]:
# Class imbalance: number of rows per label value, ordered by label (0, 1).
vc = principal['label'].value_counts().sort_index()
labelled_counts = vc.rename({0: 'legítimas', 1: 'ilícitas'})

ax = labelled_counts.plot.bar()
ax.set(title='Análise do balanceamento das classes', ylabel='contagem')
# Log scale on the y axis so both bars stay readable when counts differ by orders of magnitude.
ax.set_yscale('log')

# Write the exact count (with thousands separators) just above each bar.
for bar_pos, n_rows in enumerate(vc.to_numpy()):
    ax.text(bar_pos, n_rows, f'{n_rows:,}', ha='center', va='bottom', fontsize=9)

plt.show()

### **Feature engineering**

In [7]:
def _hash_bucket(series: pd.Series, n_buckets: int = 4096, salt: str = "cp_v1") -> pd.Series:
    def _h(x: str) -> int:
        if x is None or (isinstance(x, float) and np.isnan(x)):
            x = "UNK"
        s = (salt + str(x)).encode("utf-8")
        return int(hashlib.md5(s).hexdigest(), 16) % n_buckets
    return series.astype("string").map(_h).astype("int32")


In [8]:
def _add_party_history_features(pdf, col_key, col_ts, col_amt, prefix, count_col, K, alpha):
    """Add causal per-account history features for one side of the transaction, in place.

    ``pdf`` must already be sorted by time and carry a unique index. Every
    amount statistic is computed on the account's amounts shifted by one row
    within the account, so the current transaction never contributes to its
    own features.

    Columns written: ``count_col`` plus ``{prefix}_amt_sum_prev``,
    ``{prefix}_amt_mean_prev``, ``{prefix}_amt_std_prev``,
    ``{prefix}_secs_since_last``, ``{prefix}_roll_mean_K``,
    ``{prefix}_roll_std_K``, ``{prefix}_roll_sum_K`` and ``{prefix}_ewm_mean``.
    """
    by_party = pdf.groupby(col_key, sort=False)

    # Number of earlier rows for the same account (0 for its first appearance).
    pdf[count_col] = by_party.cumcount().astype('int32')

    # Amount of the account's previous row; NaN on its first appearance.
    prev_amt = by_party[col_amt].shift()
    # Group the shifted amounts once and reuse the grouping for every statistic.
    prev_by_party = prev_amt.groupby(pdf[col_key])

    def _flat(per_group):
        # Window results are indexed by (account, row label); drop the account
        # level so that the assignment aligns on the row label.
        return per_group.reset_index(level=0, drop=True)

    # Expanding statistics over everything seen before the current row (0 when undefined).
    pdf[f'{prefix}_amt_sum_prev'] = _flat(prev_by_party.expanding().sum()).fillna(0).astype('float32')
    pdf[f'{prefix}_amt_mean_prev'] = _flat(prev_by_party.expanding().mean()).fillna(0).astype('float32')
    pdf[f'{prefix}_amt_std_prev'] = _flat(prev_by_party.expanding().std()).fillna(0).astype('float32')

    # Seconds elapsed since the account's previous row; -1 marks "no previous row".
    pdf[f'{prefix}_secs_since_last'] = (
        (pdf[col_ts] - by_party[col_ts].shift()).dt.total_seconds().fillna(-1).astype('float32')
    )

    # Rolling statistics over the last K shifted amounts. The std needs two
    # observations; min(2, K) keeps the call valid when K == 1.
    pdf[f'{prefix}_roll_mean_K'] = _flat(prev_by_party.rolling(K, min_periods=1).mean()).astype('float32')
    pdf[f'{prefix}_roll_std_K'] = (
        _flat(prev_by_party.rolling(K, min_periods=min(2, K)).std()).fillna(0).astype('float32')
    )
    pdf[f'{prefix}_roll_sum_K'] = _flat(prev_by_party.rolling(K, min_periods=1).sum()).astype('float32')

    # Exponentially weighted mean of the shifted amounts. `transform` returns a
    # result indexed like its input, so no index surgery is needed and the
    # alignment does not depend on how `groupby.apply` labels its output.
    pdf[f'{prefix}_ewm_mean'] = prev_by_party.transform(
        lambda s: s.ewm(alpha=alpha, adjust=False).mean()
    ).astype('float32')


def add_time_safe_tx_features(
    pdf,
    col_ts='timestamp',
    col_src='src_account',
    col_dst='dst_account',
    col_amt='amount_paid',
    K=50,
    alpha=0.3,
    n_buckets=4096
):
    """Return a time-sorted copy of ``pdf`` with leakage-safe transaction features.

    The input frame is not modified. Rows are sorted by ``col_ts`` and every
    history feature of a row only uses rows that come before it for the same
    account.

    Parameters
    ----------
    pdf : pd.DataFrame
        Transactions; must contain ``col_ts`` (datetime-like), ``col_src``,
        ``col_dst`` and ``col_amt``. Optional columns ``src_bank``/``dst_bank``,
        ``pay_currency``/``recv_currency`` or ``currency_pair`` are used when present.
    col_ts, col_src, col_dst, col_amt : str
        Names of the timestamp, sender, receiver and amount columns.
    K : int, default 50
        Window length (in previous transactions) of the rolling statistics.
    alpha : float, default 0.3
        Smoothing factor of the exponentially weighted mean.
    n_buckets : int, default 4096
        Number of hash buckets for the currency-pair feature.

    Returns
    -------
    pd.DataFrame
        The sorted copy, keeping the original index labels, with these added
        columns: ``_dow``, ``_hour``, ``_amt_log``, ``_same_bank``,
        ``currency_pair``, ``cp_bucket``, the ``_src_*`` and ``_dst_*`` history
        features, and ``_src_amt_z`` / ``_dst_amt_z``.
    """
    # Sort by time on a copy. A stable sort keeps rows with equal timestamps in
    # their input order, so the "previous transaction" features are reproducible.
    pdf = pdf.sort_values(col_ts, kind='mergesort').copy()

    # Work on a positional index so the aligned assignments below also work when
    # the caller's index has duplicate labels; the labels are restored on return.
    original_index = pdf.index
    pdf.index = pd.RangeIndex(len(pdf))

    # Calendar features: day of week and hour of day.
    pdf['_dow'] = pdf[col_ts].dt.dayofweek.astype('int8')
    pdf['_hour'] = pdf[col_ts].dt.hour.astype('int8')

    # log1p of the amount to reduce skew.
    pdf['_amt_log'] = np.log1p(pdf[col_amt]).astype('float32')

    # Flag transactions whose sender and receiver banks are equal (0 when bank columns are absent).
    if 'src_bank' in pdf.columns and 'dst_bank' in pdf.columns:
        pdf['_same_bank'] = (pdf['src_bank'] == pdf['dst_bank']).astype('int8')
    else:
        # Same dtype as the branch above so the schema does not depend on the input.
        pdf['_same_bank'] = pd.Series(0, index=pdf.index, dtype='int8')

    # Currency pair ("PAY->RECV") and its hashed bucket.
    if 'pay_currency' in pdf.columns and 'recv_currency' in pdf.columns:
        pdf['currency_pair'] = (pdf['pay_currency'].fillna('UNK') + '->' + pdf['recv_currency'].fillna('UNK')).astype('string')
        pdf['cp_bucket'] = _hash_bucket(pdf['currency_pair'], n_buckets=n_buckets, salt="cp_v1")
    elif 'currency_pair' in pdf.columns:
        pdf['currency_pair'] = pdf['currency_pair'].astype('string')
        pdf['cp_bucket'] = _hash_bucket(pdf['currency_pair'], n_buckets=n_buckets, salt="cp_v1")
    else:
        # No currency information: constant placeholders with the same dtypes as above.
        pdf['currency_pair'] = pd.Series('UNK', index=pdf.index, dtype='string')
        pdf['cp_bucket'] = pd.Series(0, index=pdf.index, dtype='int32')

    # History of the sender, then the same logic for the receiver.
    _add_party_history_features(pdf, col_src, col_ts, col_amt, '_src', '_src_tx_count_prev', K, alpha)
    _add_party_history_features(pdf, col_dst, col_ts, col_amt, '_dst', '_dst_rx_count_prev', K, alpha)

    # Standardize the current amount against each account's previous mean/std.
    # NOTE(review): when an account has fewer than two previous transactions the
    # previous std is filled with 0, so the denominator is just `eps` and the
    # score can become very large — confirm that downstream code expects this.
    eps = 1e-6
    pdf['_src_amt_z'] = ((pdf[col_amt] - pdf['_src_amt_mean_prev']) / (pdf['_src_amt_std_prev'] + eps)).astype('float32')
    pdf['_dst_amt_z'] = ((pdf[col_amt] - pdf['_dst_amt_mean_prev']) / (pdf['_dst_amt_std_prev'] + eps)).astype('float32')

    # Give the caller back the (time-sorted) original index labels.
    pdf.index = original_index
    return pdf

In [9]:
# Build the time-safe features for the main dataset and store them next to the inputs.
principal_feats = add_time_safe_tx_features(principal)
principal_feats_path = os.path.join(PROC_DIR, 'principal_feats.parquet')
principal_feats.to_parquet(principal_feats_path, index=False)

# The tail/OOD dataset is optional: it is featurized and saved only when its
# parquet file exists; otherwise tail_feats_path stays None.
tail_path = os.path.join(PROC_DIR, 'tail_ood_clean.parquet')
tail_feats_path = None
if os.path.exists(tail_path):
    tail_ood = pd.read_parquet(tail_path)
    tail_ood_feats = add_time_safe_tx_features(tail_ood)
    tail_feats_path = os.path.join(PROC_DIR, 'tail_ood_feats.parquet')
    tail_ood_feats.to_parquet(tail_feats_path, index=False)