In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed transactions table that the rest of the notebook works on.
processed_csv = '../data/processed_data.csv'
df = pd.read_csv(processed_csv)

In [3]:
# Show which columns the loaded frame provides before new features are added.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'Hour', 'Transaction_Count_Per_Hour',
       'Transaction_Mean_Per_Hour', 'Amount_Ratio'],
      dtype='object')

In [4]:
# Put the transactions in chronological order and renumber the index.
# kind='stable' keeps rows that share the same Time value in their original
# file order. The default quicksort gives no such guarantee, so the order of
# tied rows (and therefore the row order of anything exported later) could
# otherwise differ between runs or environments.
df = df.sort_values('Time', kind='stable').reset_index(drop=True)

print(f"Veri seti boyutu: {df.shape}")

Veri seti boyutu: (284807, 35)


In [6]:
# Gap between each row's Time and the previous row's Time.
# The first row has no predecessor, so its NaN gap is replaced with 0.
gap_to_previous = df['Time'].diff()
df['Time_Delta'] = gap_to_previous.fillna(0)
df.loc[:, ['Time', 'Time_Delta']].head()

Unnamed: 0,Time,Time_Delta
0,0.0,0.0
1,0.0,0.0
2,1.0,1.0
3,1.0,0.0
4,2.0,1.0


In [7]:
# Bucket every transaction into a block of 3600 Time units (floor division),
# stored as an integer block id.
df['Event_Block'] = df['Time'].floordiv(3600).astype(int)

In [8]:
# Number of rows per Event_Block, ordered by block id; preview the first few blocks.
rows_per_block = df['Event_Block'].value_counts().sort_index()
print(rows_per_block.head())

Event_Block
0    3963
1    2217
2    1576
3    1821
4    1082
Name: count, dtype: int64


### Neden Sıralama (Ranking)?
Fraud zamana bağlıdır.
Bağımsız sınıflandırma yerine, saatlik bloklardaki en şüpheli işlemleri üste taşıyan **Ranking** yapısına geçiyorum.
Bu yaklaşım, analistlerin alarm listesini tarama mantığıyla (yukarıdan aşağıya) birebir örtüşür.

In [9]:
# Chronological split boundaries, expressed as Event_Block ids.
# The shares apply to the highest block id (not to the row count), and int()
# truncates, so each cutoff is rounded down to a whole block.
max_block = df['Event_Block'].max()
train_cutoff, val_cutoff = (int(max_block * share) for share in (0.70, 0.85))

In [10]:
# Partition by block id so whole blocks stay together:
#   train: blocks up to and including train_cutoff
#   val:   blocks after train_cutoff, up to and including val_cutoff
#   test:  every later block
block_id = df['Event_Block']
train_df = df.loc[block_id.le(train_cutoff)]
val_df = df.loc[block_id.gt(train_cutoff) & block_id.le(val_cutoff)]
test_df = df.loc[block_id.gt(val_cutoff)]

In [11]:
# Columns excluded from the feature matrix: the target itself plus the raw
# timestamp and the block id that were used to build the split.
cols_to_drop = ['Class', 'Time', 'Event_Block']

# Target and features for the training split.
y_train = train_df['Class']
X_train = train_df.drop(columns=cols_to_drop)

In [12]:
# Same feature/target separation for the validation and test splits,
# reusing the cols_to_drop list defined for the training split.
X_val, y_val = val_df.drop(columns=cols_to_drop), val_df['Class']

X_test, y_test = test_df.drop(columns=cols_to_drop), test_df['Class']

In [13]:
# Group sizes for ranking: the number of rows in each Event_Block, listed in
# the order the blocks appear in the rows (sort=False keeps first-appearance order).
# These counts only line up with the feature rows when every block occupies one
# contiguous run of rows, so verify the ordering here instead of silently
# relying on the earlier sort cell having been executed.
assert all(
    split['Event_Block'].is_monotonic_increasing
    for split in (train_df, val_df, test_df)
), "Rows must be ordered by Event_Block before computing group sizes"

q_train = train_df.groupby('Event_Block', sort=False).size().to_list()
q_val = val_df.groupby('Event_Block', sort=False).size().to_list()
q_test = test_df.groupby('Event_Block', sort=False).size().to_list()

In [14]:
# Report row count and number of groups for each split, one line per split.
print(
    f"Train Seti : {len(X_train)} satır, {len(q_train)} grup (Saatlik Blok)",
    f"Val Seti   : {len(X_val)} satır, {len(q_val)} grup",
    f"Test Seti  : {len(X_test)} satır, {len(q_test)} grup",
    sep="\n",
)

Train Seti : 167622 satır, 33 grup (Saatlik Blok)
Val Seti   : 57243 satır, 7 grup
Test Seti  : 59942 satır, 8 grup


In [16]:
import os

In [17]:
# Folder that receives the exported splits; it is created (including missing
# parents) when absent and left untouched when it already exists.
output_dir = '../data/ranking'
os.makedirs(name=output_dir, exist_ok=True)

In [18]:
# Write each split to its own CSV in output_dir. The full frames are saved
# (all columns, including Class, Time and Event_Block) without the index.
export_plan = (
    (train_df, f'{output_dir}/train.csv'),
    (val_df, f'{output_dir}/val.csv'),
    (test_df, f'{output_dir}/test.csv'),
)
for split_frame, csv_path in export_plan:
    split_frame.to_csv(csv_path, index=False)