In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pyarrow
from dataclasses import dataclass
import yaml
from typing import Union

In [2]:
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\fraud_detection_01\\notebooks'

In [3]:
os.chdir("..")
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\fraud_detection_01'

## Загрузка конфигураций

In [4]:
# Общие настройки
with open("./config/base.yaml") as f:
    base_cfg = yaml.safe_load(f)
# Настройки фрода
with open("./config/fraud.yaml") as f:
    fraud_cfg = yaml.safe_load(f)
# Настройки фрода для дропов
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Настройки времени
with open("./config/time.yaml") as f:
    time_cfg = yaml.safe_load(f)

## Класс `DropBatchProcessor`
- модуль `data_generator.fraud.drops.processor`
- Обработка полученной дропом партии (батча) денег

In [5]:
class DropBatchHandler:
    """
    Обработка полученной дропом партии (батча) денег
    ---------------------------
    drop_type: str. 'distributor' или 'purchaser'
    amt_hand: DropAmountHandler. Генератор сумм входящих/исходящих транзакций, сумм снятий.
              Управление балансом текущего дропа.
    behav_hand: DistBehaviorHandler | PurchBehaviorHandler.
                Управление поведением дропа: распределителя или покупателя.
    create_txn: CreateDropTxn. Создание транзакций.
    declined: bool. По умолчанию False. Отклоняются ли транзакции.
    txns_fm_batch: list. Транзакции дропа в текущем батче.
    """

    def __init__(self, base: DropBaseClasses, create_txn: CreateDropTxn):
        """
        base: DropBaseClasses. Объекты основных классов для дропов.
        create_txn: CreateDropTxn. Создание транзакций.
        """
        self.drop_type = base.drop_type
        self.amt_hand = base.amt_hand
        self.behav_hand = base.behav_hand
        self.create_txn = create_txn
        self.declined = False
        self.txns_fm_batch = []


    def should_decline(self):
        """
        Проверка будет ли отклонена транзакция.
        Возвращает True или False в зависимости от достижения лимитов.
        Также записывает это значение в self.declined
        """
        self.declined = self.create_txn.limit_reached()
        # print("is_declined", self.declined)
        return self.declined


    def reset_cache(self, all=False):
        """
        Сброс кэша.
        --------
        all: bool. Если True то сбрасывает атрибуты: txns_fm_batch, declined.
             Если False то declined не сбрсывает
             Также передается в методы классов:
             DistBehaviorHandler | PurchBehaviorHandler,
             DropAmountHandler.
        """
        behav_hand = self.behav_hand
        amt_hand = self.amt_hand

        self.txns_fm_batch = []
        amt_hand.reset_cache(all=all)
        behav_hand.reset_cache(all=all)

        if not all:
            return
        
        self.declined = False


    def distributor(self):
        """
        Обработка партии(батча) денег полученных дропом
        распределителем.
        """
        behav_hand = self.behav_hand
        amt_hand = self.amt_hand
        create_txn = self.create_txn
        
    
        while amt_hand.balance > 0:
            declined = self.should_decline() # будет ли отклонена транзакция
            behav_hand.guide_scenario()

            if behav_hand.to_crypto: # перевод на криптобиржу или нет
                txn_out = create_txn.purchase(declined=declined)
            else: # Иначе перевод/снятие
                to_drop = behav_hand.to_drop # Пробовать ли перевести другому дропу.
                txn_out = create_txn.trf_or_atm(receive=False, 
                                    to_drop=to_drop, declined=declined)
            # Добавляем в список транз-ций батча   
            self.txns_fm_batch.append(txn_out)

            # Сколько попыток будет после первой откл. транз-ции
            behav_hand.attempts_after_decline()

            # Если это не первая отклоненная транзакция, то вычитаем попытку
            # совершить транзакцию после отклонения
            behav_hand.deduct_attempts()
            
            # Решение об остановке после отклоненной транзакции
            if behav_hand.stop_after_decline():
                break
            

    def purchaser(self):
        """
        Обработка партии(батча) денег полученных дропом
        покупателем.
        """
        behav_hand = self.behav_hand
        amt_hand = self.amt_hand
        create_txn = self.create_txn
        
        while amt_hand.balance > 0:
            declined = self.should_decline() # будет ли отклонена транзакция

            txn_out = create_txn.purchase(declined=declined)
            self.txns_fm_batch.append(txn_out)
            
            # Сколько попыток будет после первой откл. транз-ции
            behav_hand.attempts_after_decline()

            # Если это не первая отклоненная транзакция, то вычитаем попытку
            # совершить транзакцию после отклонения
            behav_hand.deduct_attempts()
            
            # Решение об остановке после отклоненной транзакции
            if behav_hand.stop_after_decline():
                break


    def process_batch(self):
        """
        Вызов соответствующего типу дропа метода для обработки
        батча денег.
        Метод выбирается исходя из self.drop_type.
        ---------
        """
        drop_type = self.drop_type

        if drop_type == "distributor":
            self.distributor()
        elif drop_type == "purchaser":
            self.purchaser()

**Тест `DropBatchProcessor`**

In [2]:
# Временный импорт
import os
import yaml
import pandas as pd
import numpy as np
os.chdir("..")

# Общие настройки
with open("./config/base.yaml") as f:
    base_cfg = yaml.safe_load(f)
# Настройки фрода
with open("./config/fraud.yaml") as f:
    fraud_cfg = yaml.safe_load(f)
# Настройки фрода для дропов
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Настройки времени
with open("./config/time.yaml") as f:
    time_cfg = yaml.safe_load(f)

In [3]:
from data_generator.fraud.drops.build.config import DropConfigBuilder
from data_generator.fraud.drops.build.builder import DropBaseClasses
from data_generator.fraud.drops.txns import CreateDropTxn
from data_generator.fraud.drops.processor import DropBatchHandler
from data_generator.general_time import pd_timestamp_to_unix

drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, fraud_cfg=fraud_cfg, drop_cfg=drops_cfg)
configs = drop_cfg_build.build_dist_cfg()
# configs = drop_cfg_build.build_purch_cfg()

base_agg1 = DropBaseClasses(drop_type="distributor", configs=configs)
# base_agg1 = DropBaseClasses(drop_type="purchaser", configs=configs)
base_agg1.build_all()
# base_agg1.build_all(drop_type="purchaser")
acc_hand1 = base_agg1.acc_hand
amt_hand1 = base_agg1.amt_hand
part_data1 = base_agg1.part_data
time_hand1 = base_agg1.time_hand
behav_hand1 = base_agg1.behav_hand

cr_drop_txn1 = CreateDropTxn(configs=configs, base=base_agg1)
drop_clients = configs.clients

batch_hand = DropBatchHandler(base=base_agg1, create_txn=cr_drop_txn1)

for client in drop_clients.iloc[[8]].itertuples():
    part_data1.client_info = client
    acc_hand1.client_id = client.client_id
    acc_hand1.get_account(own=True)
part_data1.client_info

Pandas(Index=8, client_id=3565, district_id=44, birth_date='1962-07-29', sex='male', region='Ставропольский', area='Ставрополь', timezone='UTC+3', lat=45.0445439, lon=41.9690168, population=398266, home_ip='2.60.13.50', geometry=<MULTIPOLYGON (((41.81 45.006, 41.811 45.006, 41.813 45.007, 41.815 45.007, ...>)

In [4]:
def reset_caches(cr_drop_txn, behav_hand, amt_hand, time_hand, part_data, batch_hand):
    cr_drop_txn.reset_cache(only_counters=False)
    behav_hand.reset_cache(all=True)
    amt_hand.reset_cache(all=True) # batch_txns здесь
    time_hand.reset_cache()
    part_data.reset_cache()
    batch_hand.reset_cache(all=True)

In [4]:
# acc_hand1.outer_accounts.loc[~(acc_hand1.outer_accounts.isin(acc_hand1.used_accounts))].sample(1)
# acc_hand1.outer_accounts.loc[~(acc_hand1.outer_accounts.isin(acc_hand1.used_accounts))]
# acc_hand1.reset_cache()
acc_hand1.used_accounts

Series([], Name: account_id, dtype: object)

**`should_decline`**  
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim - 1
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim - 1
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim + 1
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim + 1

In [5]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim - 1
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim - 1
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim + 1
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim + 1
batch_hand.should_decline()

True

### **Тест метода `distributor`**

**баланс > trf_max** - выводить кол-во попыток через print внутри atts_after_decline()  
4 кейса
- scen=`"split_transfer"`, in_chunks = `True`
- scen=`"atm+transfer`", in_chunks = `True`
- scen=`"atm"`, in_chunks = `False`
- scen=`"transfer"`, in_chunks = `False`

In [10]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns1 = []
start_time = pd.to_datetime("2025-07-02 11:15:00", format="%Y-%m-%d %H:%M:%S")
time_hand1.last_unix = pd_timestamp_to_unix(start_time)
time_hand1.start_unix = pd_timestamp_to_unix(start_time)

In [11]:
while not batch_hand.declined:
    amt_hand1.balance = behav_hand1.trf_max + 1000
    behav_hand1.scen = "transfer"
    behav_hand1.in_chunks = False

    batch_hand.distributor()
    txns_fm_batch1 = batch_hand.txns_fm_batch
    all_txns1.extend(txns_fm_batch1)
    batch_hand.reset_cache(all=False)

attempts 3


In [4]:
all_txns_df = pd.DataFrame(all_txns1)
print(all_txns_df.shape[0])
all_txns_df

**С генерацией входящих транзакций и рандомизацией поведения**
- реализация через цикл наподобие run_drop_lifecycle()

In [7]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns1 = []

while True:
    declined = batch_hand.declined
    receive_txn1 = cr_drop_txn1.trf_or_atm(dist=True, receive=True, to_drop=False, declined=declined)
    all_txns1.append(receive_txn1)
    if declined:
        break
        
    behav_hand1.sample_scenario() # выбрать сценарий
    behav_hand1.in_chunks_val() # транзакции по частям или нет 
    
    batch_hand.process_batch()
    txns_fm_batch1 = batch_hand.txns_fm_batch
    all_txns1.extend(txns_fm_batch1)
    batch_hand.reset_cache(all=False)

In [9]:
all_txns_df = pd.DataFrame(all_txns1)
print(all_txns_df.shape[0])
all_txns_df #.query("type != 'inbound'").shape

16


(11, 19)

### **Тест метода `purchaser`**

**баланс > amt_max** - выводить кол-во попыток через print внутри atts_after_decline()
- scen=`"split_money"`, in_chunks = `True`
- scen=`"transfer"`, in_chunks = `False`

In [7]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns2 = []
start_time = pd.to_datetime("2025-07-02 11:15:00", format="%Y-%m-%d %H:%M:%S")
time_hand1.last_unix = pd_timestamp_to_unix(start_time)
time_hand1.start_unix = pd_timestamp_to_unix(start_time)

In [8]:
while not batch_hand.declined:
    amt_hand1.balance = behav_hand1.amt_max + 1000
    behav_hand1.scen = "one_purchase"
    behav_hand1.in_chunks = False

    batch_hand.purchaser()
    txns_fm_batch2 = batch_hand.txns_fm_batch
    all_txns2.extend(txns_fm_batch2)
    batch_hand.reset_cache(all=False)

attempts 0


In [10]:
all_txns_df2 = pd.DataFrame(all_txns2)
print(all_txns_df2.shape[0])
all_txns_df2

**`purchaser` с генерацией входящих транзакций и рандомизацией поведения**
- реализация через цикл наподобие run_drop_lifecycle()

In [7]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns3 = []

while True:
    declined = batch_hand.declined
    receive_txn3 = cr_drop_txn1.trf_or_atm(receive=True, to_drop=False, declined=declined)
    all_txns3.append(receive_txn3)
    if declined:
        break
        
    behav_hand1.sample_scenario() # выбрать сценарий
    behav_hand1.in_chunks_val() # транзакции по частям или нет 
    
    batch_hand.process_batch()
    txns_fm_batch3 = batch_hand.txns_fm_batch
    all_txns3.extend(txns_fm_batch3)
    batch_hand.reset_cache(all=False)

In [33]:
all_txns_df3 = pd.DataFrame(all_txns3)
print(all_txns_df3.shape[0])
all_txns_df3

# from_7th = all_txns_df3.iloc[7:all_txns_df3.index.max()+1].copy()[["type", "amount", "status"]] #.query("type != 'inbound'").shape
# from_7th["balance"] = -from_7th["amount"]
# from_7th.loc[from_7th.index.min(), "balance"] = from_7th.loc[from_7th.index.min(), "amount"]
# from_7th["balance"] = from_7th["balance"].cumsum()
# from_7th

In [29]:
amt_hand1.balance

np.float64(5700.0)

## Класс `DropLifecycleManager`
- модуль `data_generator.fraud.drops.simulator`
- Полный жизненный цикл дропа

In [None]:
class DropLifecycleManager:
    """
    Управление полным жизненный циклом одного дропа.
    ------------------
    drop_type: str. 'distributor' или 'purchaser'
    acc_hand: DropAccountHandler. Генератор номеров счетов входящих/исходящих транзакций.
              Учет использованных счетов.
    amt_hand: DropAmountHandler. Генератор сумм входящих/исходящих транзакций, сумм снятий.
              Управление балансом текущего дропа.
    time_hand: DropTimeHandler.
               Управление временем транзакций дропа.
    behav_hand: DistBehaviorHandler | PurchBehaviorHandler.
                Управление поведением дропа: распределителя или покупателя.
    part_data: DropTxnPartData.
               Генерация части данных о транзакции дропа.
    behav_hand: DistBehaviorHandler | PurchBehaviorHandler.
                Управление поведением дропа: распределителя или покупателя.
    create_txn: CreateDropTxn. Создание транзакций.
    batch_hand: DropBatchHandler. Обработка полученной дропом партии (батча) денег
    drop_txns: list. Созданные транзакции дропа.
    """
    def __init__(self, base: DropBaseClasses, create_txn: CreateDropTxn):
        """
        base: DropBaseClasses. Объекты основных классов для дропов.
        create_txn: CreateDropTxn. Создание транзакций.
        """
        self.drop_type = base.drop_type
        self.acc_hand = base.acc_hand
        self.amt_hand = base.amt_hand
        self.time_hand = base.time_hand
        self.part_data = base.part_data
        self.behav_hand = base.behav_hand
        self.create_txn = create_txn
        self.batch_hand = DropBatchHandler(base=base, create_txn=create_txn)
        self.drop_txns = []


    def reset_all_caches(self):
        """
        Сброс кэшей когда активность дропа закончена совсем
        """
        # Сброс всего кэша batch_hand включает в себя полный сброс кэша
        # в behav_hand и amt_hand
        self.batch_hand.reset_cache(all=True)
        self.time_hand.reset_cache()
        self.part_data.reset_cache()
        self.create_txn.reset_cache()
        self.drop_txns = []
        

    def run_drop_lifecycle(self):
        # создать счет дропа, записать is_drop = True в таблице acc_hand.accounts
        acc_hand = self.acc_hand
        # получить номер счета дропа. Пишется в атрибут acc_hand.account
        acc_hand.get_account(own=True) 
        acc_hand.label_drop() # помечаем клиента как дропа в таблице acc_hand.accounts
        
        behav_hand = self.behav_hand
        batch_hand = self.batch_hand
        create_txn = self.create_txn

        while True:
            declined = batch_hand.declined # статус транзакции. будет ли она отклонена
            # входящая транзакция. Новый батч денег.
            receive_txn = create_txn.trf_or_atm(declined=declined, \
                                                to_drop=False, receive=True) 
            drop_txns = self.drop_txns
            drop_txns.append(receive_txn)
            # если у дропа достигнут лимит то транзакции отклоняются. 
            # Если входящая отклонена, дропу больше не пытаются послать деньги
            if declined: 
                break

            behav_hand.sample_scenario() # выбрать сценарий
            behav_hand.in_chunks_val() # транзакции по частям или нет

            batch_hand.process_batch() # обработка полученного батча

            txns_fm_batch = batch_hand.txns_fm_batch
            drop_txns.extend(txns_fm_batch)
            # сброс кэша после завершения обработки батча
            batch_hand.reset_cache(all=False)

## **Тест `DropLifecycleManager`**

In [1]:
# Временный импорт
import os
import yaml
import pandas as pd
import numpy as np
os.chdir("..")

# Общие настройки
with open("./config/base.yaml") as f:
    base_cfg = yaml.safe_load(f)
# Настройки фрода
with open("./config/fraud.yaml") as f:
    fraud_cfg = yaml.safe_load(f)
# Настройки фрода для дропов
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Настройки времени
with open("./config/time.yaml") as f:
    time_cfg = yaml.safe_load(f)

In [10]:
from data_generator.fraud.drops.build.config import DropConfigBuilder
from data_generator.fraud.drops.build.builder import DropBaseClasses
from data_generator.fraud.drops.txns import CreateDropTxn
from data_generator.fraud.drops.simulator import DropLifecycleManager

drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, time_cfg=time_cfg, fraud_cfg=fraud_cfg, drop_cfg=drops_cfg)
# configs = drop_cfg_build.build_dist_cfg()
configs = drop_cfg_build.build_purch_cfg()

# base_agg1 = DropBaseClasses(drop_type="distributor", configs=configs)
base_agg1 = DropBaseClasses(drop_type="purchaser", configs=configs)
base_agg1.build_all()
acc_hand1 = base_agg1.acc_hand
# amt_hand1 = base_agg1.amt_hand
part_data1 = base_agg1.part_data
# time_hand1 = base_agg1.time_hand
# behav_hand1 = base_agg1.behav_hand

cr_drop_txn1 = CreateDropTxn(configs=configs, base=base_agg1)
drop_clients = configs.clients

life_manager = DropLifecycleManager(base=base_agg1, create_txn=cr_drop_txn1)
batch_hand = life_manager.batch_hand

for client in drop_clients.iloc[[8]].itertuples():
    part_data1.client_info = client
    acc_hand1.client_id = client.client_id
    acc_hand1.get_account(own=True)
part_data1.client_info
print("ready for tests")

ready for tests


In [11]:
# def reset_caches(cr_drop_txn, behav_hand, amt_hand, time_hand, part_data):
#     cr_drop_txn.reset_cache()
#     behav_hand.reset_cache(all=False)
#     amt_hand.reset_cache(life_end=True) # batch_txns здесь
#     time_hand.reset_cache()
#     part_data.reset_cache()

**`run_drop_lifecycle`**  

In [12]:
all_txns = []
life_manager.run_drop_lifecycle()
drop_txns = life_manager.drop_txns
all_txns.extend(drop_txns)
life_manager.reset_all_caches() # сброс всего кэша после завершения активности дропа

In [13]:
all_txns_df = pd.DataFrame(all_txns)
all_txns_df

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,3872,2025-01-06 00:35:00,1736123700,34100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13665.0,False,False,approved,not applicable
1,3872,2025-01-06 02:36:00,1736130960,19000.0,purchase,ecom,misc_net,True,6791.0,Уфа,54.734853,55.957865,2.60.14.82,6652.0,,False,False,approved,not applicable
2,3872,2025-01-06 03:16:00,1736133360,5000.0,purchase,ecom,travel_net,True,6825.0,Уфа,54.734853,55.957865,2.60.14.82,6651.0,,False,False,approved,not applicable
3,3872,2025-01-06 05:04:00,1736139840,6000.0,purchase,ecom,misc_net,True,6829.0,Уфа,54.734853,55.957865,2.60.14.82,6652.0,,False,False,approved,not applicable
4,3872,2025-01-06 08:00:00,1736150400,4100.0,purchase,ecom,travel_net,True,6824.0,Уфа,54.734853,55.957865,2.60.14.82,6651.0,,False,False,approved,not applicable
5,3872,2025-01-06 09:46:00,1736156760,33900.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13665.0,False,False,approved,not applicable
6,3872,2025-01-07 01:17:00,1736212620,33900.0,purchase,ecom,shopping_net,True,6828.0,Уфа,54.734853,55.957865,2.60.14.82,6652.0,,False,False,approved,not applicable
7,3872,2025-01-07 04:11:00,1736223060,6100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13665.0,False,False,approved,not applicable
8,3872,2025-01-07 05:14:00,1736226840,6100.0,purchase,ecom,shopping_net,True,6898.0,Уфа,54.734853,55.957865,2.60.14.82,6651.0,,False,False,approved,not applicable
9,3872,2025-01-07 07:53:00,1736236380,16000.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13665.0,False,False,approved,not applicable


In [5]:
life_manager.drop_txns

[]

In [5]:
behav_hand1.scen, behav_hand1.in_chunks, behav_hand1.attempts

(None, None, 0)

In [6]:
cr_drop_txn1.last_txn, cr_drop_txn1.in_txns, cr_drop_txn1.out_txns

(None, 0, 0)

In [7]:
amt_hand1.batch_txns, amt_hand1.chunk_size, amt_hand1.balance, amt_hand1.last_amt, amt_hand1.first_decl, amt_hand1.declined_txns

(0, 0, 0, 0, 0, 0)

In [8]:
time_hand1.start_unix, time_hand1.last_unix, time_hand1.in_txns, time_hand1.out_txns,

(0, 0, 0, 0)

In [9]:
part_data1.last_txn

In [12]:
# правильный account_id в aсс_hand.account
own_id = acc_hand1.client_id
own_acc_id = acc_hand1.accounts.query("client_id == @own_id")["account_id"].iat[0]
assert acc_hand1.account == own_acc_id, "aсс_hand.account does not belong to client."

In [13]:
# клиент помечен как дроп в accounts
acc_hand1.accounts.query("client_id == @own_id")

Unnamed: 0,client_id,account_id,is_drop
2057,2174,12057,True


## Класс `DropSimulator`
- модуль `data_generator.fraud.drops.simulator`
- Генерация множества дропов

In [None]:
class DropSimulator:
    """
    Генерация активности множества дропов.
    ------------------------
    base_cfg: dict. Конфиги из base.yaml
    drop_type: str. 'distributor' или 'purchaser'
    drop_clients: pd.DataFrame. Клиенты которые будут дропами.
    part_data: DropTxnPartData. Генерация части данных транзакции.
    acc_hand: DropAccountHandler. Генератор номеров счетов входящих/исходящих 
              транзакций. Учет использованных счетов.
    txn_recorder: FraudTxnsRecorder. Запись транзакций в файл.
    life_manager: DropLifecycleManager. Управление полным жизненный циклом
                  одного дропа.
    all_txns: list. Список для записи всех созданных транзакций.
    txns_df: pd.DataFrame. Пустой датафрейм с колонками и проставленными типами
    """
    def __init__(self, base_cfg, configs, base, create_txn, txn_recorder):
        """
        base_cfg: dict. Конфиги из base.yaml
        configs: DropDistributorCfg | DropPurchaserCfg.
                 Конфиги и данные для создания дроп транзакций.
        base: Объекты основных классов для дропов. 
        create_txn: CreateDropTxn. Создание транзакций.
        """
        self.base_cfg = base_cfg
        self.drop_type = base.drop_type
        self.drop_clients = configs.clients
        self.part_data = base.part_data
        self.acc_hand = base.acc_hand
        self.txn_recorder = txn_recorder
        self.txns_df = configs.transactions
        self.life_manager = DropLifecycleManager(base=base, create_txn=create_txn)
        self.all_txns = []
    

    def write_to_file(self, data, category, file_key):
        """
        Запись данных в файл по пути из yaml конфига.
        data: pd.DataFrame | gpd.DataFrame. Данные.
        category: str. Категория файлов в yaml конфиге например:
                  'cleaned', 'generated'. Соответсвует
                  структуре папок в data/
        file_key: str. Ключ к полному пути конкретного файла в категории.
        """
        path = self.base_cfg["data_paths"][category][file_key]
        file_type = path.split(".")[-1]

        if file_type == "csv":
            return data.to_csv(path, index=False)
        
        if file_type == "gpkg":
            return data.to_file(path, layer="layer_name", driver="GPKG")
        
        if file_type == "parquet":
            return data.to_parquet(path, engine="pyarrow")


    def run(self):
        """
        Полная генерация активности дропов соответсвующего типа
        """
        drop_clients = self.drop_clients
        drop_type = self.drop_type
        progress_bar = create_progress_bar(drop_clients, text=f"Generating {drop_type} drops")
        part_data = self.part_data
        acc_hand = self.acc_hand
        life_manager = self.life_manager
        all_txns = self.all_txns
        txn_recorder = self.txn_recorder

        # Итерируемся через семплированных клиентов под дроп
        for client in drop_clients.itertuples():
            # Запись данных текущего клиента в атрибуты
            # некоторых классов
            part_data.client_info = client
            acc_hand.client_id = client.client_id

            # Генерация полного цикла активности одного дропа
            life_manager.run_drop_lifecycle()
            # Запись транзакций дропа в общий список
            drop_txns = life_manager.drop_txns
            all_txns.extend(drop_txns)

            # Сброс кэша дропа для следующей итерации
            life_manager.reset_all_caches()
            progress_bar.update(1)
        
        # Запись измененного датафрейма accounts в csv файл
        # Путь указывается в base.yaml
        accounts = acc_hand.accounts
        self.write_to_file(data=accounts, category="base", \
                           file_key="accounts")
        
        # Запись всех созданных транзакций дропов в parquet файл
        txn_recorder.all_txns = pd.DataFrame(self.all_txns)
        
        txn_recorder.write_to_file() # Это уже метод FraudTxnsRecorder

**Тест `DropSimulator`**

In [1]:
# Временный импорт
import os
import yaml
import pandas as pd
import numpy as np
import geopandas as gpd
import pyarrow
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

os.chdir("..")

# Общие настройки
with open("./config/base.yaml") as f:
    base_cfg = yaml.safe_load(f)
with open("./config/legit.yaml") as f:
    legit_cfg = yaml.safe_load(f)
# Настройки фрода
with open("./config/fraud.yaml") as f:
    fraud_cfg = yaml.safe_load(f)
# Настройки фрода для дропов
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Настройки времени
with open("./config/time.yaml") as f:
    time_cfg = yaml.safe_load(f)

client_devices = pd.read_csv("./data/base/client_devices.csv")
print("OK")

OK


In [2]:
# datetime_suffix = datetime.now().strftime("%Y-%m-%d_%H%M%S")
# datetime_suffix = "2025-07-20_110919"
# run_dir = "generation_run_" + datetime_suffix
# history_dir = base_cfg["data_paths"]["generated"]["history"]
# run_dir_path = os.path.join(history_dir, run_dir)
# if not os.path.exists(run_dir_path):
#     os.mkdir(run_dir_path)
run_dir = "./data/generated/history/generation_run_2025-07-21_154309"
print("OK")

OK


In [1]:
# from data_generator.fraud.drops.build.config import DropConfigBuilder
# from data_generator.fraud.drops.build.builder import DropBaseClasses
# from data_generator.fraud.drops.txns import CreateDropTxn
# from data_generator.fraud.drops.simulator import DropLifecycleManager, DropSimulator
# from data_generator.fraud.recorder import FraudTxnsRecorder

# # drop_type = "distributor"
# drop_type = "purchaser"

# drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, legit_cfg=legit_cfg, time_cfg=time_cfg, \
#                                    fraud_cfg=fraud_cfg, drop_cfg=drops_cfg, run_dir=run_dir)

# if drop_type == "distributor":
#     configs = drop_cfg_build.build_dist_cfg()
# elif drop_type == "purchaser":
#     configs = drop_cfg_build.build_purch_cfg()

# base = DropBaseClasses(drop_type=drop_type, configs=configs)
# base.build_all()
# # acc_hand1 = base.acc_hand
# # amt_hand1 = base.amt_hand
# # part_data1 = base.part_data
# # time_hand1 = base.time_hand
# # behav_hand1 = base.behav_hand

# create_txn = CreateDropTxn(configs=configs, base=base)
# drop_clients = configs.clients
# txn_recorder = FraudTxnsRecorder(configs)

# # life_manager = DropLifecycleManager(base=base, create_txn=create_txn)
# # batch_hand = life_manager.batch_hand
# drop_sim = DropSimulator(base_cfg=base_cfg, configs=configs, base=base, create_txn=create_txn, txn_recorder=txn_recorder)
# print("ready for tests")

In [1]:
import os
import pandas as pd
os.chdir("..")

In [75]:


from data_generator.utils import load_configs
from data_generator.validator import ConfigsValidator 
from data_generator.runner.utils import make_dir_for_run
from data_generator.runner.drops import DropsRunner


# Общие настройки
base_cfg = load_configs("./config/base.yaml")
# Настройки легальных транзакций
legit_cfg = load_configs("./config/legit.yaml")
# Общие настройки фрода
fraud_cfg = load_configs("./config/fraud.yaml")
# Настройки compromised client фрода
drop_cfg = load_configs("./config/drops.yaml", encoding="utf8")
# Настройки времени
time_cfg = load_configs("./config/time.yaml")

run_dir = r".\data\generated\history\generation_run_2025-07-24_093233" #make_dir_for_run(base_cfg=base_cfg)
print(run_dir)

# drop_type = "distributor"
drop_type = "purchaser"

# text = f"{drop_type.capitalize()} drops generation"

drop_runner = DropsRunner(base_cfg=base_cfg, legit_cfg=legit_cfg, \
                            time_cfg=time_cfg, fraud_cfg=fraud_cfg, \
                            drops_cfg=drop_cfg, run_dir=run_dir, \
                            drop_type=drop_type)
print("ready for run")

.\data\generated\history\generation_run_2025-07-24_093233
ready for run


In [76]:
drop_runner.run()
# all_txns = drop_runner.txn_recorder.all_txns

Purchaser drops generation... completed.          


In [77]:
# run_dir = r".\data\generated\history\generation_run_2025-07-24_093233"

In [78]:
path_dist = os.path.join(run_dir, "dist_drops", "dist_drops.parquet")
dist_drops = pd.read_parquet(path_dist)
dist_drops

Unnamed: 0,client_id,city_id,birth_date,sex,region,city,timezone,lat,lon,population,home_ip
0,4304,5,1970-06-09,female,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,2.60.15.234
1,712,54,1987-11-28,male,Свердловская,Екатеринбург,UTC+5,56.838633,60.605489,1377738,2.60.2.168
2,4338,30,1969-10-19,male,Вологодская,Вологда,UTC+3,59.248419,39.835646,301642,2.60.16.12
3,4057,70,1947-06-05,female,Новосибирская,Новосибирск,UTC+7,55.028102,82.921058,1498921,2.60.15.1
4,5389,17,1993-08-25,male,Свердловская,Нижний Тагил,UTC+5,57.910104,59.981324,361883,2.60.18.106
5,2733,3,1952-11-04,female,Удмуртская,Ижевск,UTC+4,56.852744,53.211396,628117,2.60.10.27
6,4781,9,1991-08-09,female,Кемеровская,Новокузнецк,UTC+7,53.794276,87.214405,547885,2.60.17.170


In [79]:
path_purch = os.path.join(run_dir, "purch_drops", "purchase_drops.parquet")
purch_drops = pd.read_parquet(path_purch)
purch_drops

Unnamed: 0,client_id,city_id,birth_date,sex,region,city,timezone,lat,lon,population,home_ip
0,2060,43,1996-05-28,female,Ивановская,Иваново,UTC+3,56.999468,40.972823,409277,2.60.7.154
1,4319,43,1971-10-31,male,Ивановская,Иваново,UTC+3,56.999468,40.972823,409277,2.60.15.249
2,3321,13,1994-05-24,female,Краснодарский,Сочи,UTC+3,43.585583,39.723142,343285,2.60.12.74
3,2602,37,1991-08-14,female,Северная Осетия - Алания,Владикавказ,UTC+3,43.020504,44.681938,311635,2.60.9.165
4,612,36,1985-11-04,female,Пермский,Пермь,UTC+5,58.010321,56.234178,1000679,2.60.2.71


In [80]:
accounts = pd.read_csv("./data/generated/latest/accounts.csv")

In [81]:
drops = accounts.query("is_drop == True")
drops

Unnamed: 0,client_id,account_id,is_drop
582,612,10582,True
679,712,10679,True
1945,2060,11945,True
2468,2602,12468,True
2586,2733,12586,True
3145,3321,13145,True
3840,4057,13840,True
4073,4304,14073,True
4088,4319,14088,True
4107,4338,14107,True


In [82]:
drop_ids = pd.concat([purch_drops,dist_drops]).client_id.unique()
drop_ids

array([2060, 4319, 3321, 2602,  612, 4304,  712, 4338, 4057, 5389, 2733,
       4781])

In [83]:
assert drops.loc[~drops.client_id.isin(drop_ids)].empty

In [84]:
path_hist = os.path.join(run_dir, "accounts.csv")
drops_fm_hist = pd.read_csv(path_hist).query("is_drop == True")

In [85]:
accs_merged = drops_fm_hist.merge(drops, how="outer", on="client_id")
accs_merged

Unnamed: 0,client_id,account_id_x,is_drop_x,account_id_y,is_drop_y
0,612,10582,True,10582,True
1,712,10679,True,10679,True
2,2060,11945,True,11945,True
3,2602,12468,True,12468,True
4,2733,12586,True,12586,True
5,3321,13145,True,13145,True
6,4057,13840,True,13840,True
7,4304,14073,True,14073,True
8,4319,14088,True,14088,True
9,4338,14107,True,14107,True


In [86]:
assert accs_merged.isna().any().any() == False

In [87]:
# all_txns = pd.read_parquet("./data/generated/latest/all_txns.parquet")
# all_txns_out = all_txns.query("type != 'inbound' and client_id in @drop_ids")
# all_txns_out.shape

In [88]:
dist_txns_path = os.path.join(run_dir, "dist_drops", "dist_drop_txns.parquet")
dist_txns = pd.read_parquet(dist_txns_path)
purch_txns_path = os.path.join(run_dir, "purch_drops", "purch_drop_txns.parquet")
purch_txns = pd.read_parquet(purch_txns_path)
all_txns = pd.concat([dist_txns, purch_txns])
all_txns_out = all_txns.query("type != 'inbound' and client_id in @drop_ids")
all_txns_out.shape

(120, 19)

In [89]:
txns_n_accs = all_txns_out.merge(accounts, left_on="account", right_on="account_id")
txns_n_accs.head(2)

Unnamed: 0,client_id_x,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,client_id_y,account_id,is_drop
0,4304,2025-01-15 23:22:00,1736983320,16400.0,withdrawal,ATM,not applicable,False,,Ростов-на-Дону,...,not applicable,,14073.0,False,False,approved,not applicable,4304,14073,True
1,4304,2025-01-16 06:20:00,1737008400,20800.0,withdrawal,ATM,not applicable,False,,Ростов-на-Дону,...,not applicable,,14073.0,False,False,approved,not applicable,4304,14073,True


In [93]:
drop_to_drop = txns_n_accs.query("client_id_x != client_id_y").drop_duplicates()
print(drop_to_drop.shape)
drop_to_drop.head(2)

(16, 22)


Unnamed: 0,client_id_x,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,client_id_y,account_id,is_drop
4,712,2025-01-08 16:38:00,1736354280,700.0,outbound,transfer,not applicable,True,,Екатеринбург,...,2.60.2.168,1249.0,14073.0,False,False,approved,not applicable,4304,14073,True
5,712,2025-01-09 16:43:00,1736440980,4000.0,outbound,transfer,not applicable,True,,Екатеринбург,...,2.60.2.168,1250.0,14073.0,False,False,approved,not applicable,4304,14073,True


In [96]:
drop_to_drop.client_id_x.nunique()

6

In [97]:
drop_to_drop[["client_id_x", "client_id_y"]].value_counts()

client_id_x  client_id_y
712          4304           4
4338         4304           3
2733         4338           2
4781         4338           2
5389         712            2
4057         4304           1
4781         2733           1
5389         4057           1
Name: count, dtype: int64

In [51]:
compr_path = os.path.join(run_dir, "compromised", "compr_client_txns.parquet")
compr_txns = pd.read_parquet(compr_path)
compr_txns.head(2)

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,2315,2025-01-20 13:41:46,1737380506,2500.0,purchase,ecom,shopping_net,True,6844.0,Челябинск,55.160366,61.400786,5.8.14.245,11922.0,,True,False,approved,not applicable
1,2315,2025-01-20 13:45:46,1737380746,4558.74,purchase,ecom,grocery_net,True,6844.0,Челябинск,55.160366,61.400786,5.8.14.245,11922.0,,True,False,approved,not applicable


In [53]:
compr_txns.rule.value_counts()

rule
trans_freq_increase              100
not applicable                    93
new_ip_and_device_high_amount     35
fast_geo_change_online            30
new_device_and_high_amount        25
fast_geo_change                   18
Name: count, dtype: int64

In [5]:
drop_cfg_build.estimate_drops_count(drop_type)

2

In [3]:
all_txns.shape, all_txns.drop_duplicates().shape

((26, 19), (26, 19))

In [4]:
drop_clients.shape, drop_sim.drop_clients.shape

NameError: name 'drop_clients' is not defined

In [5]:
# drop_sim.all_txns

In [6]:
all_txns.head(2)

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,394,2025-01-07 07:28:00,1736234880,60500.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10373.0,False,False,approved,not applicable
1,394,2025-01-07 08:25:00,1736238300,23000.0,purchase,ecom,travel_net,True,6881.0,Челябинск,55.160366,61.400786,2.60.1.118,687.0,,False,False,approved,not applicable


In [23]:
txn_recorder.directory

'./data/generated/history/generation_run_2025-07-21_154309\\purch_drops'

In [6]:
if drop_type == "distributor":
    file_name = "dist_drop_txns.parquet"
elif drop_type == "purchaser":
    file_name = "purch_drop_txns.parquet"

txns_path = os.path.join(drop_runner.txn_recorder.directory, file_name)
# txns_path = txn_recorder.directory
txns_from_run_dir = pd.read_parquet(txns_path)
txns_from_run_dir.head(2)

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,243,2025-01-14 20:49:00,1736887740,20800.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10229.0,False,False,approved,not applicable
1,243,2025-01-14 22:19:00,1736893140,14000.0,purchase,ecom,shopping_net,True,6782.0,Чебоксары,56.143938,47.248872,2.60.0.230,419.0,,False,False,approved,not applicable


In [7]:
key_latest = drop_runner.txn_recorder.key_latest
latest_path = base_cfg["data_paths"]["generated"][key_latest]
txns_fm_latest = pd.read_parquet(latest_path)
txns_fm_latest.head(2)

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,243,2025-01-14 20:49:00,1736887740,20800.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10229.0,False,False,approved,not applicable
1,243,2025-01-14 22:19:00,1736893140,14000.0,purchase,ecom,shopping_net,True,6782.0,Чебоксары,56.143938,47.248872,2.60.0.230,419.0,,False,False,approved,not applicable


In [8]:
all_txns.shape, txns_from_run_dir.shape, txns_fm_latest.shape

((30, 19), (30, 19), (30, 19))

In [11]:
# txns_from_run_dir_direct = pd.read_parquet( \
#         r"C:\Users\iaros\My_documents\Education\projects\fraud_detection_01\data\generated\history\generation_run_2025-07-21_154309\purch_drops\purch_drop_txns.parquet")

In [10]:
# txns_from_run_dir_direct.tail(2)

In [13]:
# txns_from_run_dir_direct02 = pd.read_parquet('./data/generated/history/generation_run_2025-07-21_154309\\purch_drops')
# txns_from_run_dir_direct02.tail()

In [14]:
# txns_merged = txns_from_run_dir.merge(txns_fm_latest, how="left", on=["client_id", "unix_time"])
# txns_merged.loc[txns_merged.trans_city_.isna()]

In [9]:
txns_merged.client_id.unique()

NameError: name 'txns_merged' is not defined

In [10]:
all_txns_minmax = all_txns.client_id.value_counts().agg(["min", "max"])
all_txns_minmax

min    14
max    16
Name: count, dtype: int64

In [11]:
max_client = all_txns.client_id.value_counts().idxmax()
max_client

np.int64(4404)

In [12]:
max_client_txns = all_txns.query("client_id == @max_client")
max_client_txns

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
14,4404,2025-01-09 10:14:00,1736417640,24200.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,14163.0,False,False,approved,not applicable
15,4404,2025-01-09 12:50:00,1736427000,24200.0,purchase,ecom,shopping_net,True,6892.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
16,4404,2025-01-09 15:28:00,1736436480,36000.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,14163.0,False,False,approved,not applicable
17,4404,2025-01-10 12:46:00,1736513160,14000.0,purchase,ecom,shopping_net,True,6959.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
18,4404,2025-01-10 13:40:00,1736516400,15000.0,purchase,ecom,shopping_net,True,6950.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
19,4404,2025-01-10 16:34:00,1736526840,3000.0,purchase,ecom,travel_net,True,6934.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
20,4404,2025-01-10 19:28:00,1736537280,4000.0,purchase,ecom,travel_net,True,6918.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
21,4404,2025-01-10 21:29:00,1736544540,63700.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,14163.0,False,False,approved,not applicable
22,4404,2025-01-11 00:23:00,1736554980,39000.0,purchase,ecom,shopping_net,True,6889.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable
23,4404,2025-01-11 14:41:00,1736606460,18000.0,purchase,ecom,shopping_net,True,6838.0,Томск,56.484704,84.948174,2.60.16.68,7560.0,,False,False,approved,not applicable


In [13]:
max_client_txns.type.value_counts()

type
purchase    11
inbound      5
Name: count, dtype: int64

In [14]:
all_txns.query("type != 'inbound'")[["client_id","trans_city", "trans_lat", "trans_lon"]].drop_duplicates().shape

(2, 4)

In [15]:
all_txns.client_id.nunique()

2

In [21]:
# проверка что нет дропов у кого не свой девайс

assert all_txns.merge(client_devices, on="device_id").query("client_id_x != client_id_y").empty

NameError: name 'client_devices' is not defined

In [16]:
# кол-во approved и declined вх. транз. в рамках лимитов

inbound_by_status = all_txns.query("type == 'inbound'").groupby(["client_id","status"], as_index=False).agg({"unix_time":"count"}) \
                                .rename(columns={"unix_time":"txns_count"})
inbound_by_status.head(2)

Unnamed: 0,client_id,status,txns_count
0,243,approved,2
1,243,declined,1


In [17]:
inbound_by_status[["status", "txns_count"]].groupby("status").agg(["min", "max"])

Unnamed: 0_level_0,txns_count,txns_count
Unnamed: 0_level_1,min,max
status,Unnamed: 1_level_2,Unnamed: 2_level_2
approved,2,4
declined,1,1


In [18]:
# кол-во approved и declined исх. транз. в рамках лимитов

outbound_by_status = all_txns.query("type != 'inbound'").groupby(["client_id","status"], as_index=False).agg({"unix_time":"count"}) \
                                .rename(columns={"unix_time":"txns_count"})
outbound_by_status.head(2)

Unnamed: 0,client_id,status,txns_count
0,243,approved,8
1,243,declined,3


In [19]:
outbound_by_status[["status", "txns_count"]].groupby("status").agg(["min", "max"])

Unnamed: 0_level_0,txns_count,txns_count
Unnamed: 0_level_1,min,max
status,Unnamed: 1_level_2,Unnamed: 2_level_2
approved,8,8
declined,3,3


клиенты записаны в дропы. проверка записанного accounts.csv

In [26]:
accounts1 = pd.read_csv("./data/base/accounts.csv")
accounts1.head(2)

Unnamed: 0,client_id,account_id,is_drop
0,1,10000,False
1,2,10001,False


In [28]:
drop_clients = drop_runner.configs.clients

assert accounts1.loc[accounts1.client_id.isin(drop_clients.client_id)].query("is_drop == False").empty

дропы не пересекаются с другими сегментами: легальные/compr фрод,  дропы другого типа

In [29]:
leg_path = os.path.join(run_dir, "legit", "legit_clients.parquet")
clients_sample = pd.read_parquet(leg_path)
clients_sample.shape

(500, 11)

In [30]:
if drop_type == "distributor":
    other_drops_path = os.path.join(run_dir, "purch_drops", "purchase_drops.parquet") # purchasers
elif drop_type == "purchaser":
    other_drops_path = os.path.join(run_dir, "dist_drops", "dist_drops.parquet") # distributors
other_drops = pd.read_parquet(other_drops_path)
exclude_clients = pd.concat([clients_sample, other_drops], ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: './data/generated/history/generation_run_2025-07-21_154309\\purch_drops\\purchase_drops.parquet'

In [31]:
# Если другой тип дропов не генерировался
assert clients_sample.loc[clients_sample.client_id.isin(all_txns.client_id.unique())].empty

In [28]:
assert exclude_clients.loc[exclude_clients.client_id.isin(all_txns.client_id.unique())].empty

транзакции записаны в файл

In [32]:
drop_txns = pd.read_parquet("./data/generated/latest/dist_drop_txns.parquet", engine="pyarrow") # distributors
# drop_txns = pd.read_parquet("./data/generated/latest/purch_drop_txns.parquet", engine="pyarrow") # purchasers
print(drop_txns.shape)
drop_txns.head(3)

(30, 19)


Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,3081,2025-01-06 10:46:00,1736160360,86800.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,12916.0,False,False,approved,not applicable
1,3081,2025-01-06 13:30:00,1736170200,27400.0,withdrawal,ATM,not applicable,False,,Ростов-на-Дону,47.222436,39.718787,not applicable,,12916.0,False,False,approved,not applicable
2,3081,2025-01-06 14:49:00,1736174940,37000.0,outbound,transfer,not applicable,True,,Ростов-на-Дону,47.222436,39.718787,2.60.11.101,5285.0,21549.0,False,False,approved,not applicable


In [71]:
drop_txns_hist = pd.read_parquet(r".\data\generated\history\dist_drop_2025-07-20_094115\dist_drop_txns.parquet", engine="pyarrow") # distributors
# drop_txns_hist = pd.read_parquet(r".\data\generated\history\purch_drop_2025-07-20_092817\purch_drop_txns.parquet", engine="pyarrow") # purchasers
print(drop_txns_hist.shape)
drop_txns_hist.head(3)

(266, 19)


Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,3861,2025-01-18 02:24:00,1737167040,32100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13655.0,False,False,approved,not applicable
1,3861,2025-01-18 04:36:00,1737174960,13000.0,outbound,transfer,not applicable,True,,Улан-Удэ,51.833438,107.584151,2.60.14.72,6634.0,22201.0,False,False,approved,not applicable
2,3861,2025-01-18 05:45:00,1737179100,13000.0,outbound,transfer,not applicable,True,,Улан-Удэ,51.833438,107.584151,2.60.14.72,6633.0,22165.0,False,False,approved,not applicable


мин и макс разницы во времени у дропов в рамках лимитов

In [33]:
# Добавим колонку со временем предыдущей транзакции каждого клиента
for client in drop_clients.itertuples():
    client_id = client.client_id
    all_txns.loc[all_txns.client_id == client_id, "prev_txn_unix"] \
                = all_txns.loc[all_txns.client_id == client_id, "unix_time"].shift(1)

In [34]:
# all_txns_df.drop(columns="prev_time_diff_m", inplace=True)

In [35]:
# Разница между предыдущей транзакцией клиента в минутах

all_txns["prev_time_diff"] = all_txns.unix_time.sub(all_txns.prev_txn_unix).div(60)
all_txns.head(3)

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,prev_txn_unix,prev_time_diff
0,3081,2025-01-06 10:46:00,1736160360,86800.0,inbound,transfer,not applicable,True,,not applicable,...,,not applicable,,12916.0,False,False,approved,not applicable,,
1,3081,2025-01-06 13:30:00,1736170200,27400.0,withdrawal,ATM,not applicable,False,,Ростов-на-Дону,...,39.718787,not applicable,,12916.0,False,False,approved,not applicable,1736160000.0,164.0
2,3081,2025-01-06 14:49:00,1736174940,37000.0,outbound,transfer,not applicable,True,,Ростов-на-Дону,...,39.718787,2.60.11.101,5285.0,21549.0,False,False,approved,not applicable,1736170000.0,79.0


In [36]:
# timedelta

all_txns["prev_time_diff_m"] = pd.to_timedelta(all_txns["prev_time_diff"], unit="m")
all_txns.head()

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,prev_txn_unix,prev_time_diff,prev_time_diff_m
0,3081,2025-01-06 10:46:00,1736160360,86800.0,inbound,transfer,not applicable,True,,not applicable,...,not applicable,,12916.0,False,False,approved,not applicable,,,NaT
1,3081,2025-01-06 13:30:00,1736170200,27400.0,withdrawal,ATM,not applicable,False,,Ростов-на-Дону,...,not applicable,,12916.0,False,False,approved,not applicable,1736160000.0,164.0,0 days 02:44:00
2,3081,2025-01-06 14:49:00,1736174940,37000.0,outbound,transfer,not applicable,True,,Ростов-на-Дону,...,2.60.11.101,5285.0,21549.0,False,False,approved,not applicable,1736170000.0,79.0,0 days 01:19:00
3,3081,2025-01-06 15:44:00,1736178240,18000.0,outbound,transfer,not applicable,True,,Ростов-на-Дону,...,2.60.11.101,5285.0,18646.0,False,False,approved,not applicable,1736175000.0,55.0,0 days 00:55:00
4,3081,2025-01-06 16:42:00,1736181720,3000.0,outbound,transfer,not applicable,True,,Ростов-на-Дону,...,2.60.11.101,5286.0,21559.0,False,False,approved,not applicable,1736178000.0,58.0,0 days 00:58:00


In [38]:
all_txns["prev_time_diff_m"].agg(["min","max"])

min   0 days 00:31:00
max   0 days 23:14:00
Name: prev_time_diff_m, dtype: timedelta64[ns]