In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pyarrow
from dataclasses import dataclass
import yaml
from typing import Union

In [2]:
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\fraud_detection_01\\notebooks'

In [3]:
# Step up to the project root only when the config directory is not visible
# from the current directory, so re-running this cell does not climb one
# level higher on every execution.
if not os.path.isdir("./config"):
    os.chdir("..")
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\fraud_detection_01'

## Загрузка конфигураций

In [4]:
# All config files are read with an explicit UTF-8 encoding so the result does
# not depend on the platform's locale default (drops.yaml already did this).

# General settings
with open("./config/base.yaml", encoding="utf8") as f:
    base_cfg = yaml.safe_load(f)
# Fraud settings
with open("./config/fraud.yaml", encoding="utf8") as f:
    fraud_cfg = yaml.safe_load(f)
# Fraud settings for drops
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Time settings
with open("./config/time.yaml", encoding="utf8") as f:
    time_cfg = yaml.safe_load(f)

## Класс `DropBatchHandler`
- модуль `data_generator.fraud.drops.processor`
- Обработка полученной дропом партии (батча) денег

In [5]:
class DropBatchHandler:
    """
    Processes a batch of money received by a drop.
    ---------------------------
    drop_type: str. 'distributor' or 'purchaser'.
    amt_hand: DropAmountHandler. Generator of amounts for incoming/outgoing
              transactions and withdrawals. Manages the current drop's balance.
    behav_hand: DistBehaviorHandler | PurchBehaviorHandler.
                Controls the drop's behaviour: distributor or purchaser.
    create_txn: CreateDropTxn. Creates transactions.
    declined: bool. False by default. Whether transactions are being declined.
    txns_fm_batch: list. The drop's transactions in the current batch.
    """

    # The annotations are strings because DropBaseClasses and CreateDropTxn are
    # not imported in the cell that defines this class; on Python versions that
    # evaluate annotations eagerly, bare names would raise NameError here.
    def __init__(self, base: "DropBaseClasses", create_txn: "CreateDropTxn"):
        """
        base: DropBaseClasses. Objects of the main drop classes.
        create_txn: CreateDropTxn. Creates transactions.
        """
        self.drop_type = base.drop_type
        self.amt_hand = base.amt_hand
        self.behav_hand = base.behav_hand
        self.create_txn = create_txn
        self.declined = False
        self.txns_fm_batch = []


    def should_decline(self):
        """
        Checks whether the next transaction will be declined.
        Returns True or False depending on whether the limits are reached
        (the value reported by create_txn.limit_reached()).
        The same value is also stored in self.declined.
        """
        self.declined = self.create_txn.limit_reached()
        return self.declined


    def reset_cache(self, all=False):
        """
        Resets the cache.
        --------
        all: bool. If True, resets both txns_fm_batch and declined.
             If False, declined is left untouched.
             The flag is also forwarded to reset_cache() of
             DistBehaviorHandler | PurchBehaviorHandler and DropAmountHandler.
             (The name shadows the builtin `all`; it is kept because callers
             pass it as a keyword.)
        """
        # Rebind to a fresh list instead of clearing in place: callers may
        # still hold a reference to the previous batch list.
        self.txns_fm_batch = []
        self.amt_hand.reset_cache(all=all)
        self.behav_hand.reset_cache(all=all)

        if all:
            self.declined = False


    def _stop_after_txn(self):
        """
        Post-transaction bookkeeping shared by both drop types.
        Returns the value of behav_hand.stop_after_decline().
        """
        behav_hand = self.behav_hand

        # How many attempts there will be after the first declined transaction
        behav_hand.attempts_after_decline()

        # If this is not the first declined transaction, deduct one attempt
        # to make a transaction after a decline
        behav_hand.deduct_attempts()

        # Decision on stopping after a declined transaction
        return behav_hand.stop_after_decline()


    def distributor(self):
        """
        Processes a batch of money received by a distributor drop.
        Transactions are created while the balance is positive; each one is
        appended to self.txns_fm_batch. The loop also ends as soon as
        behav_hand.stop_after_decline() returns a truthy value.
        """
        behav_hand = self.behav_hand
        amt_hand = self.amt_hand
        create_txn = self.create_txn

        while amt_hand.balance > 0:
            declined = self.should_decline() # whether the transaction will be declined
            behav_hand.guide_scenario()

            if behav_hand.to_crypto: # transfer to a crypto exchange or not
                txn_out = create_txn.purchase(declined=declined)
            else: # otherwise a transfer/withdrawal
                txn_out = create_txn.trf_or_atm(
                    receive=False,
                    to_drop=behav_hand.to_drop, # whether to try transferring to another drop
                    declined=declined,
                )
            # Add to the list of the batch's transactions
            self.txns_fm_batch.append(txn_out)

            if self._stop_after_txn():
                break


    def purchaser(self):
        """
        Processes a batch of money received by a purchaser drop.
        Purchases are created while the balance is positive; each one is
        appended to self.txns_fm_batch. The loop also ends as soon as
        behav_hand.stop_after_decline() returns a truthy value.
        """
        amt_hand = self.amt_hand
        create_txn = self.create_txn

        while amt_hand.balance > 0:
            declined = self.should_decline() # whether the transaction will be declined

            txn_out = create_txn.purchase(declined=declined)
            self.txns_fm_batch.append(txn_out)

            if self._stop_after_txn():
                break


    def process_batch(self):
        """
        Calls the batch-processing method that matches the drop type.
        The method is chosen based on self.drop_type.
        ---------
        Raises ValueError if self.drop_type is neither 'distributor' nor
        'purchaser', instead of silently processing nothing.
        """
        drop_type = self.drop_type

        if drop_type == "distributor":
            self.distributor()
        elif drop_type == "purchaser":
            self.purchaser()
        else:
            raise ValueError(
                f"Unknown drop_type {drop_type!r}: expected 'distributor' or 'purchaser'"
            )

**Тест `DropBatchHandler`**

In [2]:
# Temporary imports
import os
import yaml
import pandas as pd
import numpy as np

# Step up to the project root only when the config directory is not visible
# from the current directory, so re-running this cell does not climb one
# level higher on every execution.
if not os.path.isdir("./config"):
    os.chdir("..")

# All config files are read with an explicit UTF-8 encoding so the result does
# not depend on the platform's locale default (drops.yaml already did this).

# General settings
with open("./config/base.yaml", encoding="utf8") as f:
    base_cfg = yaml.safe_load(f)
# Fraud settings
with open("./config/fraud.yaml", encoding="utf8") as f:
    fraud_cfg = yaml.safe_load(f)
# Fraud settings for drops
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Time settings
with open("./config/time.yaml", encoding="utf8") as f:
    time_cfg = yaml.safe_load(f)

In [3]:
# Project imports for the DropBatchHandler tests.
# NOTE: importing DropBatchHandler here rebinds the name to the packaged
# implementation, replacing any class of the same name defined earlier in
# this kernel session.
from data_generator.fraud.drops.build.config import DropConfigBuilder
from data_generator.fraud.drops.build.builder import DropBaseClasses
from data_generator.fraud.drops.txns import CreateDropTxn
from data_generator.fraud.drops.processor import DropBatchHandler
from data_generator.general_time import pd_timestamp_to_unix

# Build the drop configuration. The distributor variant is active; the
# commented-out lines below are the purchaser alternatives.
drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, fraud_cfg=fraud_cfg, drop_cfg=drops_cfg)
configs = drop_cfg_build.build_dist_cfg()
# configs = drop_cfg_build.build_purch_cfg()

base_agg1 = DropBaseClasses(drop_type="distributor", configs=configs)
# base_agg1 = DropBaseClasses(drop_type="purchaser", configs=configs)
base_agg1.build_all()
# base_agg1.build_all(drop_type="purchaser")
# Short aliases for the handlers exposed by base_agg1; later test cells
# manipulate them directly.
acc_hand1 = base_agg1.acc_hand
amt_hand1 = base_agg1.amt_hand
part_data1 = base_agg1.part_data
time_hand1 = base_agg1.time_hand
behav_hand1 = base_agg1.behav_hand

cr_drop_txn1 = CreateDropTxn(configs=configs, base=base_agg1)
drop_clients = configs.clients

batch_hand = DropBatchHandler(base=base_agg1, create_txn=cr_drop_txn1)

# iloc[[8]] selects the single row at position 8, so the loop body runs once
# and registers that client as the current drop.
for client in drop_clients.iloc[[8]].itertuples():
    part_data1.client_info = client
    acc_hand1.client_id = client.client_id
    acc_hand1.get_account(own=True)
part_data1.client_info

Pandas(Index=8, client_id=3565, district_id=44, birth_date='1962-07-29', sex='male', region='Ставропольский', area='Ставрополь', timezone='UTC+3', lat=45.0445439, lon=41.9690168, population=398266, home_ip='2.60.13.50', geometry=<MULTIPOLYGON (((41.81 45.006, 41.811 45.006, 41.813 45.007, 41.815 45.007, ...>)

In [4]:
def reset_caches(cr_drop_txn, behav_hand, amt_hand, time_hand, part_data, batch_hand):
    """
    Resets the caches of all objects taking part in the drop tests.

    Calls reset_cache() on every argument, in this order:
    cr_drop_txn (only_counters=False), behav_hand and amt_hand (all=True),
    time_hand and part_data (no arguments), batch_hand (all=True).
    Returns None.
    """
    cr_drop_txn.reset_cache(only_counters=False)

    # Handlers that get a full reset; amt_hand's cache is where batch_txns lives.
    for full_reset_target in (behav_hand, amt_hand):
        full_reset_target.reset_cache(all=True)

    # Handlers whose reset_cache() is called without arguments.
    for plain_target in (time_hand, part_data):
        plain_target.reset_cache()

    batch_hand.reset_cache(all=True)

In [4]:
# acc_hand1.outer_accounts.loc[~(acc_hand1.outer_accounts.isin(acc_hand1.used_accounts))].sample(1)
# acc_hand1.outer_accounts.loc[~(acc_hand1.outer_accounts.isin(acc_hand1.used_accounts))]
# acc_hand1.reset_cache()
acc_hand1.used_accounts

Series([], Name: account_id, dtype: object)

**`should_decline`**  
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim - 1
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim - 1
- cr_drop_txn1.out_txns = cr_drop_txn1.out_lim + 1
- cr_drop_txn1.in_txns = cr_drop_txn1.in_lim + 1

In [5]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim - 1
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim - 1
# cr_drop_txn1.out_txns = cr_drop_txn1.out_lim + 1
# cr_drop_txn1.in_txns = cr_drop_txn1.in_lim + 1
batch_hand.should_decline()

True

### **Тест метода `distributor`**

**баланс > trf_max** - выводить кол-во попыток через print внутри attempts_after_decline()  
4 кейса
- scen=`"split_transfer"`, in_chunks = `True`
- scen=`"atm+transfer"`, in_chunks = `True`
- scen=`"atm"`, in_chunks = `False`
- scen=`"transfer"`, in_chunks = `False`

In [10]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns1 = []
start_time = pd.to_datetime("2025-07-02 11:15:00", format="%Y-%m-%d %H:%M:%S")
time_hand1.last_unix = pd_timestamp_to_unix(start_time)
time_hand1.start_unix = pd_timestamp_to_unix(start_time)

In [11]:
# Keep processing batches until the handler reports a declined transaction.
while not batch_hand.declined:
    # Force the test case before every batch: balance above trf_max,
    # "transfer" scenario, not split into chunks.
    amt_hand1.balance = behav_hand1.trf_max + 1000
    behav_hand1.scen = "transfer"
    behav_hand1.in_chunks = False

    batch_hand.distributor()
    txns_fm_batch1 = batch_hand.txns_fm_batch
    all_txns1.extend(txns_fm_batch1)
    # Partial reset between batches: per the reset_cache docstring, all=False
    # leaves `declined` untouched, which is what lets this loop terminate.
    batch_hand.reset_cache(all=False)

attempts 3


In [4]:
all_txns_df = pd.DataFrame(all_txns1)
print(all_txns_df.shape[0])
all_txns_df

**С генерацией входящих транзакций и рандомизацией поведения**
- реализация через цикл наподобие run_drop_lifecycle()

In [7]:
# Start from a clean state for every handler, including the used accounts.
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns1 = []

# Alternate between an incoming transaction and processing the received batch.
while True:
    declined = batch_hand.declined
    # NOTE(review): `dist=True` is not passed by any other trf_or_atm call in
    # this notebook — confirm the parameter still exists in CreateDropTxn.
    receive_txn1 = cr_drop_txn1.trf_or_atm(dist=True, receive=True, to_drop=False, declined=declined)
    all_txns1.append(receive_txn1)
    # The incoming transaction is recorded first, so a declined one is still
    # the last entry before the loop stops.
    if declined:
        break
        
    behav_hand1.sample_scenario() # choose a scenario
    behav_hand1.in_chunks_val() # whether transactions are made in chunks or not
    
    batch_hand.process_batch()
    txns_fm_batch1 = batch_hand.txns_fm_batch
    all_txns1.extend(txns_fm_batch1)
    # Partial reset: called with all=False, which per the reset_cache docstring
    # leaves `declined` untouched for the check at the top of the loop.
    batch_hand.reset_cache(all=False)

In [9]:
all_txns_df = pd.DataFrame(all_txns1)
print(all_txns_df.shape[0])
all_txns_df #.query("type != 'inbound'").shape

16


(11, 19)

### **Тест метода `purchaser`**

**баланс > amt_max** - выводить кол-во попыток через print внутри attempts_after_decline()
- scen=`"split_money"`, in_chunks = `True`
- scen=`"transfer"`, in_chunks = `False`

In [7]:
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns2 = []
start_time = pd.to_datetime("2025-07-02 11:15:00", format="%Y-%m-%d %H:%M:%S")
time_hand1.last_unix = pd_timestamp_to_unix(start_time)
time_hand1.start_unix = pd_timestamp_to_unix(start_time)

In [8]:
# NOTE(review): the setup cell for these tests builds the distributor
# configuration by default; presumably its purchaser lines must be enabled
# before running this cell — confirm.
# Keep processing batches until the handler reports a declined transaction.
while not batch_hand.declined:
    # Force the test case before every batch: balance above amt_max,
    # "one_purchase" scenario, not split into chunks.
    amt_hand1.balance = behav_hand1.amt_max + 1000
    behav_hand1.scen = "one_purchase"
    behav_hand1.in_chunks = False

    batch_hand.purchaser()
    txns_fm_batch2 = batch_hand.txns_fm_batch
    all_txns2.extend(txns_fm_batch2)
    # Partial reset between batches: per the reset_cache docstring, all=False
    # leaves `declined` untouched, which is what lets this loop terminate.
    batch_hand.reset_cache(all=False)

attempts 0


In [10]:
all_txns_df2 = pd.DataFrame(all_txns2)
print(all_txns_df2.shape[0])
all_txns_df2

**`purchaser` с генерацией входящих транзакций и рандомизацией поведения**
- реализация через цикл наподобие run_drop_lifecycle()

In [7]:
# Start from a clean state for every handler, including the used accounts.
reset_caches(cr_drop_txn1, behav_hand1, amt_hand1, time_hand1, part_data1, batch_hand)
acc_hand1.reset_cache()
all_txns3 = []

# Alternate between an incoming transaction and processing the received batch.
while True:
    declined = batch_hand.declined
    receive_txn3 = cr_drop_txn1.trf_or_atm(receive=True, to_drop=False, declined=declined)
    all_txns3.append(receive_txn3)
    # The incoming transaction is recorded first, so a declined one is still
    # the last entry before the loop stops.
    if declined:
        break
        
    behav_hand1.sample_scenario() # choose a scenario
    behav_hand1.in_chunks_val() # whether transactions are made in chunks or not
    
    batch_hand.process_batch()
    txns_fm_batch3 = batch_hand.txns_fm_batch
    all_txns3.extend(txns_fm_batch3)
    # Partial reset: called with all=False, which per the reset_cache docstring
    # leaves `declined` untouched for the check at the top of the loop.
    batch_hand.reset_cache(all=False)

In [30]:
all_txns_df3 = pd.DataFrame(all_txns3)
print(all_txns_df3.shape[0])
all_txns_df3

# from_7th = all_txns_df3.iloc[7:all_txns_df3.index.max()+1].copy()[["type", "amount", "status"]] #.query("type != 'inbound'").shape
# from_7th["balance"] = -from_7th["amount"]
# from_7th.loc[from_7th.index.min(), "balance"] = from_7th.loc[from_7th.index.min(), "amount"]
# from_7th["balance"] = from_7th["balance"].cumsum()
# from_7th

14


Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,3565,2025-01-18 02:17:00,1737166620,40200.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13377.0,False,False,approved,not applicable
1,3565,2025-01-18 03:27:00,1737170820,24000.0,outbound,transfer,not applicable,True,,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,18376.0,False,False,approved,not applicable
2,3565,2025-01-18 04:18:00,1737173880,16200.0,outbound,transfer,not applicable,True,,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,19169.0,False,False,approved,not applicable
3,3565,2025-01-18 06:28:00,1737181680,44000.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13377.0,False,False,approved,not applicable
4,3565,2025-01-19 00:33:00,1737246780,19400.0,withdrawal,ATM,not applicable,False,,Ставрополь,45.044544,41.969017,not applicable,,13377.0,False,False,approved,not applicable
5,3565,2025-01-19 02:23:00,1737253380,24000.0,outbound,transfer,not applicable,True,,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,22998.0,False,False,approved,not applicable
6,3565,2025-01-19 04:22:00,1737260520,600.0,outbound,transfer,not applicable,True,,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,16475.0,False,False,approved,not applicable
7,3565,2025-01-19 06:36:00,1737268560,49700.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,13377.0,False,False,approved,not applicable
8,3565,2025-01-19 08:49:00,1737276540,24000.0,purchase,crypto_exchange,balance_top_up,True,6904.0,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,,False,False,approved,not applicable
9,3565,2025-01-19 11:14:00,1737285240,17000.0,outbound,transfer,not applicable,True,,Ставрополь,45.044544,41.969017,2.60.13.50,6097.0,20669.0,False,False,approved,not applicable


In [29]:
amt_hand1.balance

np.float64(5700.0)

## Класс `DropLifecycleManager`
- Полный жизненный цикл дропа

In [None]:
# ----------ВСТАВИТЬ ГОТОВЫЙ КЛАСС-------------------------

## **Тест `DropLifecycleManager`**

In [1]:
# Temporary imports
import os
import yaml
import pandas as pd
import numpy as np

# Step up to the project root only when the config directory is not visible
# from the current directory, so re-running this cell does not climb one
# level higher on every execution.
if not os.path.isdir("./config"):
    os.chdir("..")

# All config files are read with an explicit UTF-8 encoding so the result does
# not depend on the platform's locale default (drops.yaml already did this).

# General settings
with open("./config/base.yaml", encoding="utf8") as f:
    base_cfg = yaml.safe_load(f)
# Fraud settings
with open("./config/fraud.yaml", encoding="utf8") as f:
    fraud_cfg = yaml.safe_load(f)
# Fraud settings for drops
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Time settings
with open("./config/time.yaml", encoding="utf8") as f:
    time_cfg = yaml.safe_load(f)

In [2]:
from data_generator.fraud.drops.build.config import DropConfigBuilder
from data_generator.fraud.drops.build.builder import DropBaseClasses
from data_generator.fraud.drops.txns import CreateDropTxn
from data_generator.fraud.drops.simulator import DropLifecycleManager

# Build the drop configuration. The purchaser variant is active; the
# commented-out lines are the distributor alternatives.
drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, fraud_cfg=fraud_cfg, drop_cfg=drops_cfg)
# configs = drop_cfg_build.build_dist_cfg()
configs = drop_cfg_build.build_purch_cfg()

# base_agg1 = DropBaseClasses(drop_type="distributor", configs=configs)
base_agg1 = DropBaseClasses(drop_type="purchaser", configs=configs)
base_agg1.build_all()
# All handler aliases are defined here: the cells that inspect the caches
# after run_drop_lifecycle() read amt_hand1, time_hand1 and behav_hand1, so
# leaving them commented out makes those cells fail on a fresh kernel.
acc_hand1 = base_agg1.acc_hand
amt_hand1 = base_agg1.amt_hand
part_data1 = base_agg1.part_data
time_hand1 = base_agg1.time_hand
behav_hand1 = base_agg1.behav_hand

cr_drop_txn1 = CreateDropTxn(configs=configs, base=base_agg1)
drop_clients = configs.clients

life_manager = DropLifecycleManager(base=base_agg1, create_txn=cr_drop_txn1)
batch_hand = life_manager.batch_hand

# iloc[[8]] selects the single row at position 8, so the loop body runs once
# and registers that client as the current drop.
for client in drop_clients.iloc[[8]].itertuples():
    part_data1.client_info = client
    acc_hand1.client_id = client.client_id
    acc_hand1.get_account(own=True)
part_data1.client_info

Pandas(Index=8, client_id=3048, district_id=16, birth_date='1923-01-06', sex='female', region='Ярославская', area='Ярославль', timezone='UTC+3', lat=57.6216145, lon=39.897878, population=591486, home_ip='2.60.11.70', geometry=<MULTIPOLYGON (((39.729 57.727, 39.73 57.727, 39.731 57.727, 39.732 57.728, ...>)

In [3]:
# def reset_caches(cr_drop_txn, behav_hand, amt_hand, time_hand, part_data):
#     cr_drop_txn.reset_cache()
#     behav_hand.reset_cache(all=False)
#     amt_hand.reset_cache(life_end=True) # batch_txns здесь
#     time_hand.reset_cache()
#     part_data.reset_cache()

**`run_drop_lifecycle`**  

In [3]:
# Run one full drop lifecycle and collect its transactions.
all_txns = []
life_manager.run_drop_lifecycle()
drop_txns = life_manager.drop_txns
# Copy the transactions into all_txns before the caches are reset below.
all_txns.extend(drop_txns)
life_manager.reset_all_caches() # reset the whole cache after the drop's activity has finished

In [4]:
all_txns_df = pd.DataFrame(all_txns)
len(all_txns_df)

16

In [5]:
life_manager.drop_txns

[]

In [5]:
behav_hand1.scen, behav_hand1.in_chunks, behav_hand1.attempts

(None, None, 0)

In [6]:
cr_drop_txn1.last_txn, cr_drop_txn1.in_txns, cr_drop_txn1.out_txns

(None, 0, 0)

In [7]:
amt_hand1.batch_txns, amt_hand1.chunk_size, amt_hand1.balance, amt_hand1.last_amt, amt_hand1.first_decl, amt_hand1.declined_txns

(0, 0, 0, 0, 0, 0)

In [8]:
time_hand1.start_unix, time_hand1.last_unix, time_hand1.in_txns, time_hand1.out_txns,

(0, 0, 0, 0)

In [9]:
part_data1.last_txn

In [12]:
# правильный account_id в aсс_hand.account
own_id = acc_hand1.client_id
own_acc_id = acc_hand1.accounts.query("client_id == @own_id")["account_id"].iat[0]
assert acc_hand1.account == own_acc_id, "aсс_hand.account does not belong to client."

In [13]:
# клиент помечен как дроп в accounts
acc_hand1.accounts.query("client_id == @own_id")

Unnamed: 0,client_id,account_id,is_drop
2057,2174,12057,True


## Класс `DropSimulator`
- Генерация множества дропов

In [None]:
# ----------ВСТАВИТЬ ГОТОВЫЙ КЛАСС-------------------------

**Тест `DropSimulator`**

In [1]:
# Temporary imports
import os
import yaml
import pandas as pd
import numpy as np
import geopandas as gpd
import pyarrow

# Step up to the project root only when the config directory is not visible
# from the current directory, so re-running this cell does not climb one
# level higher on every execution.
if not os.path.isdir("./config"):
    os.chdir("..")

# All config files are read with an explicit UTF-8 encoding so the result does
# not depend on the platform's locale default (drops.yaml already did this).

# General settings
with open("./config/base.yaml", encoding="utf8") as f:
    base_cfg = yaml.safe_load(f)
# Fraud settings
with open("./config/fraud.yaml", encoding="utf8") as f:
    fraud_cfg = yaml.safe_load(f)
# Fraud settings for drops
with open("./config/drops.yaml", encoding="utf8") as f:
    drops_cfg = yaml.safe_load(f)
# Time settings
with open("./config/time.yaml", encoding="utf8") as f:
    time_cfg = yaml.safe_load(f)

client_devices = pd.read_csv("./data/cleaned_data/client_devices.csv")

In [2]:
from data_generator.fraud.drops.build.config import DropConfigBuilder
from data_generator.fraud.drops.build.builder import DropBaseClasses
from data_generator.fraud.drops.txns import CreateDropTxn
from data_generator.fraud.drops.simulator import DropLifecycleManager, DropSimulator

# Single switch for the drop type under test; it selects both the config
# builder method and the DropBaseClasses drop_type below.
# drop_type = "distributor"
drop_type = "purchaser"

drop_cfg_build = DropConfigBuilder(base_cfg=base_cfg, fraud_cfg=fraud_cfg, drop_cfg=drops_cfg)

# NOTE: any other drop_type value leaves `configs` undefined here.
if drop_type == "distributor":
    configs = drop_cfg_build.build_dist_cfg()
elif drop_type == "purchaser":
    configs = drop_cfg_build.build_purch_cfg()

base = DropBaseClasses(drop_type=drop_type, configs=configs)
base.build_all()
# acc_hand1 = base.acc_hand
# amt_hand1 = base.amt_hand
# part_data1 = base.part_data
# time_hand1 = base.time_hand
# behav_hand1 = base.behav_hand

create_txn = CreateDropTxn(configs=configs, base=base)
drop_clients = configs.clients

# life_manager = DropLifecycleManager(base=base, create_txn=create_txn)
# batch_hand = life_manager.batch_hand
drop_sim = DropSimulator(base_cfg=base_cfg, configs=configs, base=base, create_txn=create_txn)
print("ready for tests")

ready for tests


In [3]:
drop_sim.run()
all_txns = drop_sim.all_txns

100%|██████████| 28/28 [00:00<00:00, 50.69it/s]


In [4]:
all_txns_df = pd.DataFrame(all_txns)
all_txns_df.shape, all_txns_df.drop_duplicates().shape

((442, 19), (442, 19))

In [5]:
drop_clients.shape

(28, 12)

In [6]:
all_txns_df.head()

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,677,2025-01-28 03:46:00,1738035960,39100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10645.0,False,False,approved,not applicable
1,677,2025-01-28 06:11:00,1738044660,39100.0,purchase,ecom,travel_net,True,6802.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable
2,677,2025-01-28 07:30:00,1738049400,18900.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10645.0,False,False,approved,not applicable
3,677,2025-01-29 05:00:00,1738126800,8000.0,purchase,ecom,shopping_net,True,6943.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable
4,677,2025-01-29 06:07:00,1738130820,6000.0,purchase,ecom,shopping_net,True,6787.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable


In [7]:
all_txns_minmax = all_txns_df.client_id.value_counts().agg(["min", "max"])
all_txns_minmax

min    13
max    19
Name: count, dtype: int64

In [8]:
max_client = all_txns_df.client_id.value_counts().idxmax()
max_client

np.int64(11978)

In [13]:
max_client_txns = all_txns_df.query("client_id == @max_client")
max_client_txns

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
230,11978,2025-01-18 02:30:00,1737167400,57100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,15226.0,False,False,approved,not applicable
231,11978,2025-01-18 03:53:00,1737172380,12000.0,purchase,ecom,shopping_net,True,6781.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
232,11978,2025-01-18 06:33:00,1737181980,32000.0,purchase,ecom,shopping_net,True,6947.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
233,11978,2025-01-18 09:18:00,1737191880,13100.0,purchase,ecom,misc_net,True,6946.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
234,11978,2025-01-18 10:23:00,1737195780,37100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,15226.0,False,False,approved,not applicable
235,11978,2025-01-19 03:42:00,1737258120,37100.0,purchase,ecom,travel_net,True,6777.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
236,11978,2025-01-19 05:57:00,1737266220,34800.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,15226.0,False,False,approved,not applicable
237,11978,2025-01-19 08:02:00,1737273720,23000.0,purchase,ecom,shopping_net,True,6846.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
238,11978,2025-01-19 10:05:00,1737281100,11800.0,purchase,ecom,travel_net,True,6906.0,Екатеринбург,56.838633,60.605489,2.60.20.107,9403.0,,False,False,approved,not applicable
239,11978,2025-01-19 12:23:00,1737289380,21400.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,15226.0,False,False,approved,not applicable


In [14]:
max_client_txns.type.value_counts()

type
purchase    13
inbound      6
Name: count, dtype: int64

In [15]:
all_txns_df.query("type != 'inbound'")[["client_id","trans_city", "trans_lat", "trans_lon"]].drop_duplicates().shape

(28, 4)

In [16]:
all_txns_df.client_id.nunique()

28

In [17]:
# проверка что нет дропов у кого не свой девайс

assert all_txns_df.merge(client_devices, on="device_id").query("client_id_x != client_id_y").empty

In [18]:
# кол-во approved и declined вх. транз. в рамках лимитов

inbound_by_status = all_txns_df.query("type == 'inbound'").groupby(["client_id","status"], as_index=False).agg({"unix_time":"count"}) \
                                .rename(columns={"unix_time":"txns_count"})
inbound_by_status.head(2)

Unnamed: 0,client_id,status,txns_count
0,103,approved,3
1,103,declined,1


In [19]:
inbound_by_status[["status", "txns_count"]].groupby("status").agg(["min", "max"])

Unnamed: 0_level_0,txns_count,txns_count
Unnamed: 0_level_1,min,max
status,Unnamed: 1_level_2,Unnamed: 2_level_2
approved,3,6
declined,1,1


In [20]:
# кол-во approved и declined исх. транз. в рамках лимитов

outbound_by_status = all_txns_df.query("type != 'inbound'").groupby(["client_id","status"], as_index=False).agg({"unix_time":"count"}) \
                                .rename(columns={"unix_time":"txns_count"})
outbound_by_status.head(2)

Unnamed: 0,client_id,status,txns_count
0,103,approved,8
1,103,declined,5


In [21]:
outbound_by_status[["status", "txns_count"]].groupby("status").agg(["min", "max"])

Unnamed: 0_level_0,txns_count,txns_count
Unnamed: 0_level_1,min,max
status,Unnamed: 1_level_2,Unnamed: 2_level_2
approved,8,8
declined,1,5


клиенты записаны в дропы. проверка записанного accounts.csv

In [22]:
accounts1 = pd.read_csv("./data/generated_data/accounts.csv")
accounts1.head(2)

Unnamed: 0,client_id,account_id,is_drop
0,1,10000,False
1,2,10001,False


In [23]:
assert accounts1.loc[accounts1.client_id.isin(drop_clients.client_id)].query("is_drop == False").empty

дропы не пересекаются с другими сегментами: легальные/compr фрод,  дропы другого типа

In [24]:
clients_sample = gpd.read_file("./data/cleaned_data/clients_sample.gpkg")

In [25]:
other_drops = gpd.read_file("./data/generated_data/dist_drops.gpkg") # distributors
# other_drops = gpd.read_file("./data/generated_data/purchase_drops.gpkg") # purchasers
exclude_clients = pd.concat([clients_sample, other_drops], ignore_index=True)

In [26]:
assert exclude_clients.loc[exclude_clients.client_id.isin(all_txns_df.client_id.unique())].empty

транзакции записаны в файл

In [27]:
# drop_txns = pd.read_parquet("./data/generated_data/dist_drop_txns.parquet", engine="pyarrow") # distributors
drop_txns = pd.read_parquet("./data/generated_data/purch_drop_txns.parquet", engine="pyarrow") # purchasers
print(drop_txns.shape)
drop_txns.head()

(442, 19)


Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,trans_lat,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule
0,677,2025-01-28 03:46:00,1738035960,39100.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10645.0,False,False,approved,not applicable
1,677,2025-01-28 06:11:00,1738044660,39100.0,purchase,ecom,travel_net,True,6802.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable
2,677,2025-01-28 07:30:00,1738049400,18900.0,inbound,transfer,not applicable,True,,not applicable,,,not applicable,,10645.0,False,False,approved,not applicable
3,677,2025-01-29 05:00:00,1738126800,8000.0,purchase,ecom,shopping_net,True,6943.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable
4,677,2025-01-29 06:07:00,1738130820,6000.0,purchase,ecom,shopping_net,True,6787.0,Ростов-на-Дону,47.222436,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable


мин и макс разницы во времени у дропов в рамках лимитов

In [28]:
# Add a column with the unix time of each client's previous transaction.
# A single grouped shift replaces the per-client loop: within every client_id
# group, unix_time is shifted down by one row (the first row of each client
# gets NaN), and the result is aligned back on the original index.
all_txns_df["prev_txn_unix"] = all_txns_df.groupby("client_id")["unix_time"].shift(1)

In [29]:
# all_txns_df.drop(columns="prev_time_diff_m", inplace=True)

In [30]:
# Разница между предыдущей транзакцией клиента в минутах

all_txns_df["prev_time_diff"] = all_txns_df.unix_time.sub(all_txns_df.prev_txn_unix).div(60)
all_txns_df.head()

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_lon,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,prev_txn_unix,prev_time_diff
0,677,2025-01-28 03:46:00,1738035960,39100.0,inbound,transfer,not applicable,True,,not applicable,...,,not applicable,,10645.0,False,False,approved,not applicable,,
1,677,2025-01-28 06:11:00,1738044660,39100.0,purchase,ecom,travel_net,True,6802.0,Ростов-на-Дону,...,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable,1738036000.0,145.0
2,677,2025-01-28 07:30:00,1738049400,18900.0,inbound,transfer,not applicable,True,,not applicable,...,,not applicable,,10645.0,False,False,approved,not applicable,1738045000.0,79.0
3,677,2025-01-29 05:00:00,1738126800,8000.0,purchase,ecom,shopping_net,True,6943.0,Ростов-на-Дону,...,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable,1738049000.0,1290.0
4,677,2025-01-29 06:07:00,1738130820,6000.0,purchase,ecom,shopping_net,True,6787.0,Ростов-на-Дону,...,39.718787,2.60.2.134,1164.0,,False,False,approved,not applicable,1738127000.0,67.0


In [31]:
# timedelta

all_txns_df["prev_time_diff_m"] = pd.to_timedelta(all_txns_df["prev_time_diff"], unit="m")
all_txns_df.head()

Unnamed: 0,client_id,txn_time,unix_time,amount,type,channel,category,online,merchant_id,trans_city,...,trans_ip,device_id,account,is_fraud,is_suspicious,status,rule,prev_txn_unix,prev_time_diff,prev_time_diff_m
0,677,2025-01-28 03:46:00,1738035960,39100.0,inbound,transfer,not applicable,True,,not applicable,...,not applicable,,10645.0,False,False,approved,not applicable,,,NaT
1,677,2025-01-28 06:11:00,1738044660,39100.0,purchase,ecom,travel_net,True,6802.0,Ростов-на-Дону,...,2.60.2.134,1164.0,,False,False,approved,not applicable,1738036000.0,145.0,0 days 02:25:00
2,677,2025-01-28 07:30:00,1738049400,18900.0,inbound,transfer,not applicable,True,,not applicable,...,not applicable,,10645.0,False,False,approved,not applicable,1738045000.0,79.0,0 days 01:19:00
3,677,2025-01-29 05:00:00,1738126800,8000.0,purchase,ecom,shopping_net,True,6943.0,Ростов-на-Дону,...,2.60.2.134,1164.0,,False,False,approved,not applicable,1738049000.0,1290.0,0 days 21:30:00
4,677,2025-01-29 06:07:00,1738130820,6000.0,purchase,ecom,shopping_net,True,6787.0,Ростов-на-Дону,...,2.60.2.134,1164.0,,False,False,approved,not applicable,1738127000.0,67.0,0 days 01:07:00


In [32]:
all_txns_df["prev_time_diff_m"].agg(["min","max"])

min   0 days 00:30:00
max   1 days 00:26:00
Name: prev_time_diff_m, dtype: timedelta64[ns]