- 로그에 누락이 있을 수 있음
- 분할 결제가 있을 수 있음
- 분할 결제의 경우, payment_sequential이 가장 큰 마지막 결제는 결제 완료 시간을 그대로 유지하고, 나머지 결제는 payment_sequential의 내림차순으로 20~40초 사이의 임의의 시간을 누적하여 결제 완료 시간에서 차감

In [1]:
import pandas as pd
import os
import numpy as np
# Seed NumPy's global RNG so the random offsets drawn later via np.random.uniform are reproducible.
np.random.seed(42)
from datetime import timedelta

In [2]:
from pathlib import Path

# Base directory of the redefined dataset, with one sub-directory per output family.
REDEFINED_DIR = "../../../downloads/olist_redefined"
STREAM_DST = Path(REDEFINED_DIR) / 'stream'
CDC_DST = Path(REDEFINED_DIR) / 'cdc'

# Create both directories (and any missing parents); existing ones are left alone.
STREAM_DST.mkdir(parents=True, exist_ok=True)
CDC_DST.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the Olist orders dataset CSV (path is relative to the working directory).
orders = pd.read_csv("../../../downloads/olist/olist_orders_dataset.csv")

In [4]:
# Unpivot the four milestone columns of `orders` into a long event log:
# one row per (order, milestone) that actually has a timestamp, ordered by time.
melted_orders = pd.melt(
    orders,
    id_vars=['order_id', 'customer_id'],  # columns repeated on every event row
    value_vars=['order_purchase_timestamp', 'order_approved_at',
                'order_delivered_carrier_date', 'order_delivered_customer_date',
                ],  # milestone columns turned into rows
    var_name='status',
    value_name='timestamp'  # the unpivoted values are timestamps
).dropna().sort_values('timestamp')

# Replace each source column name with its short status label. The mapping is
# spelled out so the special case ("delivered_customer" is reported simply as
# "delivered") is visible at a glance.
melted_orders['status'] = melted_orders['status'].map({
    'order_purchase_timestamp': 'purchase',
    'order_approved_at': 'approved',
    'order_delivered_carrier_date': 'delivered_carrier',
    'order_delivered_customer_date': 'delivered',
})

In [5]:
# Preview the first two rows of the long-format event log.
melted_orders.head(2)

Unnamed: 0,order_id,customer_id,status,timestamp
4541,2e7a8482f6fb09756ca50c10d7bfc047,08c5351a6aca1c1589a38f244edeee9d,purchase,2016-09-04 21:15:19
4396,e5fa5a7210941f7d56d0208e4e071d35,683c54fc24d40ee9f8a6fc179fd9856c,purchase,2016-09-05 00:15:34


In [6]:
# Keep only the event columns and order the events per order, then by time.
orders_stream = (
    melted_orders.loc[:, ['timestamp', 'order_id', 'customer_id', 'status']]
    .sort_values(['order_id', 'timestamp'])
)

# Purchase events only; the status column is constant after filtering, so drop it.
is_purchase = orders_stream['status'].eq('purchase')
purchase = orders_stream.loc[is_purchase].drop(columns='status')
purchase = purchase.sort_values('timestamp')
purchase.head()

Unnamed: 0,timestamp,order_id,customer_id
4541,2016-09-04 21:15:19,2e7a8482f6fb09756ca50c10d7bfc047,08c5351a6aca1c1589a38f244edeee9d
4396,2016-09-05 00:15:34,e5fa5a7210941f7d56d0208e4e071d35,683c54fc24d40ee9f8a6fc179fd9856c
10071,2016-09-13 15:24:19,809a282bbd5dbcabb6f2f724fca862ec,622e13439d6b5a0b486c435618b2679e
30710,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,86dc2ffce2dfff336de2f386a786e574
83078,2016-10-02 22:07:52,71303d7e93b399f5bcd537d124c0bcfa,b106b360fe2ef8849fbbd056f777b4d5


In [7]:
# Load the Olist order-payments dataset CSV and display it.
payments = pd.read_csv("../../../downloads/olist/olist_order_payments_dataset.csv")
payments

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31
103882,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80
103883,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77
103884,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54


In [8]:
# Attach the purchase event (timestamp, customer_id) to every payment row.
# The outer join keeps payments without a purchase row and purchases without payments.
payment_log = payments.merge(purchase, on='order_id', how='outer')

# Lookup table read from the CDC directory, minus its zip_code column.
customer_origin = pd.read_csv(CDC_DST / 'customer_origin.tsv', sep='\t')
customer_ids = customer_origin.drop(columns='zip_code')

# Swap the per-order customer_id for customer_unique_id, exposed under the name customer_id.
payment_log = (
    payment_log.merge(customer_ids, on='customer_id', how='left')
    .drop(columns='customer_id')
    .rename(columns={"customer_unique_id": 'customer_id'})
)

payment_log.shape

(103887, 7)

In [9]:
# Fix the column order first, then sort the rows by time, order and payment step.
payment_log = payment_log.loc[
    :,
    ['timestamp', 'order_id', 'customer_id', 'payment_sequential', 'payment_type',
     'payment_installments', 'payment_value'],
]
payment_log = payment_log.sort_values(['timestamp', 'order_id', 'payment_sequential'])
payment_log.shape

(103887, 7)

In [10]:
# Build and post-process the mock payment log
def create_mock_payment_log(payment_log):
    """
    Build a mock payment log in which rows sharing a timestamp are spread out.

    Args:
        payment_log: Original payment log DataFrame. It is copied, not modified.

    Returns:
        DataFrame: The processed mock payment log, as returned by
        ``add_random_time_to_duplicates``.
    """
    prepared = (
        payment_log.copy(deep=True)
        # Unparseable timestamps become NaT instead of raising.
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
        .sort_values(['timestamp', 'order_id', 'payment_sequential'])
    )

    # Flag every row whose timestamp occurs more than once (all occurrences, not just repeats).
    prepared['is_duplicate'] = prepared.duplicated(subset=['timestamp'], keep=False)

    # The helper adjusts the flagged rows and removes the helper column again.
    return add_random_time_to_duplicates(prepared)


def add_random_time_to_duplicates(df, min_gap_seconds=20, max_gap_seconds=40):
    """
    Spread the payments of each split-payment order backwards in time.

    Rows flagged by ``is_duplicate`` are grouped per order, i.e. by identical
    ``timestamp`` and ``order_id``. Within a group, the payment with the
    largest ``payment_sequential`` keeps the shared timestamp (treated as the
    completion time). Every other payment, visited in descending
    ``payment_sequential`` order, is placed a further random gap earlier, and
    the result is truncated to whole seconds.

    Unrelated orders that merely share a timestamp are handled independently,
    so an order with a single payment is never shifted. Rows with a missing
    timestamp or a missing ``order_id`` are left unchanged.

    Args:
        df: DataFrame with a datetime ``timestamp`` column and ``order_id``,
            ``payment_sequential`` and boolean ``is_duplicate`` columns.
            The input is not modified.
        min_gap_seconds: Lower bound (inclusive) of the random gap between
            two consecutive payments, in seconds.
        max_gap_seconds: Upper bound (exclusive) of that gap, in seconds.

    Returns:
        DataFrame: A copy of ``df`` with adjusted timestamps, the original
        index and row order, and without the ``is_duplicate`` column.

    Raises:
        ValueError: If the gap bounds are negative or inverted.
    """
    if not 0 <= min_gap_seconds <= max_gap_seconds:
        raise ValueError(
            "expected 0 <= min_gap_seconds <= max_gap_seconds, got "
            f"{min_gap_seconds} and {max_gap_seconds}"
        )

    # Work on a positionally indexed copy: each label then addresses exactly one
    # row even if the caller's index has repeated labels, and the caller's frame
    # is left untouched. The original index is restored before returning.
    original_index = df.index
    adjusted = df.reset_index(drop=True)

    # Only flagged rows with a real timestamp that share (order_id, timestamp)
    # with another row can belong to a split payment.
    candidates = adjusted[
        adjusted['is_duplicate'].astype(bool)
        & adjusted['timestamp'].notna()
        & adjusted.duplicated(subset=['order_id', 'timestamp'], keep=False)
    ]

    for (completion_time, _order_id), payments in candidates.groupby(
            ['timestamp', 'order_id'], sort=False):
        sequence = payments['payment_sequential']
        # Everything except the highest sequence number gets moved, highest first.
        earlier = sequence[sequence != sequence.max()].sort_values(ascending=False)
        if earlier.empty:
            continue

        # Cumulative gaps: each payment lies one more random gap before the next one.
        gaps = np.random.uniform(min_gap_seconds, max_gap_seconds, size=len(earlier))
        offsets = pd.to_timedelta(np.cumsum(gaps), unit='s')
        # floor('s') discards the sub-second part of the shifted time.
        adjusted.loc[earlier.index, 'timestamp'] = (completion_time - offsets).floor('s')

    adjusted = adjusted.drop(columns=['is_duplicate'])
    adjusted.index = original_index
    return adjusted


# Usage example
# NOTE(review): the next cell repeats this exact call. The offsets come from
# NumPy's global RNG, which the visible part of this notebook seeds only once,
# so the repeated call draws further values and its result replaces this one.
mock_payment_log = create_mock_payment_log(payment_log)

In [11]:
# Build the mock payment log that the following cells inspect and export.
mock_payment_log = create_mock_payment_log(payment_log)

In [12]:
# Look up a single order in the unadjusted payment log.
payment_log[payment_log['order_id']=='63638a6806d67773f3adba8534553fff']

Unnamed: 0,timestamp,order_id,customer_id,payment_sequential,payment_type,payment_installments,payment_value
40419,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,1.0,voucher,1.0,15.1
40421,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,2.0,voucher,1.0,11.99
40422,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,3.0,voucher,1.0,9.02
40417,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,4.0,voucher,1.0,10.56
40416,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,5.0,voucher,1.0,34.19
40418,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,6.0,voucher,1.0,5.28
40420,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,7.0,voucher,1.0,0.74


In [13]:
# Look up the same order in the mock log returned by create_mock_payment_log.
mock_payment_log[mock_payment_log['order_id']=='63638a6806d67773f3adba8534553fff']

Unnamed: 0,timestamp,order_id,customer_id,payment_sequential,payment_type,payment_installments,payment_value
40419,2016-10-04 13:19:50,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,1.0,voucher,1.0,15.1
40421,2016-10-04 13:20:23,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,2.0,voucher,1.0,11.99
40422,2016-10-04 13:20:46,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,3.0,voucher,1.0,9.02
40417,2016-10-04 13:21:26,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,4.0,voucher,1.0,10.56
40416,2016-10-04 13:21:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,5.0,voucher,1.0,34.19
40418,2016-10-04 13:22:25,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,6.0,voucher,1.0,5.28
40420,2016-10-04 13:22:56,63638a6806d67773f3adba8534553fff,df2988ba3ed226b10521a0e4da849b61,7.0,voucher,1.0,0.74


In [22]:
# Export the mock payment log, ordered by timestamp, as a tab-separated
# payment.tsv file under STREAM_DST (the DataFrame index is not written).
mock_payment_log.sort_values('timestamp').to_csv(f"{STREAM_DST}/payment.tsv", index=False, sep='\t')