In [1]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

In [2]:
# Root folder for the re-shaped Olist exports, with one "stream" and one
# "batch" sub-folder beneath it.
REDEFINED_DIR = "../../../downloads/olist_redefined"
STREAM_DST, BATCH_DST = (
    os.path.join(REDEFINED_DIR, leaf) for leaf in ('stream', 'batch')
)

# Make sure both folders exist; exist_ok=True keeps re-running this cell harmless.
os.makedirs(BATCH_DST, exist_ok=True)
os.makedirs(STREAM_DST, exist_ok=True)

In [3]:
# Load the raw Olist orders table.
orders = pd.read_csv("../../../downloads/olist/olist_orders_dataset.csv")

# Unpivot the four per-stage timestamp columns into one row per (order, stage):
#   - id_vars stay as identifying columns on every row,
#   - the source column name lands in 'current_status',
#   - the value (a timestamp) lands in 'timestamp'.
# Rows containing any missing value are dropped (in practice: stages that have
# no timestamp), the remainder is ordered by timestamp, and 'order_status' is
# renamed to 'final_status'.
melted_orders = (
    orders
    .melt(
        id_vars=['order_id', 'customer_id', 'order_status'],
        value_vars=[
            'order_purchase_timestamp',
            'order_approved_at',
            'order_delivered_carrier_date',
            'order_delivered_customer_date',
        ],
        var_name='current_status',
        value_name='timestamp',
    )
    .dropna()
    .sort_values('timestamp')
    .rename(columns={'order_status': "final_status"})
)

# Reduce each source column name to a short stage label by dropping its first
# and last "_"-separated token (e.g. 'order_approved_at' -> 'approved'), then
# shorten 'delivered_customer' to 'delivered'.
melted_orders['current_status'] = (
    melted_orders['current_status']
    .str.split("_")
    .str[1:-1]
    .str.join("_")
    .replace({"delivered_customer": "delivered"})
)

In [4]:
# One row per distinct (order_id, customer_id, final_status) combination,
# ordered by order_id; the trailing expression displays the result.
final_order_status = (
    melted_orders
    .loc[:, ['order_id', 'customer_id', 'final_status']]
    .drop_duplicates()
    .sort_values('order_id')
)
final_order_status

Unnamed: 0,order_id,customer_id,final_status
85267,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered
71853,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,delivered
6298,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,delivered
22550,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,delivered
5247,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,delivered
...,...,...,...
79550,fffc94f6ce00a00581880bf54a75a037,b51593916b4b8e0d6f66f2ae24f2673d,delivered
70155,fffcd46ef2263f404302a634eb57f7eb,84c5d4fbaf120aae381fad077416eaa0,delivered
52699,fffce4705a9662cd70adb13d4a31832d,29309aa813182aaddc9b259e31b870e6,delivered
59871,fffe18544ffabc95dfada21779c9644f,b5e6afd5a41800fdf401e0272ca74655,delivered


In [5]:
# Keep only the orders whose final status is anything other than 'delivered'.
not_delivered_orders = final_order_status.loc[
    final_order_status['final_status'].ne('delivered')
]

In [6]:
# Display the orders whose final status is not 'delivered'.
not_delivered_orders

Unnamed: 0,order_id,customer_id,final_status
35138,0010dedd556712d7bb69a19cb7bbd37a,3a92efdb6e6163dc1734d44f2f5f6d04,unavailable
44054,002f19a65a2ddd70a090297872e6d64e,7fa80efb1ef15ca4104627910c29791c,shipped
85228,00310b0c75bb13015ec4d82d341865a4,0dad07848c618cc5a4679a1bfe1db8d2,canceled
83798,00a500bc03bc4ec968e574c2553bed4b,3d2f26eab3f79dd1fe9977f615e70c2f,unavailable
29454,00a99c50fdff7e36262caba33821875a,7a399396442d5601cbedfbd0a3cf1da4,shipped
...,...,...,...
72188,ff0768e3356919b32e3d7f30baecfcfb,f3e5d57c12abb230c4f16b6a0a23d7f7,shipped
90223,ff358345576214cc9e08488188973c84,8da33bbf65a0b181486063ffb90fa3d6,shipped
33364,ff536d93ae4214b4d51c2894ccfc569f,8891eb5ca0e28df961b2b5b8f3c0eb23,shipped
95406,ff7140ba310a4717112d39c0cd0b4062,3c97bdeb3712ffcfa72a09bf0a8a3e95,processing


In [7]:
# Event view of the orders: one row per (order, stage) with its timestamp,
# ordered by order_id and then by timestamp; the trailing expression displays it.
orders_stream = (
    melted_orders
    .loc[:, ['timestamp', 'order_id', 'customer_id', 'current_status']]
    .sort_values(['order_id', 'timestamp'])
)
orders_stream

Unnamed: 0,timestamp,order_id,customer_id,current_status
85267,2017-09-13 08:59:02,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,purchase
184708,2017-09-13 09:45:35,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,approved
284149,2017-09-19 18:34:16,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered_carrier
383590,2017-09-20 23:43:48,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,delivered
71853,2017-04-26 10:53:06,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,purchase
...,...,...,...,...
358194,2017-08-16 21:59:40,fffe18544ffabc95dfada21779c9644f,b5e6afd5a41800fdf401e0272ca74655,delivered
59741,2018-06-09 17:00:18,fffe41c64501cc87c801fd61db3f6244,96d649da0cc4ff33bb408b199d4c7dcf,purchase
159182,2018-06-09 17:10:13,fffe41c64501cc87c801fd61db3f6244,96d649da0cc4ff33bb408b199d4c7dcf,approved
258623,2018-06-11 14:11:00,fffe41c64501cc87c801fd61db3f6244,96d649da0cc4ff33bb408b199d4c7dcf,delivered_carrier


In [8]:
# Delivery events are every event except the initial 'purchase' one.
delivery_status = orders_stream.loc[orders_stream['current_status'].ne('purchase')]

In [9]:
# Rebuild the event frame from orders_stream instead of from the previous value
# of delivery_status: concatenating onto delivery_status itself would append
# the non-delivered orders again every time this cell is re-run.
delivery_status = pd.concat([
    orders_stream[orders_stream['current_status'] != 'purchase'],
    not_delivered_orders,
])

# Rows coming from not_delivered_orders have no current_status; use their
# final_status as the event label, then drop the helper column.
delivery_status['current_status'] = delivery_status['current_status'].fillna(delivery_status['final_status'])
delivery_status = delivery_status.drop(columns='final_status')

# Parse timestamps BEFORE sorting so the order is chronological rather than
# lexicographic. The appended rows have no timestamp, become NaT, and sort
# last within their order_id.
delivery_status['timestamp'] = pd.to_datetime(delivery_status['timestamp'])
delivery_status = delivery_status.sort_values(['order_id', 'timestamp'])

# Rows whose timestamp is still missing at this point.
mask = delivery_status['timestamp'].isna()

# Fill a missing timestamp with "previous timestamp + a random offset".
def fill_random_timestamp(row, prev_timestamp, max_days=3, rng=None):
    """Return a timestamp for ``row``, synthesising one when it is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['timestamp']``.
    prev_timestamp : datetime-like or NaT
        Anchor used when ``row['timestamp']`` is missing.
    max_days : float, default 3
        Upper bound, in days, of the random offset added to ``prev_timestamp``.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global RNG
        (``np.random.uniform``) is used, exactly as before this parameter
        existed; pass a seeded generator for reproducible output.

    Returns
    -------
    ``row['timestamp']`` unchanged when it is present or when there is no
    anchor to build on; otherwise ``prev_timestamp`` plus a uniformly drawn
    offset between 0 and ``max_days`` days.
    """
    # Nothing to fill, or nothing to anchor on: hand back the value as-is.
    if pd.notna(row['timestamp']) or pd.isna(prev_timestamp):
        return row['timestamp']

    draw_uniform = np.random.uniform if rng is None else rng.uniform
    random_days = draw_uniform(0, max_days)
    return prev_timestamp + timedelta(days=random_days)

# Anchor for each row: the previous event of the SAME order. A plain shift(1)
# over the whole frame would hand an order the last timestamp of a different
# order whenever the row needing a timestamp is the only row for its order.
prev_in_order = delivery_status.groupby('order_id')['timestamp'].shift(1)

# Rows with no earlier row for their order in this frame fall back to that
# order's latest known event in orders_stream (which still contains the
# 'purchase' events removed from delivery_status).
last_event_per_order = (
    pd.to_datetime(orders_stream['timestamp'])
    .groupby(orders_stream['order_id'])
    .max()
)
fallback_anchor = delivery_status['order_id'].map(last_event_per_order)

# Combine positionally (plain arrays) so repeated index labels left over from
# the concat cannot cause an alignment error.
delivery_status['prev_timestamp'] = np.where(
    prev_in_order.notna().to_numpy(),
    prev_in_order.to_numpy(),
    fallback_anchor.to_numpy(),
)

# Fill the missing timestamps.
# NOTE: fill_random_timestamp is called without an explicit generator, so it
# draws from NumPy's global RNG; the filled values differ between runs unless
# that RNG is seeded beforehand.
delivery_status['timestamp'] = delivery_status.apply(
    lambda row: fill_random_timestamp(row, row['prev_timestamp']), axis=1
)

# Drop the helper column.
delivery_status = delivery_status.drop(columns='prev_timestamp')

In [10]:
# Write the delivery events, ordered by timestamp, as a tab-separated file
# without the index column into the stream output folder.
(
    delivery_status
    .sort_values('timestamp')
    .to_csv(f"{STREAM_DST}/delivery_status.tsv", sep='\t', index=False)
)

In [12]:
# Spot-check: show every delivery event recorded for one sample order.
delivery_status.loc[delivery_status['order_id'].eq('0016dfedd97fc2950e388d2971d718c7')]

Unnamed: 0,timestamp,order_id,customer_id,current_status
174553,2017-04-29 10:05:12,0016dfedd97fc2950e388d2971d718c7,2c8b917c5d7dd720ebe36a5ed3b501ec,approved
273994,2017-05-16 08:40:49,0016dfedd97fc2950e388d2971d718c7,2c8b917c5d7dd720ebe36a5ed3b501ec,delivered_carrier
373435,2017-05-22 18:32:21,0016dfedd97fc2950e388d2971d718c7,2c8b917c5d7dd720ebe36a5ed3b501ec,delivered
