In [46]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

In [47]:
# Output layout: everything this notebook writes goes under REDEFINED_DIR,
# split into a 'stream' and a 'batch' sub-directory.
REDEFINED_DIR = "../../../downloads/olist_redefined"
STREAM_DST, BATCH_DST = (
    os.path.join(REDEFINED_DIR, subdir) for subdir in ('stream', 'batch')
)

# Create both destinations up front; exist_ok makes re-running the cell harmless.
for destination in (STREAM_DST, BATCH_DST):
    os.makedirs(destination, exist_ok=True)

In [48]:
# Load the raw orders table; later cells use its order_id, order_status,
# customer_id and the four lifecycle timestamp columns.
orders_csv_path = "../../../downloads/olist/olist_orders_dataset.csv"
orders = pd.read_csv(orders_csv_path)

In [49]:
# Disabled step (kept for reference): join the batch customers table and use
# its `customer_unique_id` as the orders' `customer_id`.
# from pathlib import Path
# customers = pd.read_csv(Path(BATCH_DST) / 'customers.tsv', sep='\t')
# customer_ids = customers.drop(columns='zip_code')
# orders = pd.merge(orders, customer_ids, on='customer_id', how='left')
# orders.rename(columns={"customer_unique_id": 'customer_id'}, inplace=True)

# The customer reference is not needed for the status stream built below.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell idempotent: re-running it after the column has already been
# removed no longer raises KeyError.
orders = orders.drop(columns='customer_id', errors='ignore')

In [50]:
# Reshape the wide orders table into a long event log: one row per
# (order, lifecycle timestamp column).
lifecycle_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
]


def _event_name(column_name):
    """Turn a lifecycle column name into its event name.

    The first and last "_"-separated tokens are dropped
    (e.g. 'order_approved_at' -> 'approved'), and the customer delivery
    event is shortened to plain 'delivered'.
    """
    trimmed = "_".join(column_name.split("_")[1:-1])
    return "delivered" if trimmed == "delivered_customer" else trimmed


event_name_by_column = {column: _event_name(column) for column in lifecycle_columns}

melted_orders = (
    orders
    .melt(
        id_vars=['order_id', 'order_status'],  # carried through unchanged
        value_vars=lifecycle_columns,          # unpivoted into rows
        var_name='status',                     # source column name; mapped to an event name below
        value_name='timestamp',                # the timestamp value of that column
    )
    .dropna()                  # discard rows with any missing value (e.g. no timestamp recorded)
    .sort_values('timestamp')
    .rename(columns={'order_status': "final_status"})
)
melted_orders['status'] = melted_orders['status'].map(event_name_by_column)

In [51]:
# One (order_id, final_status) pair per order, ordered by order_id.
order_final_pairs = melted_orders.loc[:, ['order_id', 'final_status']]
final_order_status = order_final_pairs.drop_duplicates().sort_values(by='order_id')
final_order_status

Unnamed: 0,order_id,final_status
85267,00010242fe8c5a6d1ba2dd792cb16214,delivered
71853,00018f77f2f0320c557190d7a144bdd3,delivered
6298,000229ec398224ef6ca0657da4fc703e,delivered
22550,00024acbcdf0a6daa1e931b038114c75,delivered
5247,00042b26cf59d7ce69dfabb4e55b4fd9,delivered
...,...,...
79550,fffc94f6ce00a00581880bf54a75a037,delivered
70155,fffcd46ef2263f404302a634eb57f7eb,delivered
52699,fffce4705a9662cd70adb13d4a31832d,delivered
59871,fffe18544ffabc95dfada21779c9644f,delivered


In [52]:
# Orders whose final status is anything other than 'delivered'.
is_not_delivered = final_order_status['final_status'].ne('delivered')
not_delivered_orders = final_order_status.loc[is_not_delivered]

In [53]:
# Display the orders whose final status is not 'delivered'.
not_delivered_orders

Unnamed: 0,order_id,final_status
35138,0010dedd556712d7bb69a19cb7bbd37a,unavailable
44054,002f19a65a2ddd70a090297872e6d64e,shipped
85228,00310b0c75bb13015ec4d82d341865a4,canceled
83798,00a500bc03bc4ec968e574c2553bed4b,unavailable
29454,00a99c50fdff7e36262caba33821875a,shipped
...,...,...
72188,ff0768e3356919b32e3d7f30baecfcfb,shipped
90223,ff358345576214cc9e08488188973c84,shipped
33364,ff536d93ae4214b4d51c2894ccfc569f,shipped
95406,ff7140ba310a4717112d39c0cd0b4062,processing


In [54]:
# Event stream: (timestamp, order_id, status), ordered chronologically within each order.
orders_stream = (
    melted_orders
    .loc[:, ['timestamp', 'order_id', 'status']]
    .sort_values(by=['order_id', 'timestamp'])
)
orders_stream

Unnamed: 0,timestamp,order_id,status
85267,2017-09-13 08:59:02,00010242fe8c5a6d1ba2dd792cb16214,purchase
184708,2017-09-13 09:45:35,00010242fe8c5a6d1ba2dd792cb16214,approved
284149,2017-09-19 18:34:16,00010242fe8c5a6d1ba2dd792cb16214,delivered_carrier
383590,2017-09-20 23:43:48,00010242fe8c5a6d1ba2dd792cb16214,delivered
71853,2017-04-26 10:53:06,00018f77f2f0320c557190d7a144bdd3,purchase
...,...,...,...
358194,2017-08-16 21:59:40,fffe18544ffabc95dfada21779c9644f,delivered
59741,2018-06-09 17:00:18,fffe41c64501cc87c801fd61db3f6244,purchase
159182,2018-06-09 17:10:13,fffe41c64501cc87c801fd61db3f6244,approved
258623,2018-06-11 14:11:00,fffe41c64501cc87c801fd61db3f6244,delivered_carrier


In [55]:
# Keep every event except the initial 'purchase'.
is_post_purchase = orders_stream['status'].ne('purchase')
order_status = orders_stream.loc[is_post_purchase]

In [56]:
# Append one synthetic event per order whose final status is not 'delivered'
# (rows of `not_delivered_orders`) and give it a plausible timestamp.
#
# NOTE: this cell extends `order_status` in place of the frame built in the
# previous cell; re-run that cell first if this one has to be executed again,
# otherwise the synthetic rows are appended a second time.

RANDOM_SEED = 42      # fixed seed so the exported stream is reproducible
MAX_DELAY_DAYS = 3    # a synthetic event happens within this many days of its anchor

# ignore_index gives a unique RangeIndex, so the index-aligned assignments
# below are unambiguous even if the two inputs share index labels.
order_status = pd.concat([order_status, not_delivered_orders], ignore_index=True)

# Appended rows carry no 'status'; their status is the order's final status.
order_status['status'] = order_status['status'].fillna(order_status['final_status'])
order_status = order_status.drop(columns='final_status')

# Parse timestamps, then order events per order; rows without a timestamp
# (the appended ones) sort last within their order.
order_status['timestamp'] = pd.to_datetime(order_status['timestamp'])
order_status = order_status.sort_values(['order_id', 'timestamp'])

# Rows that still need a timestamp.
mask = order_status['timestamp'].isna()

# Anchor = previous event of the SAME order. A plain shift(1) over the whole
# frame would leak the last event of the preceding order into orders that
# have no post-purchase event of their own (or leave the value missing when
# that preceding row is itself unfilled).
prev_timestamp = order_status.groupby('order_id')['timestamp'].shift(1)

# Orders without any earlier event here fall back to their own purchase time,
# taken from `orders_stream` (built in an earlier cell).
purchase_rows = orders_stream[orders_stream['status'] == 'purchase']
purchase_time = (
    pd.to_datetime(purchase_rows['timestamp'])
    .groupby(purchase_rows['order_id'])
    .min()
)
anchor = prev_timestamp.fillna(order_status['order_id'].map(purchase_time))

# Anchor + uniform random delay in [0, MAX_DELAY_DAYS) days, drawn only for the
# rows that need it. Rounded to microseconds, the resolution of
# datetime.timedelta, so the exported timestamp format does not change.
rng = np.random.default_rng(RANDOM_SEED)
delay = pd.Series(
    pd.to_timedelta(rng.uniform(0, MAX_DELAY_DAYS, size=int(mask.sum())), unit='D'),
    index=order_status.index[mask],
).dt.round('us')
order_status.loc[mask, 'timestamp'] = anchor[mask] + delay

# Every event must now carry a timestamp before it is exported.
assert order_status['timestamp'].notna().all(), "some status events still have no timestamp"

In [57]:
# Spot check for a single order (disabled):
# order_status[order_status['order_id'] == "00310b0c75bb13015ec4d82d341865a4"]

# Export the status events in chronological order as a tab-separated file,
# without the DataFrame index.
order_status_path = f"{STREAM_DST}/order_status.tsv"
chronological_events = order_status.sort_values('timestamp')
chronological_events.to_csv(order_status_path, index=False, sep='\t')

In [58]:
# Sanity check: list the distinct status values present in the exported stream.
order_status['status'].unique()

array(['approved', 'delivered_carrier', 'delivered', 'unavailable',
       'shipped', 'canceled', 'invoiced', 'processing', 'created'],
      dtype=object)