In [1]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

In [2]:
from pathlib import Path

# Output folder for the re-shaped Olist tables. Kept as a plain string because
# later cells interpolate it into file paths.
REDEFINED_DIR = "../../../downloads/olist_redefined"

# Create the folder (and any missing parents); a no-op when it already exists.
Path(REDEFINED_DIR).mkdir(parents=True, exist_ok=True)

In [3]:
# Raw Olist orders table. No date parsing is requested here, so the timestamp
# columns are loaded as plain strings.
orders_csv_path = "../../../downloads/olist/olist_orders_dataset.csv"
orders = pd.read_csv(orders_csv_path)

In [4]:
# Discard the customer key; the cells below only use order-level columns.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
orders = orders.drop(columns='customer_id', errors='ignore')

In [5]:
# Timestamp columns to unpivot, listed in lifecycle order, mapped to the event
# label each one becomes in the long-format table.
STATUS_BY_COLUMN = {
    'order_purchase_timestamp': 'purchase',
    'order_approved_at': 'approved',
    'order_delivered_carrier_date': 'delivered_carrier',
    'order_delivered_customer_date': 'delivered',
}

# Reshape the wide orders table (one row per order) into a long table with one
# row per (order, lifecycle event) that actually has a timestamp.
melted_orders = (
    pd.melt(
        orders,
        id_vars=['order_id', 'order_status'],  # columns kept fixed on every row
        value_vars=list(STATUS_BY_COLUMN),     # columns unpivoted into rows
        var_name='status',
        value_name='timestamp',                # the unpivoted values are timestamps
    )
    .dropna()  # drop events that never happened (missing timestamp)
    # melt emits the value columns block by block in the order listed above, so
    # a stable sort keeps lifecycle order for events that share a timestamp.
    # The default quicksort is not stable and may reorder such ties.
    .sort_values('timestamp', kind='mergesort')
    .rename(columns={'order_status': 'final_status'})
)

# Replace the source column names with short event labels.
melted_orders['status'] = melted_orders['status'].map(STATUS_BY_COLUMN)

In [6]:
# Event stream: keep only the three stream columns and order the rows by time,
# using order_id to break ties between events with the same timestamp.
stream_columns = ['timestamp', 'order_id', 'status']
orders_stream = melted_orders.loc[:, stream_columns].sort_values(by=['timestamp', 'order_id'])
orders_stream

Unnamed: 0,timestamp,order_id,status
4541,2016-09-04 21:15:19,2e7a8482f6fb09756ca50c10d7bfc047,purchase
4396,2016-09-05 00:15:34,e5fa5a7210941f7d56d0208e4e071d35,purchase
10071,2016-09-13 15:24:19,809a282bbd5dbcabb6f2f724fca862ec,purchase
30710,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,purchase
130151,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,approved
...,...,...,...
31891,2018-10-03 18:55:29,a2ac6dad85cf8af5b0afb510a240fe8c,purchase
354958,2018-10-11 16:41:14,450cb96c63e1e5b49d34f223f67976d2,delivered
68373,2018-10-16 20:16:02,b059ee4de278302d550a3035c4cdb740,purchase
317054,2018-10-17 13:22:46,7e708aed151d6a8601ce8f2eaa712bf4,delivered


In [7]:
# True when no (timestamp, order_id, status) row occurs more than once.
not orders_stream.duplicated().any()

True

In [8]:
# Number of missing values in each column of the stream.
orders_stream.isnull().sum(axis=0)

timestamp    0
order_id     0
status       0
dtype: int64

In [9]:
# Write the event stream as a tab-separated file in chronological order.
# A stable sort is requested so that rows sharing a timestamp keep the relative
# order they already have in orders_stream; the default quicksort does not
# guarantee that, which made the exported row order non-deterministic for ties.
orders_stream.sort_values('timestamp', kind='mergesort').to_csv(
    f"{REDEFINED_DIR}/stream_order_status.tsv", index=False, sep='\t'
)