In [1]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta

In [2]:
from pathlib import Path

# Output root for the redefined dataset, with its 'stream' and 'cdc' sub-folders.
REDEFINED_DIR = "../../../downloads/olist_redefined"
STREAM_DST = Path(REDEFINED_DIR) / 'stream'
CDC_DST = Path(REDEFINED_DIR) / 'cdc'

# Create both destination folders up front; already-existing folders are fine.
for destination in (STREAM_DST, CDC_DST):
    os.makedirs(destination, exist_ok=True)

In [3]:
# Load the raw orders table; no parsing options are passed, so pandas' defaults apply.
orders_csv = "../../../downloads/olist/olist_orders_dataset.csv"
orders = pd.read_csv(orders_csv)

In [4]:
# customer_id is not used by any later step of this notebook, so drop it.
# Re-bind instead of mutating in place and tolerate the column already being
# absent, so that re-running this cell does not raise KeyError.
orders = orders.drop(columns='customer_id', errors='ignore')

In [5]:
# Lifecycle timestamp column of `orders` -> event label used in the stream.
# Each label is the column name without its first and last "_"-separated token,
# except that "delivered_customer" is shortened to "delivered".
status_by_column = {
    'order_purchase_timestamp': 'purchase',
    'order_approved_at': 'approved',
    'order_delivered_carrier_date': 'delivered_carrier',
    'order_delivered_customer_date': 'delivered',
}

# Reshape wide -> long: one row per (order, lifecycle event) that has a value.
melted_orders = (
    orders
    .melt(
        id_vars=['order_id', 'order_status'],  # columns kept as identifiers
        value_vars=list(status_by_column),     # columns unpivoted into rows
        var_name='status',
        value_name='timestamp',                # the unpivoted values are timestamps
    )
    .dropna()                                  # discard rows containing any missing value
    .sort_values('timestamp')
    .rename(columns={'order_status': "final_status"})
    .assign(status=lambda events: events['status'].map(status_by_column))
)

In [6]:
# Keep only the event columns and order them by timestamp, then by order_id.
stream_columns = ['timestamp', 'order_id', 'status']
orders_stream = melted_orders.loc[:, stream_columns].sort_values(by=['timestamp', 'order_id'])
orders_stream

Unnamed: 0,timestamp,order_id,status
4541,2016-09-04 21:15:19,2e7a8482f6fb09756ca50c10d7bfc047,purchase
4396,2016-09-05 00:15:34,e5fa5a7210941f7d56d0208e4e071d35,purchase
10071,2016-09-13 15:24:19,809a282bbd5dbcabb6f2f724fca862ec,purchase
30710,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,purchase
130151,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,approved
...,...,...,...
31891,2018-10-03 18:55:29,a2ac6dad85cf8af5b0afb510a240fe8c,purchase
354958,2018-10-11 16:41:14,450cb96c63e1e5b49d34f223f67976d2,delivered
68373,2018-10-16 20:16:02,b059ee4de278302d550a3035c4cdb740,purchase
317054,2018-10-17 13:22:46,7e708aed151d6a8601ce8f2eaa712bf4,delivered


In [7]:
# Keep every event except 'purchase'; purchase events are re-created from the
# payment table further down.
is_not_purchase = orders_stream['status'].ne('purchase')
orders_stream_without_purchase = orders_stream.loc[is_not_purchase]
orders_stream_without_purchase.head()

Unnamed: 0,timestamp,order_id,status
130151,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,approved
163499,2016-10-04 09:43:32,1ff217aa612f6cd7c4255c9bfe931c8b,approved
192077,2016-10-04 10:18:57,65d1e226dfaeb8cdc42f665422522d14,approved
106188,2016-10-04 10:19:23,cd3b8574c82b42fc8129f6d502690c3e,approved
166478,2016-10-04 10:25:46,ed8c7b1b3eb256c70ce0c74231e1da88,approved


In [8]:
# Read the tab-separated payment stream from the stream destination folder.
payment_path = f"{STREAM_DST}/payment.tsv"
payment = pd.read_csv(payment_path, sep='\t')
payment.head(2)


Unnamed: 0,timestamp,order_id,customer_id,payment_sequential,payment_type,payment_installments,payment_value
0,2016-09-04 21:15:19,2e7a8482f6fb09756ca50c10d7bfc047,b7d76e111c89f7ebf14761390f0f7d17,1.0,credit_card,1.0,136.23
1,2016-09-05 00:15:34,e5fa5a7210941f7d56d0208e4e071d35,4854e9b3feff728c13ee5fc7d1547e92,1.0,credit_card,3.0,75.06


In [9]:
# Turn every payment row into a 'purchase' event with the same three columns as
# the order stream.
# NOTE(review): payment has a payment_sequential column, so one order presumably
# can span several payment rows; if so this yields one 'purchase' event per
# payment row rather than per order - confirm that is intended.
payment_to_concat = payment[['timestamp', 'order_id']].assign(status='purchase')
payment_to_concat.head(2)

Unnamed: 0,timestamp,order_id,status
0,2016-09-04 21:15:19,2e7a8482f6fb09756ca50c10d7bfc047,purchase
1,2016-09-05 00:15:34,e5fa5a7210941f7d56d0208e4e071d35,purchase


In [10]:
# Stack the non-purchase order events on top of the payment-derived purchase
# events; original row indices are kept and the result is not re-sorted here.
stream_parts = [orders_stream_without_purchase, payment_to_concat]
order_stream_with_payment = pd.concat(stream_parts)
order_stream_with_payment

Unnamed: 0,timestamp,order_id,status
130151,2016-09-15 12:16:38,bfbd0f9bdef84302105ad712db648a6c,approved
163499,2016-10-04 09:43:32,1ff217aa612f6cd7c4255c9bfe931c8b,approved
192077,2016-10-04 10:18:57,65d1e226dfaeb8cdc42f665422522d14,approved
106188,2016-10-04 10:19:23,cd3b8574c82b42fc8129f6d502690c3e,approved
166478,2016-10-04 10:25:46,ed8c7b1b3eb256c70ce0c74231e1da88,approved
...,...,...,...
103882,2018-09-29 09:13:03,392ed9afd714e3c74767d0c4d3e3f477,purchase
103883,2018-10-01 15:30:09,616fa7d4871b87832197b2a137a115d2,purchase
103884,2018-10-03 18:55:29,a2ac6dad85cf8af5b0afb510a240fe8c,purchase
103885,2018-10-16 20:16:02,b059ee4de278302d550a3035c4cdb740,purchase


In [11]:
# Persist the combined order-status event stream as a tab-separated file.
#
# Rows are ordered by timestamp, then order_id. Events of the same order that
# share a timestamp are additionally ordered by lifecycle stage, so a 'purchase'
# is never written after the 'approved' event of that order. Sorting on
# (timestamp, order_id) alone leaves such ties to the incoming row order, in
# which the 'purchase' rows were concatenated last.
# Statuses missing from the mapping get a NaN rank and sort after the known ones.
LIFECYCLE_RANK = {'purchase': 0, 'approved': 1, 'delivered_carrier': 2, 'delivered': 3}

order_status_sorted = (
    order_stream_with_payment
    .assign(lifecycle_rank=lambda frame: frame['status'].map(LIFECYCLE_RANK))
    .sort_values(['timestamp', 'order_id', 'lifecycle_rank'])
    .drop(columns='lifecycle_rank')  # helper column must not reach the file
)
order_status_sorted.to_csv(f"{STREAM_DST}/order_status.tsv", index=False, sep='\t')

In [12]:
# Sanity check: list the distinct status labels present in the combined stream.
status_column = order_stream_with_payment.loc[:, 'status']
status_column.unique()

array(['approved', 'delivered_carrier', 'delivered', 'purchase'],
      dtype=object)