In [24]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta
import random

In [25]:
from pathlib import Path

# Directory that receives the files written by this notebook; created
# (including missing parents) if it does not exist yet.
REDEFINED_DIR = "../../downloads/olist_redefined"
Path(REDEFINED_DIR).mkdir(parents=True, exist_ok=True)

In [26]:
# Load the Olist orders CSV into a DataFrame.
orders_csv_path = "../../downloads/olist/olist_orders_dataset.csv"
orders = pd.read_csv(orders_csv_path)

In [27]:
# customer_id is not used by the rest of this notebook, so drop it.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# idempotent: re-running it after the column is already gone no longer raises
# KeyError.
orders = orders.drop(columns='customer_id', errors='ignore')

In [28]:
# Timestamp columns of the orders table; each one becomes a row per order.
stage_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
]

# Short stage name = column name without its first and last "_"-separated
# tokens, e.g. 'order_delivered_carrier_date' -> 'delivered_carrier'.
stage_names = {
    column: "_".join(column.split("_")[1:-1])
    for column in stage_columns
}

# Wide -> long: one row per (order, stage). The stage goes into 'status' and
# its value into 'timestamp'; order_status is carried along as 'final_status'.
# Rows are sorted by order and then by the (still textual) timestamp before
# the timestamp column is parsed to datetime.
melted_orders = (
    orders
    .melt(
        id_vars=['order_id', 'order_status'],
        value_vars=stage_columns,
        var_name='status',
        value_name='timestamp',
    )
    .sort_values(['order_id', 'timestamp'])
    .rename(columns={'order_status': "final_status"})
    .assign(
        status=lambda frame: frame['status'].map(stage_names),
        timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
    )
)

In [29]:
# Preview the first five rows of the long-format table.
melted_orders.head(n=5)

Unnamed: 0,order_id,final_status,status,timestamp
85267,00010242fe8c5a6d1ba2dd792cb16214,delivered,purchase,2017-09-13 08:59:02
184708,00010242fe8c5a6d1ba2dd792cb16214,delivered,approved,2017-09-13 09:45:35
284149,00010242fe8c5a6d1ba2dd792cb16214,delivered,delivered_carrier,2017-09-19 18:34:16
383590,00010242fe8c5a6d1ba2dd792cb16214,delivered,delivered_customer,2017-09-20 23:43:48
71853,00018f77f2f0320c557190d7a144bdd3,delivered,purchase,2017-04-26 10:53:06


In [30]:
# Distinct final statuses present in the data.
final_status_column = melted_orders['final_status']
final_status_column.unique()

array(['delivered', 'unavailable', 'shipped', 'canceled', 'invoiced',
       'processing', 'approved', 'created'], dtype=object)

In [31]:
# Rows of orders whose final status is 'shipped' and whose stage has no
# timestamp; per order, count the distinct stages affected and dump the
# result to shipped.csv in the working directory for inspection.
shipped_without_timestamp = (
    melted_orders['final_status'].eq('shipped')
    & melted_orders['timestamp'].isna()
)
check = melted_orders[shipped_without_timestamp]
missing_stage_counts = check.groupby('order_id')['status'].nunique()
missing_stage_counts.to_csv('shipped.csv')

In [32]:
# Report how many rows carry each of the final statuses that are about to be
# merged, plus the full set of final statuses, before any change is made.
print('Before processing ...')
same_status_list = ['created', 'approved', 'canceled']
for same_status in same_status_list:
    rows_with_status = melted_orders.loc[melted_orders['final_status'].eq(same_status)]
    print(f"The status count for {same_status}: ", rows_with_status.shape)
print(melted_orders['final_status'].unique())

Before processing ...
The status count for created:  (20, 4)
The status count for approved:  (8, 4)
The status count for canceled:  (2500, 4)
['delivered' 'unavailable' 'shipped' 'canceled' 'invoiced' 'processing'
 'approved' 'created']


In [33]:
# Relabel every final status listed in same_status_list as 'canceled'.
needs_relabel = melted_orders['final_status'].isin(same_status_list)
melted_orders.loc[needs_relabel, 'final_status'] = 'canceled'

In [34]:
# Same per-status row counts as before, to confirm the relabelling took effect.
print('After processing ...')
for same_status in same_status_list:
    rows_with_status = melted_orders.loc[melted_orders['final_status'].eq(same_status)]
    print(f"The status count for {same_status}: ", rows_with_status.shape)

After processing ...
The status count for created:  (0, 4)
The status count for approved:  (0, 4)
The status count for canceled:  (2528, 4)


In [35]:
# Work on a copy so melted_orders itself is left as it was.
modified_orders = melted_orders.copy()

# A stage without a timestamp takes the order's final status as its label;
# stages that do have a timestamp keep their own stage name.
timestamp_missing = modified_orders['timestamp'].isnull()
modified_orders['status'] = modified_orders['status'].mask(
    timestamp_missing,
    modified_orders['final_status'],
)

# Remove final_status, then collapse rows that have become exact duplicates.
modified_orders = (
    modified_orders
    .drop(columns=['final_status'])
    .drop_duplicates()
)
modified_orders['status'].unique()

array(['purchase', 'approved', 'delivered_carrier', 'delivered_customer',
       'unavailable', 'shipped', 'canceled', 'invoiced', 'processing',
       'delivered'], dtype=object)

In [36]:
# Which status labels occur on rows that have no timestamp?
no_timestamp = modified_orders['timestamp'].isna()
modified_orders.loc[no_timestamp, 'status'].unique()

array(['unavailable', 'shipped', 'canceled', 'invoiced', 'processing',
       'delivered'], dtype=object)

In [37]:
# Orders that are marked delivered but have no delivered_customer_date:
# label those rows with the regular 'delivered_customer' stage name.
tmp_index = modified_orders.index[modified_orders['status'].eq('delivered')]
modified_orders.loc[tmp_index, 'status'] = 'delivered_customer'

In [38]:

# Spot-check one order by its id.
is_sample_order = modified_orders['order_id'].eq('2d1e2d5bf4dc7227b3bfebb81328c15f')
modified_orders.loc[is_sample_order]

Unnamed: 0,order_id,status,timestamp
3002,2d1e2d5bf4dc7227b3bfebb81328c15f,purchase,2017-11-28 17:44:07
102443,2d1e2d5bf4dc7227b3bfebb81328c15f,approved,2017-11-28 17:56:40
201884,2d1e2d5bf4dc7227b3bfebb81328c15f,delivered_carrier,2017-11-30 18:12:23
301325,2d1e2d5bf4dc7227b3bfebb81328c15f,delivered_customer,NaT


In [39]:
# Order rows by order_id and then timestamp, and renumber them 0..n-1 so that
# positional and label-based indexing agree from here on.
modified_orders = (
    modified_orders
    .sort_values(['order_id', 'timestamp'])
    .reset_index(drop=True)
)

In [40]:
# Index labels of every row that still lacks a timestamp.
target_mock_index = modified_orders.index[modified_orders['timestamp'].isna()]
target_mock_index

Index([    98,    294,    309,    916,    944,    983,   1001,   1080,   1230,
         1289,
       ...
       393904, 393907, 394038, 394057, 394285, 394433, 394725, 394889, 395028,
       395031],
      dtype='int64', length=2980)

In [41]:
# Status labels found on the rows selected by target_mock_index.
modified_orders['status'].iloc[target_mock_index].unique()

array(['unavailable', 'shipped', 'canceled', 'invoiced', 'processing',
       'delivered_customer'], dtype=object)

In [42]:
# modified_orders.loc[393639]

In [43]:
# Status labels on rows without a timestamp, after sorting and re-indexing.
no_timestamp = modified_orders['timestamp'].isna()
modified_orders.loc[no_timestamp, 'status'].unique()

array(['unavailable', 'shipped', 'canceled', 'invoiced', 'processing',
       'delivered_customer'], dtype=object)

In [44]:
"""
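CAUTION:
- To test the logic of the real-time stream processing system, missing timestamps are estimated logically, following the status order defined by the business rules.
- For analysis purposes, do not fill null values arbitrarily: keep them as they are and store them in a DLQ or isolated storage, to be handled after discussion.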
"""

# Upper bound for the synthetic offset and the seed that makes it reproducible.
MOCK_MAX_OFFSET = timedelta(days=3)
MOCK_SEED = 42


def fill_missing_timestamps(frame, max_offset=MOCK_MAX_OFFSET, seed=MOCK_SEED):
    """Return a copy of ``frame`` with missing 'timestamp' values mocked.

    Each missing timestamp is set to the previous row's timestamp plus a
    random offset drawn uniformly from [0, max_offset]. The row labelled 0
    has no previous row, so it uses the next row's timestamp minus the offset.

    The values are synthetic and meant for exercising downstream logic only;
    they are not real event times.

    Changes compared with the earlier inline loop:
    - a local, seeded generator makes the result identical on every run;
    - the offset is drawn per row rather than once for all rows;
    - only rows that actually lack a timestamp are visited.

    Assumes ``frame`` has the default 0..n-1 integer index and is sorted so
    that a row's neighbour is the appropriate reference (in this notebook:
    sorted by order_id, then timestamp, with missing values last).
    NOTE(review): the neighbouring row is not checked to belong to the same
    order_id - confirm this cannot happen in the input data.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime 'timestamp' column.
    max_offset : datetime.timedelta
        Largest offset that may be added.
    seed : int
        Seed for the local random generator.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.
    """
    filled = frame.copy()
    rng = random.Random(seed)
    max_seconds = max_offset.total_seconds()

    # Ascending label order matters: when several consecutive rows are
    # missing, each one builds on the value just written to the row before it.
    missing_labels = filled.index[filled['timestamp'].isna()]
    for label in missing_labels:
        offset = timedelta(seconds=rng.uniform(0, max_seconds))
        if label == 0:
            mock_timestamp = filled.loc[label + 1, 'timestamp'] - offset
        else:
            mock_timestamp = filled.loc[label - 1, 'timestamp'] + offset
        filled.loc[label, 'timestamp'] = mock_timestamp
    return filled


df = fill_missing_timestamps(modified_orders)

In [45]:
# Null check on the rows that previously had no timestamp: count each
# combination of per-column null flags.
filled_rows = df.iloc[target_mock_index]
filled_rows.isna().value_counts()

order_id  status  timestamp
False     False   False        2980
Name: count, dtype: int64

In [46]:
# Write the events in chronological order as a tab-separated file, without
# the index column.
output_path = f"{REDEFINED_DIR}/order_status.tsv"
chronological_events = df.sort_values(['timestamp'])
chronological_events.to_csv(output_path, index=False, sep='\t')