In [405]:
import pandas as pd
import os
import numpy as np
from datetime import timedelta
import random

In [406]:
from pathlib import Path

REDEFINED_DIR = "../../downloads/olist_redefined"
os.makedirs(REDEFINED_DIR, exist_ok=True)

In [407]:
orders = pd.read_csv("../../downloads/olist/olist_orders_dataset.csv")

In [408]:
orders.drop(columns='customer_id', inplace=True)

In [409]:
melted_orders = pd.melt(
    orders,
    id_vars=['order_id', 'order_status'],  # 고정할 컬럼
    value_vars=['order_purchase_timestamp', 'order_approved_at', 
                'order_delivered_carrier_date', 'order_delivered_customer_date', 
                ],  # 변환할 컬럼: datetime이 있는 모든 열
    var_name='status',
    value_name='timestamp'  # 값이 타임스탬프이므로 'datetime'으로 설정
).sort_values(['order_id', 'timestamp'])

melted_orders.rename(columns={'order_status': "final_status"}, inplace=True)
melted_orders['status'] = melted_orders['status'].apply(lambda x: "_".join(x.split("_")[1:-1]))
melted_orders['timestamp'] = pd.to_datetime(melted_orders['timestamp'])

In [410]:
melted_orders['final_status'].unique()

array(['delivered', 'unavailable', 'shipped', 'canceled', 'invoiced',
       'processing', 'approved', 'created'], dtype=object)

In [411]:
melted_orders['final_status'].unique()

array(['delivered', 'unavailable', 'shipped', 'canceled', 'invoiced',
       'processing', 'approved', 'created'], dtype=object)

In [412]:
print('Before processing ...')
for same_status in ['created', 'canceled']:
    print(f"The status count for {same_status}: ", melted_orders[melted_orders['final_status']== same_status].shape)

Before processing ...
The status count for created:  (20, 4)
The status count for canceled:  (2500, 4)


In [413]:
melted_orders.loc[melted_orders['final_status'] == 'created', 'final_status'] = 'canceled'

In [414]:
print('After processing ...')
for same_status in ['created', 'canceled']:
    print(f"The status count for {same_status}: ", melted_orders[melted_orders['final_status']== same_status].shape)

After processing ...
The status count for created:  (0, 4)
The status count for canceled:  (2520, 4)


In [415]:
modified_orders = melted_orders.copy()
modified_orders['status'] = np.where(
    modified_orders['timestamp'].isnull(), # 조건
    modified_orders['final_status'],       # 조건이 참일 때 값
    modified_orders['status']              # 조건이 거짓일 때 값
)

modified_orders.drop(columns=['final_status'], inplace=True)
modified_orders.drop_duplicates(inplace=True)
modified_orders['status'].unique()

array(['purchase', 'approved', 'delivered_carrier', 'delivered_customer',
       'unavailable', 'shipped', 'canceled', 'invoiced', 'processing',
       'delivered'], dtype=object)

In [416]:
modified_orders[modified_orders['timestamp'].isna()]

Unnamed: 0,order_id,status,timestamp
234020,0010dedd556712d7bb69a19cb7bbd37a,unavailable,NaT
342377,002f19a65a2ddd70a090297872e6d64e,shipped,NaT
284110,00310b0c75bb13015ec4d82d341865a4,canceled,NaT
282680,00a500bc03bc4ec968e574c2553bed4b,unavailable,NaT
327777,00a99c50fdff7e36262caba33821875a,shipped,NaT
...,...,...,...
370511,ff0768e3356919b32e3d7f30baecfcfb,shipped,NaT
388546,ff358345576214cc9e08488188973c84,shipped,NaT
331687,ff536d93ae4214b4d51c2894ccfc569f,shipped,NaT
294288,ff7140ba310a4717112d39c0cd0b4062,processing,NaT


In [417]:
modified_orders = modified_orders.sort_values(['order_id', 'timestamp'])
modified_orders.reset_index(inplace=True, drop=True)

In [418]:
target_mock_index = modified_orders[modified_orders['timestamp'].isna()].index
target_mock_index

Index([    98,    294,    309,    916,    944,    983,   1001,   1080,   1230,
         1289,
       ...
       393904, 393907, 394038, 394057, 394285, 394433, 394725, 394889, 395028,
       395031],
      dtype='int64', length=2980)

In [419]:
# modified_orders.loc[393639]

In [420]:
df = modified_orders.copy()
three_days = timedelta(days=3)
random_seconds = random.uniform(0, three_days.total_seconds())
random_timedelta = timedelta(seconds=random_seconds)
for index in df.index:
    if pd.isnull(df.loc[index, 'timestamp']):
        if index == 0:
            mock_timestamp = df.loc[index + 1, 'timestamp'] - random_timedelta
        else:
            mock_timestamp = df.loc[index - 1, 'timestamp'] + random_timedelta
    
        df.loc[index, 'timestamp'] = mock_timestamp

In [421]:
df.iloc[target_mock_index]

Unnamed: 0,order_id,status,timestamp
98,0010dedd556712d7bb69a19cb7bbd37a,unavailable,2017-10-25 01:40:26.993392
294,002f19a65a2ddd70a090297872e6d64e,shipped,2018-03-22 22:28:29.993392
309,00310b0c75bb13015ec4d82d341865a4,canceled,2018-08-16 13:19:19.993392
916,00a500bc03bc4ec968e574c2553bed4b,unavailable,2017-11-26 09:09:32.993392
944,00a99c50fdff7e36262caba33821875a,shipped,2018-08-19 07:21:54.993392
...,...,...,...
394433,ff0768e3356919b32e3d7f30baecfcfb,shipped,2018-04-06 21:14:19.993392
394725,ff358345576214cc9e08488188973c84,shipped,2018-03-06 15:33:52.993392
394889,ff536d93ae4214b4d51c2894ccfc569f,shipped,2018-01-04 16:37:03.993392
395028,ff7140ba310a4717112d39c0cd0b4062,processing,2017-05-20 16:45:21.993392


In [422]:
df.sort_values(['timestamp']).to_csv(f"{REDEFINED_DIR}/order_status.tsv", index=False, sep='\t')