In [7]:
import pandas as pd
from src.load_df import *
import duckdb

In [5]:
# Folder under ARTIFACT_DIR that receives every feature CSV written by this notebook.
# NOTE(review): `os` and `ARTIFACT_DIR` are not imported explicitly in the visible cells;
# presumably both arrive through `from src.load_df import *` — confirm.
save_dir = os.path.join(ARTIFACT_DIR, 'features')
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(save_dir, exist_ok=True)

## 배송 지연 일 수 (예상 배송일 − 실제 배송일; 음수이면 지연)

In [6]:
# Load the ORDERS table through the project loader `get_origin_df`.
orders = get_origin_df(OlistFileName.ORDERS)
# Preview a single row to see which columns are available.
orders.head(1)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00


In [21]:
# Substring search: LIKE with % wildcards returns every order_id containing "481".
# duckdb resolves the table name against pandas DataFrames in the Python scope,
# so the query targets `orders` (loaded above). The previous target, `tmp_df`,
# is not defined in any visible cell and breaks Restart & Run All.
duckdb.query("""
    SELECT order_id FROM orders
    WHERE order_id LIKE '%481%'
""")

┌──────────────────────────────────┐
│             order_id             │
│             varchar              │
├──────────────────────────────────┤
│ e481f51cbdc54678b7cc49136f2d6af7 │
│ 47aa4816b27ba60ec948cd019cc1afc1 │
│ 9defaf92cff22420e4e8ef7784815a55 │
│ e22b71f6e4a481445ec4527cb4c405f7 │
│ 66cf92fc3ec94fbafc4edd77e0814818 │
│ 5a630e3d4aeb6fca86404481eb19cfe7 │
│ d5211af24819d319c02a285e7bb51750 │
│ 4dcc01481b7ab4f5f37c8eb4c4c67841 │
│ 045c9852284420b9c02d156436e1481c │
│ 68553e9e7b2c06807226ff348142d3ba │
│                ·                 │
│                ·                 │
│                ·                 │
│ 92d16bd458db6ad7e89154817101087f │
│ 41481f110ca58f4d3ee2e077d0313ff3 │
│ a41b4eceec5fb4df1207586f935af481 │
│ 2911b82b07d1481125508ab8b0292446 │
│ af4812c0286cb623517321205ffb5d1e │
│ 2b5bd09c3c5b318e1b7293015fdfb481 │
│ a0fd703a1c6a122520246bd8d481e528 │
│ 498bd6074c1c42f8c5bf3481c471c67e │
│ d81e1aea4814c6e5744458bcd1f94693 │
│ 3cea94817a51f34aa5937784fb4a3219 │
├

In [20]:
# Contrast with LIKE: IN compares whole values, so the fragment '481' matches
# no complete order_id and the query returns 0 rows.
# duckdb resolves the table name against pandas DataFrames in the Python scope,
# so the query targets `orders` (loaded above). The previous target, `tmp_df`,
# is not defined in any visible cell and breaks Restart & Run All.
duckdb.query("""
    SELECT order_id FROM orders
    WHERE order_id IN ('481')
""")

┌──────────┐
│ order_id │
│ varchar  │
├──────────┤
│  0 rows  │
└──────────┘

In [4]:
# Name tokens that mark a column as holding a point in time.
keywords = {"time", "timestamp", "date", "at"}

# Keep every column whose underscore-separated, lower-cased name shares at
# least one token with `keywords`.
cols_time = [
    name for name in orders.columns
    if not keywords.isdisjoint(name.lower().split("_"))
]
cols_time

['order_purchase_timestamp',
 'order_approved_at',
 'order_delivered_carrier_date',
 'order_delivered_customer_date',
 'order_estimated_delivery_date']

In [5]:
# Parse every time-like column to datetime (values that cannot be parsed become
# NaT because of errors="coerce") and carry order_id along as the key column.
orders_times = (
    orders[cols_time]
    .apply(lambda column: pd.to_datetime(column, errors="coerce"))
    .assign(order_id=orders["order_id"])
)
orders_times

Unnamed: 0,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_id
0,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,e481f51cbdc54678b7cc49136f2d6af7
1,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,53cdb2fc8bc7dce0b6741e2150273451
2,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,47770eb9100c2d0c44946d9cf07ec65d
3,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,949d5b44dbf5de918fe9c16f97b45f8a
4,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,ad21c59c0840e6cb83a9ceb5573f8159
...,...,...,...,...,...,...
99436,2017-03-09 09:54:05,2017-03-09 09:54:05,2017-03-10 11:18:03,2017-03-17 15:08:01,2017-03-28,9c5dedf39a927c1b2549525ed64a053c
99437,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02,63943bddc261676b46f01ca7ac2f7bd8
99438,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27,83c1379a015df1e13d02aae0204711ab
99439,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15,11c177c8e97725db2631073c19f07b62


In [6]:
# Estimated minus actual delivery date: positive = delivered ahead of the
# estimate, negative = delivered late. Both operands are datetime columns, so
# the result is a Timedelta column (not an integer day count) and is NaT
# wherever either date is missing.
orders_times["delivery_diff_days"] = orders_times['order_estimated_delivery_date'] - orders_times['order_delivered_customer_date']

In [7]:
# Persist the per-order delivery difference.
# index=False: the RangeIndex carries no information (order_id is the key), and
# this keeps the file layout consistent with is_late_delivery.csv, which is
# also written without the index column.
save_path = os.path.join(save_dir, "delivery_diff_days.csv")
orders_times[['order_id', 'delivery_diff_days']].to_csv(save_path, index=False)

## 배송지연 여부 (지연되었거나 배송 완료일이 없는 주문만 저장)

In [8]:
# Check: some orders have no order_delivered_customer_date, so their diff is NaT.

# Late = delivered after the estimate (negative diff). Orders with a missing
# diff are kept together with the late ones.
delivery_diff = orders_times["delivery_diff_days"]
mask_late = delivery_diff.dt.total_seconds().lt(0)
mask_nat = delivery_diff.isna()

is_late_delivery = orders_times.loc[
    mask_late | mask_nat,
    ["order_id", "delivery_diff_days"],
]

save_path = os.path.join(save_dir, "is_late_delivery.csv")
is_late_delivery.to_csv(save_path, index=False)

## 주문 별 상품 수

In [9]:
# Load the ORDER_ITEMS table through the project loader `get_origin_df`.
order_items = get_origin_df(OlistFileName.ORDER_ITEMS)
# Preview the first rows (one row per order item, keyed by order_id + order_item_id).
order_items.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [10]:
# Number of order_items rows per order_id, written with order_id as the index.
save_path = os.path.join(save_dir, 'order_items_count.csv')
items_per_order = order_items['order_id'].value_counts()
items_per_order.to_csv(save_path)

## 주문 별 판매자 개수

In [11]:
# Count the distinct sellers involved in each order, then persist the Series
# (order_id is the index, so it is written as the first CSV column).
seller_count_by_order = (
    order_items
    .groupby('order_id')['seller_id']
    .nunique()
)
save_path = os.path.join(save_dir, "seller_count_by_order.csv")
seller_count_by_order.to_csv(save_path)

In [12]:
# Orders fulfilled by more than one distinct seller.
seller_count_by_order.loc[seller_count_by_order.gt(1)]

order_id
002f98c0f7efd42638ed6100ca699b42    2
00bcee890eba57a9767c7b5ca12d3a1b    2
01144cadcf64b6427f0a6580a3033220    2
013a98b3a668bcef05b98898177f6923    2
014405982914c2cde2796ddcf0b8703d    2
                                   ..
feded510efe2a76dd134d8533af626f9    2
feea1134dca132388da564d8f026a2ca    2
ff00a56fe9475a175cd651d77c707a09    2
ff2a353ee8dfd97ba95c73cca4ed2636    2
ffb8f7de8940249a3221252818937ecb    3
Name: seller_id, Length: 1278, dtype: int64

In [13]:
# Check: a single order can contain several different products.
order_items.loc[order_items["order_id"].eq("002f98c0f7efd42638ed6100ca699b42")]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
80,002f98c0f7efd42638ed6100ca699b42,1,d41dc2f2979f52d75d78714b378d4068,7299e27ed73d2ad986de7f7c77d919fa,2017-08-10 09:30:15,8.99,32.57
81,002f98c0f7efd42638ed6100ca699b42,2,880be32f4db1d9f6e2bec38fb6ac23ab,fa40cc5b934574b62717c68f3d678b6d,2017-08-10 09:30:15,44.9,7.16


## 거리 및 지역 정보

In [14]:
# Load the tables needed for the location features.
# customers and sellers come from `get_origin_df`; geolocation comes from
# `get_preprocessed_df` instead.
customers = get_origin_df(OlistFileName.CUSTOMERS)
sellers = get_origin_df(OlistFileName.SELLERS)
geolocation = get_preprocessed_df(OlistFileName.GEOLOCATION)

# Row/column count of the geolocation table.
geolocation.shape

(19023, 5)

In [15]:
# Display the sellers table (seller_id, zip-code prefix, city, state).
sellers

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
...,...,...,...,...
3090,98dddbc4601dd4443ca174359b237166,87111,sarandi,PR
3091,f8201cab383e484733266d1906e2fdfa,88137,palhoca,SC
3092,74871d19219c7d518d0090283e03c137,4650,sao paulo,SP
3093,e603cf3fec55f8697c9059638d6c8eb5,96080,pelotas,RS


In [49]:
# Attach latitude/longitude to every seller by matching zip-code prefixes.
# how='left' keeps sellers whose prefix has no geolocation row; their
# coordinates stay NaN. Fully identical rows are then dropped.
seller_location = (
    sellers[['seller_id', 'seller_zip_code_prefix']]
    .merge(
        geolocation[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
        how='left',
        left_on='seller_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
    )
    .drop_duplicates()
)
seller_location.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
0,3442f8959a84dea7ee197c632cb2df15,13023,13023.0,-22.893317,-47.060596
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,13844.0,-22.383375,-46.948142
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,20031.0,-22.909446,-43.18024
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,4195.0,-23.657118,-46.61273
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,12914.0,-22.964546,-46.534214


In [50]:
# Attach latitude/longitude to every customer by matching zip-code prefixes.
# how='left' keeps customers whose prefix has no geolocation row; their
# coordinates stay NaN. Fully identical rows are then dropped.
customer_location = (
    customers[['customer_id', 'customer_zip_code_prefix']]
    .merge(
        geolocation[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
        how='left',
        left_on='customer_zip_code_prefix',
        right_on='geolocation_zip_code_prefix',
    )
    .drop_duplicates()
)
customer_location.head()

Unnamed: 0,customer_id,customer_zip_code_prefix,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng
0,06b8999e2fba1a1fbc88172c00ba8bc7,14409,14409.0,-20.499273,-47.396658
1,18955e83d337fd6b2def6b18a428ac77,9790,9790.0,-23.728396,-46.54225
2,4e7b3e00288586ebd08712fdd0374a03,1151,1151.0,-23.531309,-46.65669
3,b2b6027bc5c5109e529d4dc6358b12c3,8775,8775.0,-23.50067,-46.186348
4,4f2d8ab171c80ec8364f7c12e35b23ad,13056,13056.0,-22.975708,-47.14314


In [51]:
# Reload ORDER_ITEMS and ORDERS through `get_origin_df`.
# NOTE(review): both tables were already loaded under the same names in
# earlier cells; this rebinds them to fresh copies — confirm the reload is needed.
order_items = get_origin_df(OlistFileName.ORDER_ITEMS)
orders = get_origin_df(OlistFileName.ORDERS)

In [59]:
# Pair each order's customer with the seller of every item in that order.
# The left join keeps orders that have no rows in order_items (seller_id is NaN),
# and an order appears once per item row.
joined_customer_seller_order = orders[['order_id', 'customer_id']].merge(
    order_items[['order_id', 'seller_id']],
    how='left',
    on='order_id',
)
joined_customer_seller_order

Unnamed: 0,order_id,customer_id,seller_id
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,3504c0cb71d7fa48d967e0e4c94d59d9
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,289cdb325fb7e7f891c38608bf9e0962
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,4869f7a5dfa277a7dca6462dcf3b52b2
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,66922902710d126a0e7d26b0e3805106
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,2c9e548be18521d1c43cde1c582c6de8
...,...,...,...
113420,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,1f9ab4708f3056ede07124aad39a2554
113421,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,d50d79cb34e38265a8649c383dcffd48
113422,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48
113423,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48


In [75]:
# Bring customer and seller coordinates onto each (order, customer, seller) row.
# The first merge adds the customer's lat/lng under the plain geolocation_* names.
# The second merge collides on those names, so the suffixes turn them into
# *_customer (left side) and *_seller (right side).
# Repeated (order, seller) rows from multi-item orders collapse in drop_duplicates.
locations = (
    joined_customer_seller_order
    .merge(
        customer_location[['customer_id', 'geolocation_lat', 'geolocation_lng']],
        how='left',
        on='customer_id',
    )
    .merge(
        seller_location[['seller_id', 'geolocation_lat', 'geolocation_lng']],
        how='left',
        on='seller_id',
        suffixes=('_customer', '_seller'),
    )
    .drop_duplicates()
)
locations.shape

(100785, 7)

In [76]:
# Display the combined table: order/customer/seller ids with customer and seller coordinates.
locations

Unnamed: 0,order_id,customer_id,seller_id,geolocation_lat_customer,geolocation_lng_customer,geolocation_lat_seller,geolocation_lng_seller
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,3504c0cb71d7fa48d967e0e4c94d59d9,-23.577482,-46.587077,-23.680862,-46.444311
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,289cdb325fb7e7f891c38608bf9e0962,-12.186877,-44.540232,-19.807885,-43.980818
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,4869f7a5dfa277a7dca6462dcf3b52b2,-16.745150,-48.514783,-21.363473,-48.229588
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,66922902710d126a0e7d26b0e3805106,-5.774002,-35.270976,-19.836871,-43.923241
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,2c9e548be18521d1c43cde1c582c6de8,-23.676257,-46.514580,-23.541525,-46.262148
...,...,...,...,...,...,...,...
113419,9c5dedf39a927c1b2549525ed64a053c,39bd1228ee8140590ac3aca26f2dfe00,e24fc9fcd865784fb25705606fe3dfe7,-23.177943,-45.882139,-22.960926,-46.524336
113420,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,1f9ab4708f3056ede07124aad39a2554,-24.001334,-46.450022,-21.930464,-50.498065
113421,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,d50d79cb34e38265a8649c383dcffd48,-17.898045,-39.373106,-23.553949,-46.453257
113422,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48,-22.563909,-42.695343,-20.940712,-45.827195
