1. 피쳐 생성
    
    1.1 주문 정보

    1.2 유저 정보
    
    1.3 제품 정보
2. 유저별 구매내역으로 데이터 재가공

    2.1 유저별 구매내역 생성

    2.2 데이터를 하나로 병합
 
3. 훈련/테스트 데이터 분리하기

    3.1 훈련/테스트로 데이터 분리

    3.2 훈련 데이터에 타겟 추가

In [1]:
import numpy as np
import pandas as pd

RAW_PATH = '/content/drive/MyDrive/data/instacart-market-basket-analysis/raw/'
PREPARED_PATH = '/content/drive/MyDrive/data/instacart-market-basket-analysis/prepared/'

In [21]:
def reduce_memory(df):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    for col in df.columns:
        if df[col].dtypes in ["int64", "int32", "int16"]:
            cmin = df[col].min()
            cmax = df[col].max()
            if cmin > np.iinfo(np.int8).min and cmax < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif cmin > np.iinfo(np.int16).min and cmax < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif cmin > np.iinfo(np.int32).min and cmax < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
        if df[col].dtypes in ["float64", "float32"]:
            cmin = df[col].min()
            cmax = df[col].max()
            if cmin > np.finfo(np.float16).min and cmax < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif cmin > np.finfo(np.float32).min and cmax < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
    print("")
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df

# 1. 피쳐 생성

In [12]:
orders = pd.read_csv(RAW_PATH + 'orders.csv', dtype={
                            'order_id': np.uint32,              # 1 ~ 3421083
                            'user_id': np.uint32,               # 1 ~ 206209
                            'eval_set': 'category',             # 3가지
                            'order_number': np.uint8,           # 1 ~ 100
                            'order_dow': np.uint8,              # 0 ~ 6 
                            'order_hour_of_day': np.uint8,      # 0 ~ 23
                            'days_since_prior_order': np.float32})
priors = pd.read_csv(RAW_PATH + 'order_products__prior.csv', dtype={
                            'order_id': np.uint32,          # 1 ~ 3421083
                            'product_id': np.uint16,        # 1 ~ 49688
                            'add_to_cart_order': np.uint8,  # 1 ~ 80 
                            'reordered': np.uint8})         # 0 ~ 1
train = pd.read_csv(RAW_PATH + 'order_products__train.csv', dtype={
                            'order_id': np.uint32,          # 1 ~ 3421083
                            'product_id': np.uint16,        # 1 ~ 49688
                            'add_to_cart_order': np.uint8,  # 1 ~ 80 
                            'reordered': np.uint8})         # 0 ~ 1
products = pd.read_csv(RAW_PATH + 'products.csv', dtype={
                              'product_id': np.uint16,     # 1 ~ 49688
                              'aisle_id': np.uint8,        # 1 ~ 134
                              'department_id': np.uint8}) # 1 ~ 21
aisles = pd.read_csv(RAW_PATH + 'aisles.csv', dtype={'product_id': np.uint16})
departments = pd.read_csv(RAW_PATH + 'departments.csv', dtype={'product_id': np.uint16})

In [6]:
# 이전 구매내역에 주문 정보 추가
prior_df = priors.merge(orders, on='order_id', how='left')
prior_df = prior_df.merge(products, on='product_id', how='left')

print(prior_df.shape)
prior_df.head()

In [7]:
prior_df.to_pickle(PREPARED_PATH + 'prior_df.pkl')

## 1.1 주문 정보

이전(prior) 주문별 정보

---
    items : 주문당 구매 제품 수
    reorder_in_order : 주문당 재주문 제품 비율

In [4]:
order_feats = prior_df.groupby(['user_id','order_number']).agg(basket_size=('product_id','count'), 
                                                               reorder_in_order=('reordered', 'mean'))
order_feats.reset_index(inplace=True)
order_feats = order_feats.merge(orders.drop('eval_set', axis=1), on=['user_id','order_number'], how='left')
col = ['user_id', 'order_number', 'basket_size','reorder_in_order', 'order_dow','order_hour_of_day', 'days_since_prior_order', 'order_id']
order_feats = order_feats[col]

print(order_feats.shape)
order_feats.head()

(3214874, 8)


Unnamed: 0,user_id,order_number,basket_size,reorder_in_order,order_dow,order_hour_of_day,days_since_prior_order,order_id
0,1,1,5,0.0,2,8,,2539329
1,1,2,6,0.5,3,7,15.0,2398795
2,1,3,5,0.6,3,12,21.0,473747
3,1,4,5,1.0,4,7,29.0,2254736
4,1,5,8,0.625,4,15,28.0,431534


In [5]:
# 마지막 주문 3개 뽑기
last_three_orders = order_feats.sort_values('order_number', ascending=False)\
                    .groupby('user_id').head(3).sort_values(['user_id', 'order_number'], ignore_index=True)
last_three_orders['rank'] = last_three_orders.groupby('user_id')['order_number'].rank()

print(last_three_orders.shape)
last_three_orders.head()

(618627, 9)


Unnamed: 0,user_id,order_number,basket_size,reorder_in_order,order_dow,order_hour_of_day,days_since_prior_order,order_id,rank
0,1,8,6,0.666667,1,14,14.0,3108588,1.0
1,1,9,6,1.0,1,16,0.0,2295261,2.0
2,1,10,9,0.666667,4,8,30.0,2550362,3.0
3,2,12,19,0.578947,1,9,28.0,3186735,1.0
4,2,13,9,0.0,4,11,30.0,3268552,2.0


In [6]:
last_three_orders.to_pickle(PREPARED_PATH + 'last_three_orders.pkl')

## 1.2 유저 정보

유저별 정보

---
    U_total_orders : 총 주문 수
    U_total_products : 총 구매 제품 수
    U_unique_products : 총 구매 제품 종류 수
    U_total_reorders : 총 재주문 수
    U_unique_reordered_products : 총 재주문 제품 종류 수
    U_avg_basket_size : 주문당 평균 구매 제품 수
    U_avg_reorder_in_order : 주문당 평균 재구매 제품 비율
    U_basket_size_order : 마지막 3개의 주문 구매 제품 수
    U_re_in_order : 마지막 3개의 주문 재주문 제품 포함 비율

In [42]:
user_feats = prior_df.groupby('user_id').agg(U_total_orders=('order_number', 'max'), 
                                             U_total_products=('product_id', 'count'), 
                                             U_unique_products=('product_id', 'nunique'),
                                             U_total_reorders=('reordered', 'sum'))
user_feats['U_unique_reordered_products'] = prior_df[prior_df.reordered == 1].drop_duplicates(['user_id', 'product_id']).groupby('user_id').product_id.nunique()
user_feats['U_unique_reordered_products'].fillna(0, inplace=True)
user_feats['U_avg_basket_size'] = order_feats.groupby('user_id')['basket_size'].mean()
user_feats['U_avg_reorder_in_order'] = order_feats[order_feats['order_number'] != 1].groupby('user_id')['reorder_in_order'].mean()
user_feats.reset_index(inplace=True)

last_order_feats = last_three_orders.pivot_table(index = 'user_id', columns = ['rank'], 
                                                 values=['basket_size', 'reorder_in_order']).\
                                                 reset_index(drop = False)
last_order_feats.columns = ['user_id','U_basket_size_order_3', 'U_basket_size_order_2', 'U_basket_size_order_1', 'U_re_in_order_3', 'U_re_in_order_2', 'U_re_in_order_1']
user_feats = user_feats.merge(last_order_feats, on='user_id', how='left')

print(user_feats.shape)
user_feats.head()

(206209, 14)


Unnamed: 0,user_id,U_total_orders,U_total_products,U_unique_products,U_total_reorders,U_unique_reordered_products,U_avg_basket_size,U_avg_reorder_in_order,U_basket_size_order_3,U_basket_size_order_2,U_basket_size_order_1,U_re_in_order_3,U_re_in_order_2,U_re_in_order_1
0,1,10,59,18,41.0,10.0,5.9,0.784259,6,6,9,0.666667,1.0,0.666667
1,2,14,195,102,93.0,37.0,13.928571,0.482419,19,9,16,0.578947,0.0,0.625
2,3,12,88,33,55.0,19.0,7.333333,0.71871,6,5,6,0.833333,1.0,1.0
3,4,5,18,17,1.0,1.0,3.6,0.035714,7,2,3,0.142857,0.0,0.0
4,5,4,37,23,14.0,8.0,9.25,0.503704,9,5,12,0.444444,0.4,0.666667


In [69]:
user_feats.to_pickle(PREPARED_PATH + 'user_feats.pkl')

## 1.3 제품 정보

전체 주문 제품 통계

---
    P_total_orders : 총 판매 수 
    P_unique_users : 구매한 유저 수
    P_total_reorders : 총 재주문 수 
    P_reorder_rate : 재주문 율
    P_mean_cart_order : 평균 장바구니 순위
    is_organic : 유기농 제품 여부
    is_low_fat : 저지방/무지방 제품 여부

    소분류/대분류 별
    total_orders : 총 판매 수
    total_reorders : 총 재주문 수
    reorder_rate : 재주문 율
    mean_cart_order : 평균 장바구니 순위

In [13]:
prod_feats = prior_df.groupby('product_id').agg(P_total_orders=('product_id', 'count'), 
                                                P_unique_users=('user_id','nunique'), 
                                                P_total_reorders=('reordered', 'sum'), 
                                                P_reorder_rate=('reordered', 'mean'), 
                                                P_mean_cart_order=('add_to_cart_order', 'mean'))
prod_feats = prod_feats.merge(products, on='product_id', how='left')
prod_feats['is_organic'] = prod_feats['product_name'].apply(lambda x: 1 if 'Organic' in x else 0)
prod_feats['is_low_fat'] = prod_feats['product_name'].apply(lambda x: 1 if 'fat' in x or 'Fat' in x else 0)
prod_feats.sort_values(by='P_total_orders', inplace=True, ascending=False, ignore_index=True)

print(prod_feats.shape)
prod_feats.head()

(49677, 11)


Unnamed: 0,product_id,P_total_orders,P_unique_users,P_total_reorders,P_reorder_rate,P_mean_cart_order,product_name,aisle_id,department_id,is_organic,is_low_fat
0,24852,472565,73956,398609.0,0.843501,4.894129,Banana,24,4,0,0
1,13176,379450,63537,315913.0,0.832555,5.095947,Bag of Organic Bananas,24,4,1,0
2,21137,264683,58838,205845.0,0.777704,7.248902,Organic Strawberries,24,4,1,0
3,21903,241921,55037,186884.0,0.7725,7.42964,Organic Baby Spinach,123,4,1,0
4,47209,213584,43453,170131.0,0.796553,6.775011,Organic Hass Avocado,24,4,1,0


In [14]:
# 소분류
aisle_feats = prior_df.groupby('aisle_id').agg(A_total_orders=('reordered', 'count'), 
                                               A_total_reorders=('reordered', 'sum'), 
                                               A_reorder_rate=('reordered', 'mean'), 
                                               A_avg_cart_order=('add_to_cart_order', 'mean'))
aisle_feats.sort_values(by='A_total_orders', inplace=True, ascending=False)
aisle_feats.reset_index(inplace=True)

print(aisle_feats.shape)
aisle_feats.head()

(134, 5)


Unnamed: 0,aisle_id,A_total_orders,A_total_reorders,A_reorder_rate,A_avg_cart_order
0,24,3642188,2615469.0,0.718104,7.144228
1,83,3418021,2032172.0,0.594546,8.852012
2,123,1765313,1127177.0,0.638514,8.399527
3,120,1452343,997018.0,0.686489,7.861644
4,21,979763,573383.0,0.585226,9.078364


In [15]:
# 대분류
dp_feats = prior_df.groupby('department_id').agg(D_total_orders=('reordered', 'count'), 
                                                 D_total_reorders=('reordered', 'sum'), 
                                                 D_reorder_rate=('reordered', 'mean'), 
                                                 D_avg_cart_order=('add_to_cart_order', 'mean'))
dp_feats.sort_values(by='D_total_orders', inplace=True, ascending=False)
dp_feats.reset_index(inplace=True)

print(dp_feats.shape)
dp_feats.head()

(21, 5)


Unnamed: 0,department_id,D_total_orders,D_total_reorders,D_reorder_rate,D_avg_cart_order
0,4,9479291,6160710.0,0.649913,8.022875
1,16,5414016,3627221.0,0.669969,7.495423
2,19,2887550,1657973.0,0.57418,9.187743
3,7,2690129,1757892.0,0.65346,6.976699
4,1,2236432,1211890.0,0.541885,8.996414


In [16]:
prod_feats = prod_feats.merge(aisle_feats, on='aisle_id', how='left')
prod_feats = prod_feats.merge(dp_feats, on='department_id', how='left')

print(prod_feats.shape)
prod_feats.head()

(49677, 19)


Unnamed: 0,product_id,P_total_orders,P_unique_users,P_total_reorders,P_reorder_rate,P_mean_cart_order,product_name,aisle_id,department_id,is_organic,is_low_fat,A_total_orders,A_total_reorders,A_reorder_rate,A_avg_cart_order,D_total_orders,D_total_reorders,D_reorder_rate,D_avg_cart_order
0,24852,472565,73956,398609.0,0.843501,4.894129,Banana,24,4,0,0,3642188,2615469.0,0.718104,7.144228,9479291,6160710.0,0.649913,8.022875
1,13176,379450,63537,315913.0,0.832555,5.095947,Bag of Organic Bananas,24,4,1,0,3642188,2615469.0,0.718104,7.144228,9479291,6160710.0,0.649913,8.022875
2,21137,264683,58838,205845.0,0.777704,7.248902,Organic Strawberries,24,4,1,0,3642188,2615469.0,0.718104,7.144228,9479291,6160710.0,0.649913,8.022875
3,21903,241921,55037,186884.0,0.7725,7.42964,Organic Baby Spinach,123,4,1,0,1765313,1127177.0,0.638514,8.399527,9479291,6160710.0,0.649913,8.022875
4,47209,213584,43453,170131.0,0.796553,6.775011,Organic Hass Avocado,24,4,1,0,3642188,2615469.0,0.718104,7.144228,9479291,6160710.0,0.649913,8.022875


In [17]:
prod_feats.to_pickle(PREPARED_PATH + 'prod_feats.pkl')

# 2. 유저별 구매내역으로 데이터 재가공

유저별 구매 내역 제품 통계

---
    UP_total_orders : 총 주문 수
    UP_total_reorders : 총 재주문 수
    UP_reorder_ratio : 해당 제품을 처음 구매한 이후 재주문이 가능한 기간(주문 수)동안의 재주문 율
    UP_mean_cart_order : 평균 장바구니 순위
    order-3,2,1 : 마지막 3번의 주문에서 재주문 여부 (구매 된적이 없다면 -1)

In [7]:
user_feats = pd.read_pickle(PREPARED_PATH + 'user_feats.pkl')
prod_feats = pd.read_pickle(PREPARED_PATH + 'prod_feats.pkl')

prior_df = pd.read_pickle(PREPARED_PATH + 'prior_df.pkl')
last_three_orders = pd.read_pickle(PREPARED_PATH + 'last_three_orders.pkl')

## 2.1 유저별 구매내역 생성

In [8]:
user_prod_feats = prior_df.groupby(['user_id', 'product_id']).agg(UP_total_orders=('reordered', 'count'),
                                                                  UP_total_reorders=('reordered', 'sum'),
                                                                  UP_first_order_num=('order_number', 'min'),
                                                                  UP_mean_cart_order=('add_to_cart_order', 'mean'))
user_prod_feats.reset_index(inplace=True)

# 재구매 가능 기간 재주문율 추가
user_prod_feats = user_prod_feats.merge(user_feats[['user_id', 'U_total_orders']], on='user_id', how='left')
user_prod_feats['order_range_D'] = user_prod_feats.U_total_orders - user_prod_feats.UP_first_order_num
user_prod_feats['UP_reorder_ratio'] = user_prod_feats.UP_total_reorders / user_prod_feats['order_range_D']
user_prod_feats.UP_reorder_ratio.fillna(0, inplace=True)
user_prod_feats.drop(['U_total_orders', 'UP_first_order_num', 'order_range_D'], axis=1, inplace=True)

# 마지막 3번의 주문에서 재구매여부
last_three_orders_prods = prior_df.merge(last_three_orders[['user_id', 'order_number']], on=['user_id', 'order_number'], how='inner')
last_three_orders_prods['rank'] = last_three_orders_prods.groupby(['user_id', 'product_id'])['order_number'].rank()
product_purchase_history = last_three_orders_prods.pivot_table(index = ['user_id', 'product_id'],\
                                                               columns='rank', values = 'reordered')
product_purchase_history.columns = ['UP_order_3', 'UP_order_2', 'UP_order_1']
product_purchase_history.reset_index(inplace=True)
user_prod_feats = user_prod_feats.merge(product_purchase_history, on=['user_id', 'product_id'], how='left')
user_prod_feats = user_prod_feats.replace(np.nan, -1)
del last_three_orders, last_three_orders_prods, product_purchase_history

user_prod_feats.sort_values(['user_id', 'UP_total_reorders'], ascending=[True, False], ignore_index=True, inplace=True)
user_prod_feats = user_prod_feats.astype({'UP_total_orders':np.int8, 
                                          'UP_mean_cart_order':np.float32,
                                          'UP_reorder_ratio': np.float32, 
                                          'UP_order_3': np.int8, 
                                          'UP_order_2': np.int8, 
                                          'UP_order_1': np.int8})

print(user_prod_feats.shape)
user_prod_feats.head()

(13307953, 11)


Unnamed: 0,user_id,product_id,UP_total_orders,UP_total_reorders,UP_mean_cart_order,aisle_id,department_id,UP_reorder_ratio,UP_order_3,UP_order_2,UP_order_1
0,1,196,10,9,1.4,77,7,1.0,1,1,1
1,1,12427,10,9,3.3,23,19,1.0,1,1,1
2,1,10258,9,8,3.333333,117,19,1.0,1,1,1
3,1,25133,8,7,4.0,21,16,1.0,1,1,1
4,1,13032,3,2,6.333333,121,14,0.25,1,-1,-1


In [23]:
user_prod_feats.to_pickle(PREPARED_PATH + 'user_prod_feats.pkl')

## 2.2 데이터 하나로 병합

In [22]:
prepared_df = user_prod_feats.merge(prod_feats, on='product_id', how='left')
prepared_df = prepared_df.merge(user_feats, on='user_id', how='left')
reduce_memory(prepared_df)

print(prepared_df.shape)
prepared_df.head()

Memory usage of properties dataframe is : 3350.543586730957  MB

___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  1357.985468864441  MB
This is  40.53030303030303 % of the initial size
(13307953, 40)


Unnamed: 0,user_id,product_id,UP_total_orders,UP_total_reorders,UP_mean_cart_order,UP_reorder_ratio,UP_order_3,UP_order_2,UP_order_1,P_total_orders,...,U_total_reorders,U_unique_reordered_products,U_avg_basket_size,U_avg_reorder_in_order,U_basket_size_order_3,U_basket_size_order_2,U_basket_size_order_1,U_re_in_order_3,U_re_in_order_2,U_re_in_order_1
0,1,196,10,9,1.400391,1.0,1,1,1,35791,...,41.0,10.0,5.898438,0.78418,6,6,9,0.666504,1.0,0.666504
1,1,12427,10,9,3.300781,1.0,1,1,1,6476,...,41.0,10.0,5.898438,0.78418,6,6,9,0.666504,1.0,0.666504
2,1,10258,9,8,3.333984,1.0,1,1,1,1946,...,41.0,10.0,5.898438,0.78418,6,6,9,0.666504,1.0,0.666504
3,1,25133,8,7,4.0,1.0,1,1,1,6196,...,41.0,10.0,5.898438,0.78418,6,6,9,0.666504,1.0,0.666504
4,1,13032,3,2,6.332031,0.25,1,-1,-1,3751,...,41.0,10.0,5.898438,0.78418,6,6,9,0.666504,1.0,0.666504


In [24]:
prepared_df.to_pickle(PREPARED_PATH + 'prepared_df.pkl')

# 3. 훈련/테스트 데이터 분리하기

In [2]:
orders = pd.read_csv(RAW_PATH + 'orders.csv', dtype={
                                'order_id': np.uint32,              # 1 ~ 3421083
                                'user_id': np.uint32,               # 1 ~ 206209
                                'eval_set': 'category',             # 3가지
                                'order_number': np.uint8,           # 1 ~ 100
                                'order_dow': np.uint8,              # 0 ~ 6 
                                'order_hour_of_day': np.uint8,      # 0 ~ 23
                                'days_since_prior_order': np.float32})
train = pd.read_csv(RAW_PATH + 'order_products__train.csv', dtype={
                               'order_id': np.uint32,          # 1 ~ 3421083
                               'product_id': np.uint16,        # 1 ~ 49688
                               'add_to_cart_order': np.uint8,  # 1 ~ 80 
                               'reordered': np.uint8})         # 0 ~ 1
prepared_df = pd.read_pickle(PREPARED_PATH + 'prepared_df.pkl')

In [5]:
prepared_df.drop('product_name', axis=1, inplace=True)

train = train.merge(orders, on='order_id', how='left')

## 3.1 훈련/테스트로 데이터 분리

In [None]:
# 1. train, test 셋 유저 명단 뽑기 
train_users = orders[orders.eval_set == 'train'].user_id.unique() 
test_users = orders[orders.eval_set == 'test'].user_id.unique() 

# 2. train, test set 분리
train_df = prepared_df[prepared_df['user_id'].isin(train_users)]
test_df = prepared_df[prepared_df['user_id'].isin(test_users)]

## 3.2 훈련 데이터에 타겟 추가

In [7]:
train_df = train_df.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id','product_id'], how='left') 
train_df['reordered'].fillna(0, inplace=True)
train_df = train_df.astype({'reordered': np.uint8})

train_df.sort_values(by=['user_id', 'UP_total_reorders'], ascending=[True, False], inplace=True, ignore_index=True)
train_df.shape, test_df.shape

((8474661, 40), (4833292, 39))

In [8]:
train_df.to_pickle(PREPARED_PATH + 'train_df.pkl')
test_df.to_pickle(PREPARED_PATH + 'test_df.pkl')