In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
# Silence pandas' chained-assignment (SettingWithCopy) warnings for the whole notebook.
# NOTE(review): this also hides genuine "assignment on a copy" mistakes.
pd.options.mode.chained_assignment = None

# Base directory for every input file read below and for the pickle written at the end.
# NOTE(review): presumably a mounted Google Drive folder in Colab; adjust for other environments.
root = '/content/drive/MyDrive/data/instacart-market-basket-analysis/'

# 데이터 불러오기

In [2]:
# Load the order headers. Explicit narrow dtypes are passed so the frames are
# not read with pandas' default 64-bit types.
# NOTE(review): 'days_since_prior_order' is presumably float to allow missing values; confirm.
orders = pd.read_csv(root + 'orders.csv', 
                 dtype={
                        'order_id': np.int32,
                        'user_id': np.int64,
                        'eval_set': 'category',
                        'order_number': np.int16,
                        'order_dow': np.int8,
                        'order_hour_of_day': np.int8,
                        'days_since_prior_order': np.float32})


# Line items (one row per product in an order) of the train orders.
order_products_train = pd.read_csv(root + 'order_products__train.csv', 
                                 dtype={
                                        'order_id': np.int32,
                                        'product_id': np.uint16,
                                        'add_to_cart_order': np.int16,
                                        'reordered': np.int8})

# Line items of the prior orders.
# NOTE(review): this frame is not referenced by the cells visible here; confirm it is needed.
order_products_prior = pd.read_csv(root + 'order_products__prior.csv', 
                                 dtype={
                                        'order_id': np.int32,
                                        'product_id': np.uint16,
                                        'add_to_cart_order': np.int16,
                                        'reordered': np.int8})

# Pre-computed feature tables stored as pickles.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by a trusted source.
product_features = pd.read_pickle(root + 'product_features.pkl')

user_features = pd.read_pickle(root + 'user_features.pkl')

user_product_features = pd.read_pickle(root + 'user_product_features.pkl')

# train 구매내역에 orders 병합

inner 병합으로 train 주문 (마지막 주문 중 일부)만 남게된다.

train 데이터의 구매 내역이다.

In [3]:
# Row/column counts of both inputs, for comparison with the merged frame's shape.
orders.shape, order_products_train.shape

((3421083, 7), (1384617, 4))

In [4]:
# Attach the order header to every train line item. The inner join keeps only
# orders that have rows in order_products_train.
train_orders = pd.merge(
    orders,
    order_products_train,
    how='inner',
    on='order_id',
)
print(train_orders.shape)
train_orders.head(3)

(1384617, 10)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1


In [5]:
# Count train line items by their 'reordered' flag (1 vs 0).
train_orders.reordered.value_counts()

1    828824
0    555793
Name: reordered, dtype: int64

마지막 주문에서 55만 개의 제품이 처음 구매되었다. 

필요없는 컬럼은 드롭해준다. 

In [6]:
# Remove the columns that are not carried into the final table; rebinding the
# name avoids mutating the merged frame in place.
train_orders = train_orders.drop(
    columns=['eval_set', 'add_to_cart_order', 'order_id']
)
train_orders.head(3)

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered
0,1,11,4,8,14.0,196,1
1,1,11,4,8,14.0,25133,1
2,1,11,4,8,14.0,38928,1


train 데이터인 유저 목록을 뽑는다.

In [7]:
# Distinct user ids that appear in the train purchases.
train_users = train_orders.user_id.unique()
train_users[:10]

array([ 1,  2,  5,  7,  8,  9, 10, 13, 14, 17])

유저 & 제품 df에서 train 데이터 유저만 남긴다.

train 데이터 유저의 전체 구매내역 df

In [8]:
# Keep only the (user, product) feature rows whose user appears in the train set.
df = user_product_features.loc[
    user_product_features['user_id'].isin(train_users)
]
print(df.shape)
df.head(3)

(8474661, 11)


Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,order_-1
0,1,196,0.9,10,9,1.4,19.555555,10,1.0,1.0,1.0
1,1,10258,0.888889,9,8,3.333333,19.555555,10,1.0,1.0,1.0
2,1,10326,0.0,1,0,5.0,28.0,5,,,


# 마지막 구매내역과 전체 구매내역 병합

1. train 데이터 유저의 전체 구매내역 : (8,474,661,  11)
2. train 데이터 유저의 마지막 구매내역 : (1,384,617,  10)

1,2를 병합한다. 

마지막 주문에 처음 구매된 제품은 다른 컬럼들이 모두 결측치이다. 우리의 관심 밖이다.





In [9]:
# Outer join on (user_id, product_id): pairs present on only one side are kept,
# and the columns coming from the other side are NaN for those rows.
# NOTE(review): this rebinds `df`, so re-running the cell merges train_orders a second time.
df = df.merge(train_orders, on = ['user_id', 'product_id'], how = 'outer')
print(df.shape)
df.head()

(9030454, 16)


Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,order_-1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1.0,196,0.9,10.0,9.0,1.4,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1.0,10258,0.888889,9.0,8.0,3.333333,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1.0,10326,0.0,1.0,0.0,5.0,28.0,5.0,,,,,,,,
3,1.0,12427,0.9,10.0,9.0,3.3,19.555555,10.0,1.0,1.0,1.0,,,,,
4,1.0,13032,0.666667,3.0,2.0,6.333333,21.666666,10.0,1.0,,,11.0,4.0,8.0,14.0,1.0


결측치 채우기

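`order_number, order_dow, order_hour_of_day, days_since_prior_order, reordered` 컬럼은 마지막 주문 정보이기 때문에 마지막 주문에 포함되지 않은 제품은 결측치를 가지게 된다. 이 중 `order_number, order_dow, order_hour_of_day, days_since_prior_order`의 결측치는 유저별로 동일한 값으로 채워준다. 마지막 주문에 포함되지 않아서 가지지 못한 정보이기 때문에 같은 유저의 마지막 주문 값과 같은 수치를 가지게 만든다. (`reordered`는 타깃이므로 아래에서 따로 처리한다.)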

In [10]:
# Broadcast each user's train-order attributes to all of that user's rows.
# The per-user mean skips NaN, so rows that were missing these values receive
# the value from the user's rows that have it (assuming one train order per
# user, the mean simply reproduces that order's value).
for last_order_col in ('order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'):
    df[last_order_col] = df.groupby('user_id')[last_order_col].transform('mean')
df.head()

Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,order_-1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1.0,196,0.9,10.0,9.0,1.4,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1.0,10258,0.888889,9.0,8.0,3.333333,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1.0,10326,0.0,1.0,0.0,5.0,28.0,5.0,,,,11.0,4.0,8.0,14.0,
3,1.0,12427,0.9,10.0,9.0,3.3,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,
4,1.0,13032,0.666667,3.0,2.0,6.333333,21.666666,10.0,1.0,,,11.0,4.0,8.0,14.0,1.0


유저가 마지막 주문에 처음 구매한 제품은 뺀다.

In [11]:
# Drop rows whose train 'reordered' flag is 0. Rows where 'reordered' is NaN
# are kept, because NaN != 0 evaluates to True.
df = df[df.reordered != 0]
print(df.shape)
df.head()

(8474661, 16)


Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,order_-1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1.0,196,0.9,10.0,9.0,1.4,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1.0,10258,0.888889,9.0,8.0,3.333333,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1.0,10326,0.0,1.0,0.0,5.0,28.0,5.0,,,,11.0,4.0,8.0,14.0,
3,1.0,12427,0.9,10.0,9.0,3.3,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,
4,1.0,13032,0.666667,3.0,2.0,6.333333,21.666666,10.0,1.0,,,11.0,4.0,8.0,14.0,1.0


reordered == target 결측치 채우기

지금 결측치라는 의미는 마지막 주문에 포함되지 않았다는 뜻이다. 따라서 0으로 채우면 우리가 필요한 레이블을 얻을 수 있다.

In [12]:
# Rows whose 'reordered' is still NaN get label 0.
# Assign the filled Series back to the frame instead of calling
# fillna(inplace=True) on `df.reordered`: that form mutates a temporary Series
# and, with pandas Copy-on-Write, leaves `df` unchanged without any error.
df['reordered'] = df['reordered'].fillna(0)

In [13]:
# Number of missing values remaining in each column.
df.isnull().sum()

user_id                                  0
product_id                               0
user_product_reorder_percentage          0
total_product_orders_by_user             0
total_product_reorders_by_user           0
avg_add_to_cart_by_user                  0
avg_days_since_last_bought          552218
last_ordered_in                          0
order_-3                           5343210
order_-2                           7756823
order_-1                           8287521
order_number                             0
order_dow                                0
order_hour_of_day                        0
days_since_prior_order                   0
reordered                                0
dtype: int64

# product, user features 병합

In [14]:
# Left joins: every existing row of df is kept, and the product-level and
# user-level feature columns are appended by key.
# NOTE(review): assumes product_id / user_id are unique in the feature tables;
# duplicates there would multiply rows. Compare the printed shape to confirm.
df = df.merge(product_features, on = 'product_id', how = 'left')
df = df.merge(user_features, on = 'user_id', how = 'left')
print(df.shape)
df.head()

(8474661, 69)


Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,...,reorder_propotion_by_user,total_reorders_by_user,average_order_size,reorder_in_order,prod_order_3,prod_order_2,prod_order_1,re_in_order_3,re_in_order_2,re_in_order_1
0,1.0,196,0.9,10.0,9.0,1.4,19.555555,10.0,1.0,1.0,...,0.694915,41.0,5.9,0.705833,6,6,9,0.666667,1.0,0.666667
1,1.0,10258,0.888889,9.0,8.0,3.333333,19.555555,10.0,1.0,1.0,...,0.694915,41.0,5.9,0.705833,6,6,9,0.666667,1.0,0.666667
2,1.0,10326,0.0,1.0,0.0,5.0,28.0,5.0,,,...,0.694915,41.0,5.9,0.705833,6,6,9,0.666667,1.0,0.666667
3,1.0,12427,0.9,10.0,9.0,3.3,19.555555,10.0,1.0,1.0,...,0.694915,41.0,5.9,0.705833,6,6,9,0.666667,1.0,0.666667
4,1.0,13032,0.666667,3.0,2.0,6.333333,21.666666,10.0,1.0,,...,0.694915,41.0,5.9,0.705833,6,6,9,0.666667,1.0,0.666667


제품별 마지막 주문과 총 주문 차이 피쳐 추가.

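마지막 주문(`order_number`)과 제품의 마지막 주문(`last_ordered_in`)의 차이가 작을수록 해당 제품을 최근에 구매했다는 뜻이다. `last_ordered_in`이 이전(prior) 주문 기준이라면 차이의 최솟값은 0이 아니라 1이며 (예: user 1의 제품 196은 `last_ordered_in` = 10, `order_number` = 11 → 차이 1), 차이가 1이면 직전 주문에서 해당 제품을 구매했다는 뜻이다.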

차이가 크다면 해당 제품을 마지막으로 구매하고 그 차이만큼 주문할 동안 해당 제품을 구매하지 않았다는 뜻이다.

In [15]:
# Gap between the user's train order number and the order number stored in
# 'last_ordered_in' for this product.
# NOTE(review): presumably 'last_ordered_in' is the last prior order containing the product; confirm.
df['order_diff'] = df.order_number - df.last_ordered_in

### 용량 줄이기

4.5GB -> 1.3GB

In [16]:
def _holds_only_whole_numbers(series):
    """Return True when every non-missing value in ``series`` is integral."""
    values = series.to_numpy()
    present = values[~np.isnan(values)]
    return bool(np.all(np.mod(present, 1) == 0))


def reduce_memory(df):
    """
    Reduce a DataFrame's memory usage by downcasting its numeric columns.

    Columns of dtype int16/int32/int64 are cast to the narrowest of
    int8/int16/int32 whose range contains the column's min and max (bounds
    inclusive). Columns of dtype float32/float64 are cast to float16 when the
    values fit, otherwise to float32 when they fit. Other dtypes, and columns
    whose min/max is NaN (empty or all-missing), are left unchanged.

    Float downcasting is lossy for fractional values (float16 keeps roughly
    three significant decimal digits). Whole-number float columns, such as
    counts or ids that became float because of NaN, are protected: a target
    float type is skipped when the column's largest magnitude exceeds the
    biggest integer range that type represents exactly (2**11 for float16,
    2**24 for float32), so such values are never silently rounded.

    Memory usage before and after is printed in MB.

    Parameters: Dataframe (its columns are replaced in place)
    Return: the same Dataframe object
    """

    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in df.columns:
        col_dtype = df[col].dtypes

        if col_dtype in ["int64", "int32", "int16"]:

            cmin = df[col].min()
            cmax = df[col].max()

            # Try the narrowest type first; inclusive bounds so that columns
            # touching a type's min/max (e.g. 127 or -128) still qualify.
            for target in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(target)
                if cmin >= bounds.min and cmax <= bounds.max:
                    df[col] = df[col].astype(target)
                    break

        elif col_dtype in ["float64", "float32"]:

            cmin = df[col].min()
            cmax = df[col].max()
            largest = max(abs(cmin), abs(cmax))
            whole = None  # computed lazily; only needed for large magnitudes

            for target in (np.float16, np.float32):
                bounds = np.finfo(target)
                if not (cmin >= bounds.min and cmax <= bounds.max):
                    continue

                # A float with `nmant` stored mantissa bits represents every
                # integer up to 2**(nmant + 1) exactly; beyond that, integers
                # get rounded, which would corrupt count/id columns.
                if largest > 2 ** (bounds.nmant + 1):
                    if whole is None:
                        whole = _holds_only_whole_numbers(df[col])
                    if whole:
                        continue

                df[col] = df[col].astype(target)
                break

    print("")
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")

    return df

# Downcast the numeric columns of the final table and rebind df to the returned frame.
df = reduce_memory(df)

Memory usage of properties dataframe is : 4493.628993988037  MB

___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  1293.1306457519531  MB
This is  28.776978417266186 % of the initial size


# Saving data

In [17]:
# Persist the final table as a pickle in the same directory as the inputs.
df.to_pickle(root + 'Finaldata.pkl')

In [18]:
# Sanity check: reload the pickle that was just written and preview it.
df2 = pd.read_pickle(root +'Finaldata.pkl')
df2.head()

Unnamed: 0,user_id,product_id,user_product_reorder_percentage,total_product_orders_by_user,total_product_reorders_by_user,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,order_-3,order_-2,...,total_reorders_by_user,average_order_size,reorder_in_order,prod_order_3,prod_order_2,prod_order_1,re_in_order_3,re_in_order_2,re_in_order_1,order_diff
0,1.0,196,0.899902,10.0,9.0,1.400391,19.5625,10.0,1.0,1.0,...,41.0,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504,1.0
1,1.0,10258,0.888672,9.0,8.0,3.333984,19.5625,10.0,1.0,1.0,...,41.0,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504,1.0
2,1.0,10326,0.0,1.0,0.0,5.0,28.0,5.0,,,...,41.0,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504,6.0
3,1.0,12427,0.899902,10.0,9.0,3.300781,19.5625,10.0,1.0,1.0,...,41.0,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504,1.0
4,1.0,13032,0.666504,3.0,2.0,6.332031,21.671875,10.0,1.0,,...,41.0,5.898438,0.706055,6,6,9,0.666504,1.0,0.666504,1.0
