In [3]:
import os
import 
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from PIL import Image
from tqdm import tqdm 


In [4]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    
seed = 42
seed_everything(42)

In [5]:
def dump_pickle(data, path):
    with open(path, "wb") as file:
        pickle.dump(data, file)

def load_pickle(path):
    with open(path, "rb") as file:
        data = pickle.load(file)
    return data

def save_pt(data, path):
    with open(path, "wb") as file:
        torch.save(data, file)


#### 데이터 load 및 shape 확인

In [44]:
item_data = pd.read_csv("./data/articles.csv")
interaction_data = pd.read_csv("./data/transactions_train.csv")

In [45]:
print("shape of Item data : ",item_data.shape)
print("shape of interaction data : ", interaction_data.shape)

shape of Item data :  (105542, 25)
shape of interaction data :  (31788324, 5)


#### 이미지가 없는 아이템 찾기

In [46]:
def img_by_id(df, article_id:int, no_list:list, echo:int=1, img_show:bool=True):
    if article_id in no_list:
        return
    if echo:
        display(df[df.article_id == article_id])

    img_id = "0"+str(article_id)
    img = Image.open("./data/images/"+img_id[0:3]+"/"+img_id+".jpg")

    if img_show:
        img.show()

def find_no_img_item(df):
    no_img = []

    for item in tqdm(df.iterrows(), total=len(df)):
        try:
            img_by_id(df, item[1][0], no_list=no_img, echo=0, img_show=False)
        except FileNotFoundError:
            no_img.append(item[0])

    return no_img

In [47]:
no_img_idx = find_no_img_item(item_data)


100%|██████████| 105542/105542 [00:24<00:00, 4312.82it/s]


In [48]:
print("# of non-img item : ",len(no_img_idx))

# of non-img item :  442


In [49]:
# 이미지가 없는 아이템 삭제
no_img_item = {idx:item_data.iloc[idx].article_id for idx in no_img_idx}
n_item_data = item_data.drop(no_img_idx, axis=0).reset_index(drop=True) # 이미지 없는 아이템을 삭제한 데이터
print("shape of n_item_data : ", n_item_data.shape)

shape of n_item_data :  (105100, 25)


In [50]:
# product_type에 속하는 상품이 10개 미만인 경우 삭제 131 -> 94로 줄어듦, 아이템은 약 130개 사라짐
n_item_data = n_item_data.groupby('product_type_no').filter(lambda x: len(x) >= 10).reset_index(drop=True)
print("shape of n_item_data : ", n_item_data.shape)

shape of n_item_data :  (104973, 25)


In [51]:
# product_type 중 불필요한 것 삭제 -> 84로 줄어듧, 아이템 400개 정도 사라짐
rm_list = ["Umbrella", "Bracelet", "Giftbox", "Waterbottle", 
           "Nipple covers", 'Chem. cosmetics', "Fine cosmetics", "Soft Toys",
           "Bra extender", "Cushion", "Side table", "Dog Wear", "Keychain",
           "Sewing kit", "Towel", "Mobile case", "Zipper head",
           "Wireless earphone case", "Stain remover spray",
           "Clothing mist", "Hair ties"]
n_item_data = n_item_data[~n_item_data['product_type_name'].isin(rm_list)].reset_index(drop=True)
print("shape of n_item_data : ", n_item_data.shape)
print("# of product_type : ", n_item_data.product_type_name.nunique())


shape of n_item_data :  (104572, 25)
# of product_type :  84


In [52]:
# interacion data에서 앞선 과정에서 삭제된 데이터 제거
n_interaction_data = interaction_data[interaction_data['article_id'].isin(n_item_data['article_id'])].reset_index(drop=True)
print("shape of interaction data : ", interaction_data.shape)
print("shape of n_interaction_data : ", n_interaction_data.shape)

shape of interaction data :  (31788324, 5)
shape of n_interaction_data :  (31550289, 5)


In [53]:
n_item_data = n_item_data[["article_id","product_type_no"]]

### 작은 데이터 생성 (기존 데이터의 20%)

In [54]:
from sklearn.model_selection import train_test_split

small_interaction_data, _ = train_test_split(n_interaction_data, train_size=0.2, random_state=seed)

In [55]:
small_interaction_data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
21533031,2020-01-26,bc9df9b087d9f54aba50fe2a0ab838cb0a994ee67ac72c...,810838009,0.042356,2
28284508,2020-07-02,95de2d7aef6bd87b885f49144f948c1d5efbaa1106e118...,759479001,0.006763,2
20236749,2019-12-20,adad94ea244db54bd23c873a1d63776d5f554c1ef43633...,564311019,0.025407,1
1509783,2018-10-22,c57de756989c61f9aec47c3a4a832bbc25dff314e4e2c1...,675443003,0.011525,1
23521929,2020-03-25,14a687723e8ca90ce78f95049eeea9a8ce8ba8c521954a...,821397001,0.025407,2
...,...,...,...,...,...
21081788,2020-01-13,568ff20446d7e630ec7488a99f29eab971749a04dc161e...,781806005,0.015237,2
26858567,2020-06-10,8df8a7008288a8354e7f2eb261d83b8f812bc5b6293a4a...,856310003,0.025407,2
23327850,2020-03-18,39b7ca4eb886a72f80a34e5bea0fff9bb18a49734ddb5e...,858453001,0.041508,2
16094478,2019-09-04,2435ac495a1979f4ff35ffa1851dd9d8de597c44915824...,708311020,0.044051,1


In [56]:
# 이력이 7초과인 유저만 남김
small_interaction_data = small_interaction_data.groupby('customer_id').filter(lambda x: len(x) > 7).reset_index(drop=True)
print("shape of n_interaction_data : ", small_interaction_data.shape)

shape of n_interaction_data :  (4209754, 5)


In [57]:
user2idx = {v:k for k,v in enumerate(small_interaction_data['customer_id'].unique())}
item2idx = {v:k for k,v in enumerate(n_item_data['article_id'].unique())}

print("# of user", len(user2idx))
print("# of item", len(item2idx))

# of user 248113
# of item 104572


In [58]:
# train/test split
test_data = small_interaction_data.groupby("customer_id").nth(-1)
train_data = small_interaction_data[~small_interaction_data.index.isin(test_data.index)]
test_data = test_data.reset_index(drop=True).drop(["t_dat", "price","sales_channel_id"], axis=1)
train_data = train_data.reset_index(drop=True).drop(["t_dat", "price","sales_channel_id"], axis=1)

print("shape of test_data : ", test_data.shape)
print("shape of train_data : ", train_data.shape)


shape of test_data :  (248113, 2)
shape of train_data :  (3961641, 2)


In [59]:
train_data["customer_id"] = train_data["customer_id"].map(user2idx)
test_data["customer_id"] = test_data["customer_id"].map(user2idx)
train_data["article_id"] = train_data["article_id"].map(item2idx)
test_data["article_id"] = test_data["article_id"].map(item2idx)
n_item_data["article_id"] = n_item_data["article_id"].map(item2idx)

In [60]:
train_data.head()

Unnamed: 0,customer_id,article_id
0,0,69390
1,1,98120
2,2,42313
3,3,45500
4,4,85871


In [61]:
test_data[test_data.customer_id==0]

Unnamed: 0,customer_id,article_id
51671,0,44272


In [62]:
n_item_data.head()

Unnamed: 0,article_id,product_type_no
0,0,253
1,1,253
2,2,253
3,3,306
4,4,306


In [63]:
# product_type마다 속하는 item 목록 생성 {product_type_no : [items]}
items_by_prod_type = dict(list(n_item_data.groupby("product_type_no")))
for k in items_by_prod_type.keys():
    items_by_prod_type[k] = items_by_prod_type[k].article_id.reset_index(drop=True)

dump_pickle(items_by_prod_type, "items_by_prod_type.pkl")

In [64]:
# user의 pos item 목록 생성 {user_id : [items]}
pos_items_each_user = dict(list(train_data.groupby("customer_id")))
for k in pos_items_each_user.keys():
    pos_items_each_user[k] = pos_items_each_user[k].article_id.reset_index(drop=True)

dump_pickle(pos_items_each_user, "pos_items_each_user_small.pkl")

### DATASET 생성

In [65]:
class HMTrainDataset(Dataset):
    def __init__(self, df, item_df, items_by_prod_type, pos_items_each_user) -> None:
        super().__init__()
        self.df = df
        self.item_df = item_df
        self.items_by_prod_type = items_by_prod_type
        self.pos_items_each_user = pos_items_each_user
        self.df['neg'] = np.zeros(len(self.df), dtype=int)
        self._make_triples_data()
    
    def __getitem__(self, index):
        user = self.df.customer_id[index]
        pos = self.df.article_id[index]
        neg = self.df.neg[index]
        return user, pos, neg
            
    def _neg_sampling(self, pos_list, prod_type_no):
        # 같은 prod_type_no 내에서 neg sampling
        neg = random.choice(self.items_by_prod_type[prod_type_no]) 
        while neg in pos_list:
            neg = random.choice(self.items_by_prod_type[prod_type_no])
        return neg

    def _make_triples_data(self):
        for user_id, rows in tqdm(self.df.groupby("customer_id")):
            # pos_list = {k:1 for k in pos_items_each_user[user_id]}
            pos_list = self.pos_items_each_user[user_id]
            for idx, row in rows.iterrows():
                item_id = row.article_id
                prod_type_no = self.item_df[self.item_df["article_id"] == item_id].product_type_no.item()
                self.df.at[idx, 'neg'] = self._neg_sampling(pos_list, prod_type_no)
    
    def __len__(self):
        return len(self.df)
    

In [66]:
class HMTestDataset(Dataset):
    def __init__(self, df) -> None:
        super().__init__()
        self.df = df
        
    def __getitem__(self, index):
        user = self.df.customer_id[index]
        pos = self.df.article_id[index]
        return user, pos

    def __len__(self):
        return len(self.df)

In [67]:
train_dataset = HMTrainDataset(train_data, n_item_data, items_by_prod_type, pos_items_each_user)
save_pt(train_dataset, "./dataset/train_dataset_v1.pt")

100%|██████████| 248113/248113 [09:15<00:00, 446.70it/s]


In [None]:
test_dataset = HMTestDataset(test_data)
save_pt(test_dataset, "./dataset/test_dataset_v1.pt")

#### 아이템 이미지 임베딩 생성

In [None]:
from fashion_clip.fashion_clip import FashionCLIP
fclip = FashionCLIP('fashion-clip')

images = ["./data/images/" + "0" + str(k)[0:2] + "/" + "0"+str(k) + ".jpg" for k in n_item_data["article_id"].tolist()]
image_embeddings = fclip.encode_images(images, batch_size=64)

In [None]:
# 이미지 임베딩을 pkl 파일로 저장하고 싶은 경우 실행
dump_pickle(image_embeddings, "./data/img_emb.pkl")

#### 실행 시간 비교

In [4]:
import timeit
import numpy as np
import random

def function_1(pos_list, n_item):
    neg = np.random.randint(0, n_item, 1) 
    while neg in pos_list:
        neg = np.random.randint(0, n_item, 1) 
    return neg

def function_2(pos_list, n_item):
    neg = random.sample(range(0,n_item), 1) 
    while neg in pos_list:
        neg = random.sample(range(0,n_item), 1) 
    return neg

def function_3(pos_list, n_item):
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

def function_4(pos_list, n_item):
    neg = np.random.choice(np.arange(n_item), 1)
    while neg in pos_list:
        neg =  np.random.choice(np.arange(n_item), 1)
    return neg

n_item = len(item2idx)
pos = random.sample(range(1,n_item), 40)
n = 115500

time_1 = timeit.timeit('function_1(pos, n_item)', globals=globals(), number=n)
time_2 = timeit.timeit('function_2(pos, n_item)', globals=globals(), number=n)
time_3 = timeit.timeit('function_3(pos, n_item)', globals=globals(), number=n)
time_4 = timeit.timeit('function_4(pos, n_item)', globals=globals(), number=n)

print(f'np.random.randint 실행 시간: {time_1} 초')
print(f'random.sample 실행 시간: {time_2} 초')
print(f'random.choice 실행 시간: {time_3} 초')
print(f'np.random.choice 실행 시간: {time_4} 초')



np.random.randint 실행 시간: 6.5607867789804 초
random.sample 실행 시간: 0.37414733300101943 초
random.choice 실행 시간: 0.17139070699340664 초
np.random.choice 실행 시간: 13.791083243995672 초


In [None]:
def function_1(pos_list, n_item):
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

def function_2(pos_list, n_item):
    pos_list = {k:1 for k in pos_list}
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

n_item = len(item2idx)
pos = random.sample(range(1,n_item), 40)
n = 315500

time_1 = timeit.timeit('function_1(pos, n_item)', globals=globals(), number=n)
time_2 = timeit.timeit('function_2(pos, n_item)', globals=globals(), number=n)

print(f'list 실행 시간: {time_1} 초')
print(f'dict 실행 시간: {time_2} 초')


In [10]:
import numpy as np
import timeit
import random
import torch

def function_1(pos_list, all_item):
    candicate_items = torch.tensor(list(all_item - set(pos_list)))

def function_2(pos_list, all_item):
    candicate_items = torch.tensor(np.setdiff1d(all_item, pos_list))
   
def function_3():
    items_set = set(range(1,104573))
    
def function_4():
    items_np = np.arange(104573, dtype=np.int32)
    

items_set = set(range(1,104573))
items_np = np.arange(104573, dtype=np.int32)
pos = random.sample(range(1,104573), 40)
n1 = 74570
n2 = 4000

time_1 = timeit.timeit('function_1(pos, items_set)', globals=globals(), number=n1)
print(f'list(set) 실행 시간: {time_1} 초')

time_2 = timeit.timeit('function_2(pos, items_np)', globals=globals(), number=n1)
print(f'np.setdiff1d 실행 시간: {time_2} 초')

time_3 = timeit.timeit('function_3()', globals=globals(), number=n2)
print(f'set 실행 시간: {time_3} 초')

time_4 = timeit.timeit('function_4()', globals=globals(), number=n2)
print(f'np.arange 실행 시간: {time_4} 초')


list(set) 실행 시간: 508.1635248339999 초
np.setdiff1d 실행 시간: 69.18620454200027 초
set 실행 시간: 5.8264562089998435 초
np.arange 실행 시간: 0.028278708000470942 초


In [16]:
import numpy as np
import timeit
import random
import torch

def function_1(pos_list, all_item):
    candicate_items = torch.tensor(np.random.choice(np.setdiff1d(all_item, pos_list), 100))


def function_2(pos_list, all_item):
    candicate_items = torch.tensor(random.sample(list(np.setdiff1d(all_item, pos_list)), 100))
    

# items_set = set(range(1,104573))
items_np = np.arange(104573, dtype=np.int32)
pos = random.sample(range(1,104573), 40)
n1 = 24570

time_1 = timeit.timeit('function_1(pos, items_np)', globals=globals(), number=n1)
print(f'np.random.choice 실행 시간: {time_1} 초')

time_2 = timeit.timeit('function_2(pos, items_np)', globals=globals(), number=n1)
print(f'random.sample + list 실행 시간: {time_2} 초')


np.random.choice 실행 시간: 41.994027569977334 초
random.sample + list 실행 시간: 174.8324626859976 초
