In [2]:
import os
import random
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from PIL import Image
from tqdm import tqdm 


In [3]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    
seed = 42
seed_everything(42)

In [4]:
def dump_pickle(data, path):
    with open(path, "wb") as file:
        pickle.dump(data, file)

def load_pickle(path):
    with open(path, "rb") as file:
        data = pickle.load(file)
    return data

def save_pt(data, path):
    with open(path, "wb") as file:
        torch.save(data, file)


#### 데이터 load 및 shape 확인

In [5]:
item_data = pd.read_csv("./data/articles.csv")
interaction_data = pd.read_csv("./data/transactions_train.csv")

In [6]:
print("shape of Item data : ",item_data.shape)
print("shape of interaction data : ", interaction_data.shape)

shape of Item data :  (105542, 25)
shape of interaction data :  (31788324, 5)


#### 이미지가 없는 아이템 찾기

In [7]:
def img_by_id(df, article_id:int, no_list:list, echo:int=1, img_show:bool=True):
    if article_id in no_list:
        return
    if echo:
        display(df[df.article_id == article_id])

    img_id = "0"+str(article_id)
    img = Image.open("./data/images/"+img_id[0:3]+"/"+img_id+".jpg")

    if img_show:
        img.show()

def find_no_img_item(df):
    no_img = []

    for item in tqdm(df.iterrows(), total=len(df)):
        try:
            img_by_id(df, item[1][0], no_list=no_img, echo=0, img_show=False)
        except FileNotFoundError:
            no_img.append(item[0])

    return no_img

In [8]:
no_img_idx = find_no_img_item(item_data)

  img_by_id(df, item[1][0], no_list=no_img, echo=0, img_show=False)
100%|██████████| 105542/105542 [00:22<00:00, 4722.48it/s]


In [9]:
print("# of non-img item : ",len(no_img_idx))

# of non-img item :  442


In [10]:
# 이미지가 없는 아이템 삭제
no_img_item = {idx:item_data.iloc[idx].article_id for idx in no_img_idx}
n_item_data = item_data.drop(no_img_idx, axis=0).reset_index(drop=True) # 이미지 없는 아이템을 삭제한 데이터
print("shape of n_item_data : ", n_item_data.shape)

shape of n_item_data :  (105100, 25)


#### 상품 카테고리(product_type_no)에 따른 필터링

In [11]:
# product_type에 속하는 상품이 10개 미만인 경우 삭제 131 -> 94로 줄어듦, 아이템은 약 130개 사라짐
n_item_data = n_item_data.groupby('product_type_no').filter(lambda x: len(x) >= 10).reset_index(drop=True)
print("shape of n_item_data : ", n_item_data.shape)

shape of n_item_data :  (104973, 25)


In [12]:
# product_type 중 불필요한 것 삭제 -> 84로 줄어듧, 아이템 400개 정도 사라짐
rm_list = ["Umbrella", "Bracelet", "Giftbox", "Waterbottle", 
           "Nipple covers", 'Chem. cosmetics', "Fine cosmetics", "Soft Toys",
           "Bra extender", "Cushion", "Side table", "Dog Wear", "Keychain",
           "Sewing kit", "Towel", "Mobile case", "Zipper head",
           "Wireless earphone case", "Stain remover spray",
           "Clothing mist", "Hair ties"]
n_item_data = n_item_data[~n_item_data['product_type_name'].isin(rm_list)].reset_index(drop=True)
print("shape of n_item_data : ", n_item_data.shape)
print("# of product_type : ", n_item_data.product_type_name.nunique())


shape of n_item_data :  (104572, 25)
# of product_type :  84


In [13]:
# interacion data에서 앞선 과정에서 삭제된 데이터 제거
n_interaction_data = interaction_data[interaction_data['article_id'].isin(n_item_data['article_id'])].reset_index(drop=True)
print("shape of interaction data : ", interaction_data.shape)
print("shape of n_interaction_data : ", n_interaction_data.shape)

shape of interaction data :  (31788324, 5)
shape of n_interaction_data :  (31550289, 5)


In [14]:
n_item_data = n_item_data[["article_id","product_type_no"]]

#### 아이템 및 유저의 상호작용 수에 따른 데이터 처리

In [15]:
# 아이템/유저 등장 빈도에 따른 데이터 구성
# 유저: 상호작용이 threshold 이하인 경우 삭제, 아이템 : 상호작용이 20 이하인 경우 삭제
# 반복적으로 실행하여, 모든 유저, 아이템이 조건을 만족하도록 함

def data_cutter(origin_data, threshold=20):
    while True:
        new_data = origin_data.groupby('customer_id').filter(lambda x: len(x) > threshold).reset_index(drop=True)
        new_data = new_data.groupby('article_id').filter(lambda x: len(x) > 20).reset_index(drop=True)
        
        if new_data.equals(origin_data):
            print("finish")
            break
        origin_data = new_data
        print("cut again")

    print("shape of n_interaction_data : ", new_data.shape)
    print("num of user : ", new_data.customer_id.nunique())
    print("num of item : ", new_data.article_id.nunique())
    print("data density : ", new_data.shape[0]/(new_data.customer_id.nunique()*new_data.article_id.nunique())*100, "%")
    
    return new_data

In [16]:
# n_interaction_data_10 = data_cutter(n_interaction_data, 10)
# n_interaction_data_20 = data_cutter(n_interaction_data, 20)
n_interaction_data_30 = data_cutter(n_interaction_data, 40) # 아이템: 20이하 삭제, 유저: 40이하 삭제

cut again
cut again
cut again
cut again
cut again
finish
shape of n_interaction_data :  (19082117, 5)
num of user :  218878
num of item :  62675
data density :  0.1391009512767936 %


In [17]:
n_interaction_data_30

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
1,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687001,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221001,0.020322,2
...,...,...,...,...,...
19082112,2020-09-22,ffd4cf2217de4a0a3f9f610cdec334c803692a18af08ac...,856440002,0.042356,2
19082113,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
19082114,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
19082115,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1


In [18]:
new_itme_data = n_item_data[n_item_data['article_id'].isin(n_interaction_data_30['article_id'])].reset_index(drop=True)

In [19]:
new_itme_data

Unnamed: 0,article_id,product_type_no
0,108775015,253
1,108775044,253
2,108775051,253
3,110065001,306
4,110065002,306
...,...,...
62670,949198001,272
62671,949551001,252
62672,949551002,252
62673,953763001,253


In [20]:
new_itme_data.shape

(62675, 2)

#### 아이템 이미지 임베딩 생성 with fashion clip

In [21]:
from fashion_clip.fashion_clip import FashionCLIP
fclip = FashionCLIP('fashion-clip')

images = ["./data/images/" + "0" + str(k)[0:2] + "/" + "0"+str(k) + ".jpg" for k in new_itme_data["article_id"].tolist()]
image_embeddings = fclip.encode_images(images, batch_size=64)

  from .autonotebook import tqdm as notebook_tqdm
model.safetensors: 100%|██████████| 605M/605M [00:12<00:00, 46.9MB/s] 
980it [44:50,  2.75s/it]                         


In [22]:
#이미지 임베딩을 pkl 파일로 저장하고 싶은 경우 실행
dump_pickle(image_embeddings, "./data/img_emb_small.pkl")

In [23]:
image_embeddings = load_pickle("./data/img_emb_small.pkl")

#### id mapping

In [24]:
user2idx = {v:k for k,v in enumerate(n_interaction_data_30['customer_id'].unique())}
item2idx = {v:k for k,v in enumerate(new_itme_data['article_id'].unique())}

print("# of user", len(user2idx))
print("# of item", len(item2idx))


# of user 218878
# of item 62675


In [25]:
n_interaction_data_30["customer_id"] = n_interaction_data_30["customer_id"].map(user2idx)
n_interaction_data_30["article_id"] = n_interaction_data_30["article_id"].map(item2idx)
new_itme_data["article_id"] = new_itme_data["article_id"].map(item2idx)

#### train/test split

In [26]:
test_data = n_interaction_data_30.groupby("customer_id").nth(-1)
train_data = n_interaction_data_30[~n_interaction_data_30.index.isin(test_data.index)]
test_data = test_data.reset_index(drop=True).drop(["t_dat", "price","sales_channel_id"], axis=1)
train_data = train_data.reset_index(drop=True).drop(["t_dat", "price","sales_channel_id"], axis=1)

print("shape of test_data : ", test_data.shape)
print("shape of train_data : ", train_data.shape)


shape of test_data :  (218878, 2)
shape of train_data :  (18863239, 2)


#### data 확인

In [27]:
train_data.head()

Unnamed: 0,customer_id,article_id
0,0,3157
1,0,23845
2,0,23846
3,0,23843
4,0,3156


In [28]:
test_data.head()

Unnamed: 0,customer_id,article_id
0,7282,5657
1,8790,1935
2,32306,12615
3,7320,13320
4,41875,12769


In [29]:
# item 등장 빈도에 대한 목록 생성 {article_id : cnt}, trian_data에 있는 것만 반영
# item_occur_cnt = train_data.groupby("article_id").count().reset_index()
# item_occur_cnt.rename(columns={"customer_id":"cnt"}, inplace=True)

In [30]:
# # product_type마다 속하는 item 목록 생성 {product_type_no : [[items], [cnts]]}
# items_by_prod_type_cnt = dict(list(n_item_data.groupby("product_type_no")))
# for k in items_by_prod_type_cnt.keys():
#     items_by_prod_type_cnt[k].rename(columns={"product_type_no":"cnt"}, inplace=True)
#     items_by_prod_type_cnt[k] = items_by_prod_type_cnt[k].reset_index(drop=True)
#     items_by_prod_type_cnt[k].cnt = 0

#     for idx, row in items_by_prod_type_cnt[k].iterrows():
#         if row.article_id in item_occur_cnt.article_id.values:
#             items_by_prod_type_cnt[k].at[idx, 'cnt'] = item_occur_cnt.loc[item_occur_cnt['article_id'] == row.article_id, 'cnt'].values[0]

# dump_pickle(items_by_prod_type_cnt, "items_by_prod_type_cnt.pkl")

In [45]:
# product_type마다 속하는 item 목록 생성 {product_type_no : [items]}
items_by_prod_type = dict(list(new_itme_data.groupby("product_type_no")))
for k in items_by_prod_type.keys():
    items_by_prod_type[k] = items_by_prod_type[k].article_id.reset_index(drop=True).tolist()

dump_pickle(items_by_prod_type, "items_by_prod_type_small.pkl")

In [32]:
# user의 pos item 목록 생성 {user_id : [items]}
pos_items_each_user = dict(list(train_data.groupby("customer_id")))
for k in pos_items_each_user.keys():
    pos_items_each_user[k] = pos_items_each_user[k].article_id.reset_index(drop=True)

dump_pickle(pos_items_each_user, "pos_items_each_user_small.pkl")

### DATASET 생성

In [6]:
class HMTrainDataset(Dataset):
    def __init__(self, df, item_df, items_by_prod_type, pos_items_each_user) -> None:
        super().__init__()
        self.df = df
        self.item_df = item_df
        self.items_by_prod_type = items_by_prod_type
        self.pos_items_each_user = pos_items_each_user
        self.df['neg'] = np.zeros(len(self.df), dtype=int)
        self._make_triples_data()
    
    def __getitem__(self, index):
        user = self.df.iloc[index]['customer_id']
        pos = self.df.iloc[index]['article_id']
        neg = self.df.iloc[index]['neg']
        return user, pos, neg
    
    def _neg_sampling(self, pos_list, prod_type_no):
        # 같은 prod_type_no 내에서 neg sampling
        neg = random.choice(self.items_by_prod_type[prod_type_no]) 
        while neg in pos_list:
            neg = random.choice(self.items_by_prod_type[prod_type_no]) 
        return neg

    def _make_triples_data(self):
        for user_id, rows in tqdm(self.df.groupby("customer_id")):
            pos_list = self.pos_items_each_user[user_id]
            for idx, row in rows.iterrows():
                item_id = row.article_id
                prod_type_no = self.item_df[self.item_df["article_id"] == item_id].product_type_no.item()
                self.df.at[idx, 'neg'] = self._neg_sampling(pos_list, prod_type_no)
    
    def __len__(self):
        return len(self.df)

In [7]:
class HMTestDataset(Dataset):
    def __init__(self, df) -> None:
        super().__init__()
        self.df = df
        
    def __getitem__(self, index):
        user = self.df.iloc[index]['customer_id']
        pos = self.df.iloc[index]['article_id']
        return user, pos

    def __len__(self):
        return len(self.df)

In [48]:
train_dataset = HMTrainDataset(train_data, new_itme_data, items_by_prod_type, pos_items_each_user)
save_pt(train_dataset, "./dataset/train_dataset_small.pt")

100%|██████████| 218878/218878 [1:43:33<00:00, 35.23it/s]   


In [49]:
test_dataset = HMTestDataset(test_data)
save_pt(test_dataset, "./dataset/test_dataset_small.pt")

#### candidate item set 

In [50]:
all_items = np.arange(len(item2idx))
sample_size = 500
candidate_items_each_user = {}

In [51]:
for user, target in tqdm(test_dataset):
    candidate_items = torch.tensor(np.append(np.random.choice(np.setdiff1d(all_items, pos_items_each_user[user]), sample_size), target))
    candidate_items_each_user[user] = candidate_items

dump_pickle(candidate_items_each_user, "./data/candidate_items_each_user_small.pkl")

100%|██████████| 218878/218878 [02:21<00:00, 1541.40it/s]


#### 실행 시간 비교

In [None]:
import timeit
import numpy as np
import random

def function_1(pos_list, n_item):
    neg = np.random.randint(0, n_item, 1) 
    while neg in pos_list:
        neg = np.random.randint(0, n_item, 1) 
    return neg

def function_2(pos_list, n_item):
    neg = random.sample(range(0,n_item), 1) 
    while neg in pos_list:
        neg = random.sample(range(0,n_item), 1) 
    return neg

def function_3(pos_list, n_item):
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

def function_4(pos_list, n_item):
    neg = np.random.choice(np.arange(n_item), 1)
    while neg in pos_list:
        neg =  np.random.choice(np.arange(n_item), 1)
    return neg

n_item = len(item2idx)
pos = random.sample(range(1,n_item), 40)
n = 115500

time_1 = timeit.timeit('function_1(pos, n_item)', globals=globals(), number=n)
time_2 = timeit.timeit('function_2(pos, n_item)', globals=globals(), number=n)
time_3 = timeit.timeit('function_3(pos, n_item)', globals=globals(), number=n)
time_4 = timeit.timeit('function_4(pos, n_item)', globals=globals(), number=n)

print(f'np.random.randint 실행 시간: {time_1} 초')
print(f'random.sample 실행 시간: {time_2} 초')
print(f'random.choice 실행 시간: {time_3} 초')
print(f'np.random.choice 실행 시간: {time_4} 초')



np.random.randint 실행 시간: 6.5607867789804 초
random.sample 실행 시간: 0.37414733300101943 초
random.choice 실행 시간: 0.17139070699340664 초
np.random.choice 실행 시간: 13.791083243995672 초


In [None]:
def function_1(pos_list, n_item):
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

def function_2(pos_list, n_item):
    pos_list = {k:1 for k in pos_list}
    neg = random.choice(range(0,n_item)) 
    while neg in pos_list:
        neg = random.choice(range(0,n_item)) 
    return neg

n_item = len(item2idx)
pos = random.sample(range(1,n_item), 40)
n = 315500

time_1 = timeit.timeit('function_1(pos, n_item)', globals=globals(), number=n)
time_2 = timeit.timeit('function_2(pos, n_item)', globals=globals(), number=n)

print(f'list 실행 시간: {time_1} 초')
print(f'dict 실행 시간: {time_2} 초')


In [None]:
import numpy as np
import timeit
import random
import torch

def function_1(pos_list, all_item):
    candicate_items = torch.tensor(list(all_item - set(pos_list)))

def function_2(pos_list, all_item):
    candicate_items = torch.tensor(np.setdiff1d(all_item, pos_list))
   
def function_3():
    items_set = set(range(1,104573))
    
def function_4():
    items_np = np.arange(104573, dtype=np.int32)
    

items_set = set(range(1,104573))
items_np = np.arange(104573, dtype=np.int32)
pos = random.sample(range(1,104573), 40)
n1 = 74570
n2 = 4000

time_1 = timeit.timeit('function_1(pos, items_set)', globals=globals(), number=n1)
print(f'list(set) 실행 시간: {time_1} 초')

time_2 = timeit.timeit('function_2(pos, items_np)', globals=globals(), number=n1)
print(f'np.setdiff1d 실행 시간: {time_2} 초')

time_3 = timeit.timeit('function_3()', globals=globals(), number=n2)
print(f'set 실행 시간: {time_3} 초')

time_4 = timeit.timeit('function_4()', globals=globals(), number=n2)
print(f'np.arange 실행 시간: {time_4} 초')


list(set) 실행 시간: 508.1635248339999 초
np.setdiff1d 실행 시간: 69.18620454200027 초
set 실행 시간: 5.8264562089998435 초
np.arange 실행 시간: 0.028278708000470942 초


In [None]:
import numpy as np
import timeit
import random
import torch

def function_1(pos_list, all_item):
    candicate_items = torch.tensor(np.random.choice(np.setdiff1d(all_item, pos_list), 100))


def function_2(pos_list, all_item):
    candicate_items = torch.tensor(random.sample(list(np.setdiff1d(all_item, pos_list)), 100))
    

# items_set = set(range(1,104573))
items_np = np.arange(104573, dtype=np.int32)
pos = random.sample(range(1,104573), 40)
n1 = 24570

time_1 = timeit.timeit('function_1(pos, items_np)', globals=globals(), number=n1)
print(f'np.random.choice 실행 시간: {time_1} 초')

time_2 = timeit.timeit('function_2(pos, items_np)', globals=globals(), number=n1)
print(f'random.sample + list 실행 시간: {time_2} 초')


np.random.choice 실행 시간: 41.994027569977334 초
random.sample + list 실행 시간: 174.8324626859976 초
