In [None]:
!pip install -U fashion-clip
!pip install kaggle

In [1]:
import os

# Kaggle account -> Settings -> "Create New Token" downloads a kaggle.json file.
# Preferred setup: create the ~/.kaggle folder and move kaggle.json into it; the
# kaggle CLI picks the token up from there, so nothing has to be typed here.
#
# SECURITY: never hard-code the username/key from kaggle.json in a notebook that
# is shared or committed. A key that was committed must be considered leaked and
# should be regenerated from the Kaggle account settings page.
import getpass

_kaggle_dir = os.environ.get("KAGGLE_CONFIG_DIR", os.path.join(os.path.expanduser("~"), ".kaggle"))
if not os.path.exists(os.path.join(_kaggle_dir, "kaggle.json")):
    # No token file: fall back to environment variables and prompt (without echo)
    # only for the ones that are not already set in the environment.
    for _name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
        if not os.environ.get(_name):
            os.environ[_name] = getpass.getpass(f"{_name}: ")

!kaggle competitions download -c h-and-m-personalized-fashion-recommendations


h-and-m-personalized-fashion-recommendations.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!mkdir data # 데이터 폴더 생성
!pwd # 현재 경로 확인

In [None]:
# 원하는 경로에 압축 풀고 싶다면 -d 이후에 원하는 경로로 변경
!unzip -q h-and-m-personalized-fashion-recommendations.zip -d ./data 

### data

In [2]:
import pandas as pd
import torch


#### load raw data

In [1]:
import pandas as pd

item_data = pd.read_csv("../data/articles.csv")
interaction_data = pd.read_csv("../data/transactions_train.csv")
# user_data = pd.read_csv("./data/customers.csv")

#### img prepare

In [3]:
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import numpy as np

torch.manual_seed(42)
np.random.seed(42)

def img_by_id(df, article_id:int, no_list:list, echo:int=1, img_show:bool=True):
    '''
    Look up one article by id and optionally display its row and its image.

    Args:
        df: DataFrame with an ``article_id`` column.
        article_id: article id; the image is read from
            ``../data/images/<first 3 chars of "0"+id>/<"0"+id>.jpg``.
        no_list: ids to skip; the function returns immediately (None) when
            ``article_id`` is in this collection.
        echo: when truthy, display the rows of ``df`` matching ``article_id``.
        img_show: when True, open the image in the default viewer.

    Raises:
        FileNotFoundError: propagated from ``Image.open`` when the image file
            does not exist (find_no_img_item relies on this).
    '''
    if article_id in no_list:
        return
    if echo:
        display(df[df.article_id == article_id])

    # article ids lose their leading zero when read as integers; restore it
    img_id = "0"+str(article_id)
    img_path = "../data/images/"+img_id[0:3]+"/"+img_id+".jpg"

    # Image.open is lazy and keeps the file handle open; the context manager
    # releases it deterministically (this function is called once per item).
    with Image.open(img_path) as img:
        if img_show:
            img.show()

def find_no_img_item(df):
    '''
    Find the items that have no image file.

    Args:
        df: item DataFrame whose FIRST column holds the article id.

    Returns:
        list of row labels (index values of ``df``) for which img_by_id raised
        FileNotFoundError.
    '''
    no_img = []

    # Read the id column directly instead of df.iterrows(): iterrows builds a
    # Series per row (slow, and it upcasts ids to float in all-numeric frames),
    # and `row[0]` on that Series is positional access through a label indexer.
    id_column = df.iloc[:, 0]

    for row_label, article_id in tqdm(id_column.items(), total=len(df)):
        try:
            # no_list is left empty on purpose: `no_img` holds row labels, not
            # article ids, so it must not be used as an id skip-list.
            img_by_id(df, article_id, no_list=[], echo=0, img_show=False)
        except FileNotFoundError:
            no_img.append(row_label)

    return no_img

In [4]:
no_img_ids = find_no_img_item(item_data)

  img_by_id(df, item[1][0], no_list=no_img, echo=0, img_show=False)
100%|██████████| 105542/105542 [01:20<00:00, 1307.05it/s]


In [5]:
len(no_img_ids)

442

In [None]:
# Remove the items that have no image from both the purchase log and the item table.
# `no_img_ids` holds row labels of `item_data` (see find_no_img_item), so the
# article ids are looked up by label, consistently with the drop() below.
no_img_article_id = item_data.loc[no_img_ids, "article_id"].tolist()
n_item_data = item_data.drop(no_img_ids, axis=0).reset_index(drop=True)
n_interaction_data = interaction_data[~interaction_data["article_id"].isin(no_img_article_id)].reset_index(drop=True)

# Keep only the customers with more than 3 purchases so a train/test split is possible.
# transform('size') computes the per-customer row count in one vectorised pass,
# instead of calling a Python lambda once per customer via groupby().filter().
purchase_count = n_interaction_data.groupby('customer_id')['customer_id'].transform('size')
n_interaction_data = n_interaction_data[purchase_count > 3].reset_index(drop=True)

# Map raw (user/item) ids to contiguous integer indices.
user2idx = {v:k for k,v in enumerate(n_interaction_data['customer_id'].unique())}
item2idx = {v:k for k,v in enumerate(n_item_data['article_id'].unique())}

In [None]:
len(item2idx) # 전체 아이템 개수

In [None]:
# from torchvision.models import alexnet, AlexNet_Weights, resnet18, ResNet18_Weights, vgg16, VGG16_Weights
from fashion_clip.fashion_clip import FashionCLIP


# # load pretrained alexnet
# model_alex = alexnet(weights=AlexNet_Weights.IMAGENET1K_V1)
# model_res = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
# model_vgg = vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

# # del last clf layer
# model_alex.classifier = model_alex.classifier[:-3]
# model_res.fc = nn.Identity()
# model_vgg.classifier = model_vgg.classifier[:-3]

fclip = FashionCLIP('fashion-clip')

# images = ["./data/images/" + "0" + str(k)[0:2] + "/" + "0"+str(k) + ".jpg" for k in n_item_data["article_id"].tolist()]
# 패션 clip을 통한 이미지 임베딩 얻기
# image_embeddings = fclip.encode_images(images, batch_size=32) 
# img_emb = torch.tensor(image_embeddings)



# feat_map_res = make_feature_map(model_res,n_item_data, book2idx)
# feat_map_alex = make_feature_map(model_alex, n_book_data, book2idx)
# feat_map_vgg = make_feature_map(model_vgg,n_book_data, book2idx)

In [None]:
# 만들어둔 임베딩 csv로 저장해두기
# pd.DataFrame(images).to_csv("img_list.csv", index=False)
# pd.DataFrame(image_embeddings).to_csv("img_emb.csv", index=False)

In [None]:
# 저장된 임베딩 csv를 사용하는 경우
# img_list = pd.read_csv("img_list.csv")
img_emb = pd.read_csv("./data/img_emb.csv")
img_emb = torch.tensor(img_emb.values)

In [None]:
print(img_emb.shape)

In [None]:
# img 16="0118458003", 17="0118458004"
# 임베딩 잘 되었는지 확인하기
res = nn.functional.cosine_similarity(img_emb[16], img_emb[17], dim=0)
res

#### make custom dataset

In [None]:
import torch
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(42)
np.random.seed(42)

class HMDataset(Dataset):
    """
    Interaction dataset yielding (user, pos) pairs or, for training,
    (user, pos, neg) triples of integer indices.

    Args:
        df: interaction DataFrame with raw ``customer_id`` and ``article_id`` columns.
        user2idx: mapping raw customer id -> user index.
        item2idx: mapping raw article id -> item index.
        is_train: when True a ``neg`` column is sampled for every row.

    Only ``self.df`` and ``self.is_train`` are read by ``__getitem__``, so
    instances pickled by an earlier version of this class still load and work.
    """
    def __init__(self, df, user2idx, item2idx, is_train:bool=True) -> None:
        super().__init__()
        self.is_train = is_train
        self.user2idx = user2idx
        self.item2idx = item2idx
        self.n_user = len(self.user2idx)
        self.n_item = len(self.item2idx)

        # Work on a private copy: mapping the ids in place would overwrite the
        # caller's frame, and re-running the cell would then map every
        # (already mapped) id to NaN.
        self.df = df.copy()
        # mapping id2idx
        self.df['article_id'] = self.df['article_id'].map(self.item2idx)
        self.df['customer_id'] = self.df['customer_id'].map(self.user2idx)

        # negative items are generated for training data only
        if is_train:
            self.df['neg'] = np.zeros(len(self.df), dtype=int)
            self._make_triples_data()

    def __getitem__(self, index):
        # positional access: DataLoader indices are positions, not index labels
        user = self.df['customer_id'].iat[index]
        pos = self.df['article_id'].iat[index]

        if self.is_train:
            neg = self.df['neg'].iat[index]
            return user, pos, neg

        return user, pos

    def _neg_sampling(self, pos_list):
        '''
        Draw one item index in [0, n_item) that is not in ``pos_list``.

        Args:
            pos_list: item indices already used by the user (any iterable; a
                set/frozenset is used as-is to avoid re-hashing per call).

        Returns:
            int: the sampled negative item index.

        Raises:
            ValueError: if every item index is in ``pos_list`` (rejection
                sampling would otherwise never terminate).
        '''
        seen = pos_list if isinstance(pos_list, (set, frozenset)) else set(pos_list)
        # cheap length test first; the exact scan only runs in the rare case
        if len(seen) >= self.n_item and all(i in seen for i in range(self.n_item)):
            raise ValueError("every item is already a positive; cannot draw a negative sample")

        neg = int(np.random.randint(0, self.n_item, 1)[0])
        while neg in seen:
            neg = int(np.random.randint(0, self.n_item, 1)[0])
        return neg

    def _make_triples_data(self):
        '''
        Fill the ``neg`` column with one sampled negative item per row.

        Rows are grouped by user once, instead of scanning the whole frame for
        every user index.
        '''
        articles = self.df['article_id'].to_numpy()
        neg = np.zeros(len(self.df), dtype=int)
        rows_by_user = self.df.groupby('customer_id').indices  # user -> row positions

        for user in tqdm(sorted(rows_by_user)):
            rows = rows_by_user[user]
            seen = set(articles[rows].tolist())  # every item this user interacted with
            for row in rows:  # one negative per purchase of this user
                neg[row] = self._neg_sampling(seen)

        self.df['neg'] = neg

    def __len__(self):
        return len(self.df)
    
class HMTestDataset(Dataset):
    """
    Evaluation dataset yielding (user, pos, neg) triples of integer indices.

    Args:
        df: held-out interactions with raw ``customer_id`` / ``article_id`` columns.
        user2idx: mapping raw customer id -> user index.
        item2idx: mapping raw article id -> item index.
        train_df: training interactions whose ``customer_id`` / ``article_id``
            columns are compared directly with the mapped values of ``df``
            (i.e. they must already hold indices).

    The negative of each row is sampled outside the user's training items and
    outside the row's own positive item.
    """
    def __init__(self, df, user2idx, item2idx, train_df) -> None:
        super().__init__()
        self.train_df = train_df
        self.user2idx = user2idx
        self.item2idx = item2idx
        self.n_user = len(self.user2idx)
        self.n_item = len(self.item2idx)

        # Work on a private copy so the caller's frame keeps its raw ids and
        # the cell can be re-run.
        self.df = df.copy()
        # mapping id2idx
        self.df['article_id'] = self.df['article_id'].map(self.item2idx)
        self.df['customer_id'] = self.df['customer_id'].map(self.user2idx)
        self.df['neg'] = np.zeros(len(self.df), dtype=int)
        self._make_triples_data()

    def __getitem__(self, index):
        # positional access: DataLoader indices are positions, not index labels
        user = self.df['customer_id'].iat[index]
        pos = self.df['article_id'].iat[index]
        neg = self.df['neg'].iat[index]
        return user, pos, neg

    def _neg_sampling(self, pos_list):
        '''
        Draw one item index in [0, n_item) that is not in ``pos_list``.

        Args:
            pos_list: item indices to exclude (any iterable; a set/frozenset is
                used as-is).

        Returns:
            int: the sampled negative item index.

        Raises:
            ValueError: if every item index is excluded (rejection sampling
                would otherwise never terminate).
        '''
        seen = pos_list if isinstance(pos_list, (set, frozenset)) else set(pos_list)
        # cheap length test first; the exact scan only runs in the rare case
        if len(seen) >= self.n_item and all(i in seen for i in range(self.n_item)):
            raise ValueError("every item is already a positive; cannot draw a negative sample")

        neg = int(np.random.randint(0, self.n_item, 1)[0])
        while neg in seen:
            neg = int(np.random.randint(0, self.n_item, 1)[0])
        return neg

    def _make_triples_data(self):
        '''
        Fill the ``neg`` column with one sampled negative item per test row.

        The per-user sets of training items are built in a single pass over
        ``train_df`` instead of filtering the whole training frame per row.
        '''
        train_items_by_user = {}
        train_pairs = zip(self.train_df['customer_id'].tolist(), self.train_df['article_id'].tolist())
        for user, item in train_pairs:
            train_items_by_user.setdefault(user, set()).add(item)

        users = self.df['customer_id'].tolist()
        items = self.df['article_id'].tolist()
        neg = np.zeros(len(self.df), dtype=int)

        for row, (user, item) in enumerate(tqdm(zip(users, items), total=len(self.df))):
            # exclude the held-out positive too, otherwise neg could equal pos
            excluded = train_items_by_user.get(user, set()) | {item}
            neg[row] = self._neg_sampling(excluded)

        self.df['neg'] = neg

    def __len__(self):
        return len(self.df)


In [None]:
# Hold out the last row of every customer as test data (this is the most recent
# purchase only if n_interaction_data is in chronological order).
# tail(1) always returns the selected rows with their ORIGINAL row index, which
# the index-based exclusion on the next line depends on; the index/columns
# returned by nth(-1) differ between pandas versions.
test_df = n_interaction_data.groupby('customer_id').tail(1)
train_df = n_interaction_data[~n_interaction_data.index.isin(test_df.index)] # every row that is not in the test split
test_df = test_df.reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

In [None]:
print(n_interaction_data.shape, test_df.shape, train_df.shape, test_df.shape[0]+train_df.shape[0])

In [None]:
# split 잘 되었는지 확인해보기
test_df.iloc[0].customer_id
display(test_df[test_df.customer_id == '0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79'])
display(train_df[train_df.customer_id == '0021da829b898f82269fc51feded4eac2129058ee95bd75bb1591e2eb14ecc79'])

In [None]:
# train_dataset = HMDataset(train_df, user2idx, item2idx)
# test_dataset = HMDataset(test_df, user2idx, item2idx, is_train=False)

In [None]:
# def save_pt(data, path):
#     with open(path, "wb") as file:
#         torch.save(data, file)

# save_pt(train_dataset, "./dataset/train_dataset.pt")
# save_pt(test_dataset, "./dataset/test_dataset.pt")

#### eval을 위한 testset 만드는 부분 추가


In [None]:
def save_pt(data, path):
    """Serialize ``data`` with ``torch.save`` into the file at ``path`` (opened in binary write mode)."""
    with open(path, mode="wb") as handle:
        torch.save(data, handle)

import torch
train_dataset = torch.load("./dataset/train_dataset.pt")
test_dataset = HMTestDataset(test_df, user2idx, item2idx, train_dataset.df)
save_pt(test_dataset, "./dataset/new_test_dataset.pt")

In [None]:
batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

In [None]:
train_dataset.df.head()

In [None]:
test_dataset.df.head()

### model

In [None]:
from torch import nn

class VBPR(nn.Module):
    """
    Matrix-factorisation scorer with an additional visual term.

    The score of a (user u, item i) pair is
        offset + user_bias[u] + item_bias[i] + user_emb[u] . item_emb[i]
        + user_vis_emb[u] . (item_vis_emb @ f_i) + vis_bias . f_i
    where f_i is row i of ``img_embedding``.

    Args:
        n_user: number of users.
        n_item: number of items.
        K: size of the latent user/item factors.
        D: size of the visual factors.
        img_embedding: tensor of shape (n_item, F) indexed by item index.
    """
    def __init__(self, n_user, n_item, K, D, img_embedding) -> None:
        super().__init__()
        # Registered as a non-persistent buffer: it follows model.to(device)
        # but is kept out of state_dict(), so checkpoint keys are unchanged.
        self.register_buffer("feat_map", img_embedding.float(), persistent=False) # item*F
        self.n_user = n_user
        self.n_item = n_item
        self.K = K
        self.D = D
        self.F = self.feat_map.shape[1]

        self.offset = nn.Parameter(torch.zeros(1))
        self.user_bias = nn.Embedding(self.n_user,1) # user*1
        self.item_bias = nn.Embedding(self.n_item,1) # item*1
        self.vis_bias = nn.Embedding(self.F,1)       # F*1
        self.user_emb = nn.Embedding(self.n_user,self.K) # user*K
        self.item_emb = nn.Embedding(self.n_item,self.K) # item*K
        self.item_vis_emb = nn.Embedding(self.D, self.F) # D*F
        self.user_vis_emb = nn.Embedding(self.n_user, self.D) # user*D

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform initialisation of every embedding table (``offset`` stays 0)."""
        tables = (self.user_bias, self.item_bias, self.vis_bias, self.user_emb,
                  self.item_emb, self.item_vis_emb, self.user_vis_emb)
        for table in tables:
            nn.init.xavier_uniform_(table.weight)

    def cal_each(self, user, item):
        """
        Score aligned (user[k], item[k]) pairs.

        Args:
            user: LongTensor of user indices, shape (B,).
            item: LongTensor of item indices, shape (B,).

        Returns:
            (scores, params): ``scores`` has shape (B,) (0-dim when B == 1) and
            ``params`` is the tuple (offset, user_bias[u], item_bias[i],
            vis_bias, user_emb[u], item_emb[i], item_vis_emb, user_vis_emb[u])
            used by the loss for regularisation.
        """
        beta_u = self.user_bias(user)      # (B, 1)
        beta_i = self.item_bias(item)      # (B, 1)
        gamma_u = self.user_emb(user)      # (B, K)
        gamma_i = self.item_emb(item)      # (B, K)
        theta_u = self.user_vis_emb(user)  # (B, D)
        feat = self.feat_map[item]         # (B, F)

        # Row-wise dot products. A (B, K) @ (K, B) product followed by
        # sum(dim=1) would add up the interaction with EVERY item of the batch,
        # making a pair's score depend on the batch composition.
        mf_term = self.offset + beta_u.squeeze(-1) + beta_i.squeeze(-1) + (gamma_u * gamma_i).sum(dim=-1)
        vis_proj = feat @ self.item_vis_emb.weight.T  # (B, D): item_vis_emb @ f_i for each row
        vis_term = (theta_u * vis_proj).sum(dim=-1) + (feat @ self.vis_bias.weight).squeeze(-1)

        params = (self.offset, beta_u, beta_i, self.vis_bias.weight, gamma_u, gamma_i, self.item_vis_emb.weight, theta_u)
        return (mf_term+vis_term).squeeze(), params

    def forward(self, user, pos, neg):
        """Return (score(user,pos) - score(user,neg), pos_params, neg_params)."""
        xui, pos_params = self.cal_each(user,pos)
        xuj, neg_params = self.cal_each(user,neg)
        return (xui-xuj), pos_params, neg_params


In [None]:
class BPRLoss(nn.Module):
    """
    Pairwise ranking loss ``-sum(logsigmoid(diff))`` plus L2 regularisation.

    Args:
        reg_theta: weight of the L2 term on offset, biases and user/item factors.
        reg_beta: weight of the L2 term on the visual bias.
        reg_e: weight of the L2 term on the visual projection matrix.
    """
    def __init__(self, reg_theta, reg_beta, reg_e) -> None:
        super().__init__()
        self.reg_theta = reg_theta
        self.reg_beta = reg_beta
        self.reg_e = reg_e

    @staticmethod
    def _distinct(first, second):
        """Return ``(first,)`` when both names refer to the same tensor object, else both."""
        return (first,) if first is second else (first, second)

    def _cal_l2(self, *tensors):
        """Return 0.5 * the sum of squared entries over all given tensors."""
        total = 0
        for tensor in tensors:
            total += tensor.pow(2).sum()
        return 0.5 * total

    def _reg_term(self, pos_params, neg_params):
        """
        L2 penalty over the parameters of the positive and the negative pass.

        Both tuples follow the 8-element layout unpacked below. The visual bias
        and the visual projection matrix are whole weight tensors rather than
        per-sample rows; when the two tuples carry the very same tensor object
        it is penalised once instead of twice.
        """
        alpha, beta_u, beta_pos, beta_prime_pos, gamma_u, gamma_pos, e_pos, theta_u = pos_params
        _, _, beta_neg, beta_prime_neg, _, gamma_neg, e_neg, _ = neg_params

        reg_out = self.reg_theta * self._cal_l2(alpha, beta_u, beta_pos, beta_neg, theta_u, gamma_u, gamma_pos, gamma_neg)
        reg_out += self.reg_beta * self._cal_l2(*self._distinct(beta_prime_pos, beta_prime_neg))
        reg_out += self.reg_e * self._cal_l2(*self._distinct(e_pos, e_neg))

        return reg_out

    def forward(self, diff, pos_params, neg_params):
        """Return the summed ranking loss of ``diff`` plus the regularisation term."""
        loss = -nn.functional.logsigmoid(diff).sum() # sigma(x_uij)
        loss += self._reg_term(pos_params, neg_params) # reg_term

        return loss

In [None]:
def train(model, optimizer, dataloader, criterion, device):
    """
    Run one optimisation pass over ``dataloader``.

    Args:
        model: module called as ``model(user, pos, neg)`` and returning
            ``(diff, pos_params, neg_params)``.
        optimizer: optimizer stepping the model parameters.
        dataloader: iterable of ``(user, pos, neg)`` tensor batches.
        criterion: callable ``criterion(diff, pos_params, neg_params) -> loss``.
        device: device the batches are moved to.

    Returns:
        float: mean of the per-batch loss values; 0.0 when the dataloader
        yields no batch.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0  # counted while iterating, so iterables without __len__ work too

    for user, pos, neg in tqdm(dataloader):
        user = user.to(device)
        pos = pos.to(device)
        neg = neg.to(device)

        diff, pos_params, neg_params = model(user, pos, neg)
        loss = criterion(diff, pos_params, neg_params)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        # nothing was trained; avoid dividing by zero
        return 0.0
    return total_loss/n_batches

In [None]:
from torch.optim import Adam

n_user = train_dataset.n_user
n_item = train_dataset.n_item

K = 20
D = 20
reg_theta = 0.1
reg_beta = 0.1
reg_e = 0

lr = 0.001
epoch = 10

device = "cuda" if torch.cuda.is_available() else "cpu" 
criterion = BPRLoss(reg_theta, reg_beta, reg_e).to(device)
img_emb = img_emb.to(device)

In [None]:
vbpr_20 = VBPR(n_user, n_item, K, D, img_emb).to(device)
optimizer = Adam(params = vbpr_20.parameters(), lr=lr)
train_loss_20 = []

for i in range(epoch):
    train_loss_20.append(train(vbpr_20, optimizer, train_dataloader, criterion, device))
    print(f'EPOCH : {i} | LOSS : {train_loss_20[-1]:.10}')

In [None]:
K = 40

vbpr_40 = VBPR(n_user, n_item, K, D, img_emb).to(device)
optimizer = Adam(params = vbpr_40.parameters(), lr=lr)
train_loss_40 = []

for i in range(epoch):
    train_loss_40.append(train(vbpr_40, optimizer, train_dataloader, criterion, device))
    print(f'EPOCH : {i} | LOSS : {train_loss_40[-1]:.10}')

In [None]:
K = 60

vbpr_60 = VBPR(n_user, n_item, K, D, img_emb).to(device)
optimizer = Adam(params = vbpr_60.parameters(), lr=lr)
train_loss_60 = []

for i in range(epoch):
    train_loss_60.append(train(vbpr_60, optimizer, train_dataloader, criterion, device))
    print(f'EPOCH : {i} | LOSS : {train_loss_60[-1]:.10}')

In [None]:
import matplotlib.pyplot as plt

plt.plot(range(epoch), train_loss_20, label="VBPR(factor 20)")
plt.plot(range(epoch), train_loss_40, label="VBPR(factor 40)")
plt.plot(range(epoch), train_loss_60, label="VBPR(factor 60)")
plt.legend()

### Top K rec test (해당 셀 이후부터는 아직 수정되지 않음)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

class Recommender:
    """
    Rank the items a user has not interacted with, by model score blended with
    the image similarity to a query image.

    Args:
        model: object exposing ``eval()`` and ``cal_each(user, item)`` returning
            ``(scores, params)``.
        query_img: embedding of the query image (array-like or tensor holding a
            single vector with the same width as ``img_emb``).
        train_dataset: dataset exposing ``.df`` with ``customer_id`` and
            ``article_id`` index columns, or a wrapper exposing it as ``.dataset``.
        n_item: number of items; candidate item indices are ``range(n_item)``.
        img_emb: tensor of item image embeddings indexed by item index.
        device: device on which the model is evaluated.
    """
    def __init__(self, model, query_img, train_dataset, n_item, img_emb, device) -> None:
        self.model = model
        # accept the dataset itself or a wrapper that keeps it in `.dataset`
        self.train_df = getattr(train_dataset, "dataset", train_dataset).df
        self.all_item = set(range(0,n_item))
        self.query_img = query_img
        self.img_emb = img_emb
        self.device = device

    def _get_img_sim(self, itemset):
        """Return a 1-D tensor with the cosine similarity between the query image and each item of ``itemset``."""
        items = torch.as_tensor(itemset, dtype=torch.long, device=self.img_emb.device)
        query = torch.as_tensor(self.query_img, dtype=self.img_emb.dtype, device=self.img_emb.device).reshape(1, -1)
        # one broadcasted call instead of one call per item
        return nn.functional.cosine_similarity(query, self.img_emb[items], dim=1)

    def _get_unobs_items(self, user_idx):
        """Return the sorted item indices that do not appear in the user's training rows."""
        obs_item_set = set(self.train_df.loc[self.train_df['customer_id'] == user_idx, 'article_id'])
        return sorted(self.all_item - obs_item_set)

    def user_rank(self, user_idx:int, top_k:int=None, img_sim_weight:float=0.5):
        """
        Score every unobserved item of ``user_idx`` and return the best ones.

        The model scores are divided by their maximum absolute value (range
        [-1, 1]) and ``img_sim_weight`` times the image similarity is added.

        Returns:
            numpy array of shape (n, 3) with columns (user, item, score), sorted
            by descending score and truncated to ``top_k`` rows (all rows when
            ``top_k`` is None).
        """
        self.model.eval()
        unobs_itemset = self._get_unobs_items(user_idx)
        if not unobs_itemset:
            return np.empty((0, 3))

        with torch.no_grad():
            itemset = torch.tensor(unobs_itemset, dtype=torch.long, device=self.device)
            user = torch.full_like(itemset, int(user_idx))

            out, _ = self.model.cal_each(user, itemset)
            out = out.reshape(-1).double()
            max_abs = out.abs().max()
            if max_abs > 0:  # an all-zero score vector is left unscaled
                out = out / max_abs # range [-1~1]

            img_sim = self._get_img_sim(itemset).to(out.device, torch.float64)
            out = out + img_sim_weight*img_sim

            columns = (user.double().to(out.device), itemset.double().to(out.device), out)
            scores = torch.stack(columns, dim=1).cpu().numpy()

        sorted_scores = scores[(-scores[:, 2]).argsort(kind="stable")]
        return sorted_scores[:top_k]

In [None]:
def eval(recommender, test_dataset, top_k=20):
    """
    Compare the top-k recommendations of every test user with the held-out items.

    Note: this function keeps its original name, which shadows the builtin
    ``eval`` in the notebook namespace.

    Args:
        recommender: object exposing ``user_rank(user, top_k)`` returning an
            array whose column 1 holds the recommended item indices.
        test_dataset: dataset exposing ``.df`` with ``customer_id`` and
            ``article_id`` index columns, or a wrapper exposing it as ``.dataset``.
        top_k: number of recommendations requested per user.

    Returns:
        (res_true, res_topk, res_hit): three dicts keyed by user index holding
        the held-out items, the recommended items and the number of held-out
        items found among the recommendations.
    """
    df = getattr(test_dataset, "dataset", test_dataset).df
    res_true = {}
    res_topk = {}
    res_hit = {}

    held_out_by_user = df.groupby('customer_id')['article_id']
    for user, held_out in tqdm(held_out_by_user, total=held_out_by_user.ngroups):
        user = int(user)
        res = recommender.user_rank(user, top_k)
        topk_item = [int(item) for item in res[:,1]]
        true_item = [int(item) for item in held_out.dropna()]
        hit = len(set(true_item).intersection(set(topk_item)))
        res_true[user] = true_item
        res_topk[user] = topk_item
        res_hit[user] = hit

    return res_true, res_topk, res_hit

In [None]:
query = fclip.encode_images(query_img, batch_size=1)
# res = img_sim(query, feat_map_vgg)
# res
recommender = Recommender(vbpr_20, query, train_dataset, n_item, img_emb, device)

res_true, res_topk, res_hit = eval(recommender, test_dataset)