In [1]:
import pandas as pd
import numpy as np

In [2]:
import re

In [3]:
links = pd.read_csv("../data/ml-100k/ml-latest-small/links.csv")
movies = pd.read_csv("../data/ml-100k/ml-latest-small/movies.csv")
ratings = pd.read_csv("../data/ml-100k/ml-latest-small/ratings.csv")
tags = pd.read_csv("../data/ml-100k/ml-latest-small/tags.csv")

# ML-100K DataLoader

## RAW Dataset Structure

** link.csv 는 사용안하는듯 **

In [4]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Step 1 Preprocessing

***컬럼명 변경***

- userId -> uid (공통 유저 ID)
- movieId -> sid (공통 아이템 ID)
- rating -> rating
- timestamp -> timestamp

In [8]:
# load_ratings_df

df = pd.read_csv("../data/ml-100k/ml-latest-small/ratings.csv")
df.columns = ['uid', 'sid', 'rating', 'timestamp']
df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Step 2 Preprocessing

In [9]:
# load_meta_dict (dataframe -> dictionary)

movies_df = pd.read_csv("../data/ml-100k/ml-latest-small/movies.csv")
meta_dict = {}
for row in movies_df.itertuples():
    title = row[2][:-7] # remove year (optional)
    year = row[2][-7:]

    title = re.sub('\(.*?\)', '', title).strip()

    if any(', ' + x in title.lower()[-5:] for x in ['a', 'an', 'the']):
        title_pre = title.split(', ')[:-1]
        title_post = title.split(', ')[-1]
        title_pre = ', '.join(title_pre)

        title = title_post + ' ' + title_pre

    meta_dict[row[1]] = title + year


- 영화 제목에서 괄호 제거 및 공백 처리
- 제목에 관사(a, an, the)가 포함된 경우 제목의 앞쪽으로 이동 (ex: "Lion, The" -> "The Lion")

최종 결과:
- key: movieId / value: title + year 형태의 딕셔너리(meta_dict) 생성

In [10]:
# meta_dict 5개 출력
for i in list(meta_dict.keys())[:5]:
    print(f"{i} : '{meta_dict[i]}'")

1 : 'Toy Story (1995)'
2 : 'Jumanji (1995)'
3 : 'Grumpier Old Men (1995)'
4 : 'Waiting to Exhale (1995)'
5 : 'Father of the Bride Part II (1995)'


In [11]:
df = df[df['sid'].isin(meta_dict)]

In [12]:
df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Step 3 Preprocessing

uid 열과 sid 열의 고유 값 각각에 새로운 정수 인덱스를 할당 <br/>
-> 효율성 때문에 </br>
- 메모리 효율성을 높임
- 임베딩 레이어 크기 축소
- 모델의 정확도와 효율성 향상

In [13]:
# densify_index

umap = {u: i for i, u in enumerate(set(df['uid']), start=1)}
smap = {s: i for i, s in enumerate(set(df['sid']), start=1)}

df['uid'] = df['uid'].map(umap)
df['sid'] = df['sid'].map(smap)

df.head()

Unnamed: 0,uid,sid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Step 4 Preprocessing

1. uid 열을 기준으로 데이터프레임을 그룹화 (user_group 변수)
2. timestamp 및 sid 순서대로 정렬후 sid 값만 가진 리스트로 변환
3. split 작업 수행 (train - 처음부터 마지막 두번째 전 (n개), val - 마지막에서 두번째 (1개), test - 마지막 항목 (1개))

In [14]:
# split_df

# 1.
user_count = len(umap)
user_group = df.groupby('uid')

# 2.
user2items = user_group.apply(
    lambda d: list(d.sort_values(by=['timestamp', 'sid'])['sid']))

# 3.
train, val, test = {}, {}, {}
for i in range(user_count):
    user = i + 1
    items = user2items[user]
    # train은 첫번째 ~ 맨뒤 + 2 사이의 값들 (list)
    # val은 맨뒤에서 2번째 값
    # test은 맨뒤 값
    train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]

  user2items = user_group.apply(


In [15]:
meta = {smap[k]: v for k, v in meta_dict.items() if k in smap}

In [16]:
# 5개 출력
for i in list(meta_dict.keys())[:5]:
    print(f"{i} : '{meta[i]}'")

1 : 'Toy Story (1995)'
2 : 'Jumanji (1995)'
3 : 'Grumpier Old Men (1995)'
4 : 'Waiting to Exhale (1995)'
5 : 'Father of the Bride Part II (1995)'


In [17]:
dataset = {
    'train': train,
    'val': val,
    'test': test,
    'meta': meta,
    'umap': umap,
    'smap': smap
}

**최종 결과는** <br/>
- key: uid (유저 ID) 
- value: 아이템 ID(sid) 리스트 (pair된 timestamp 값에 따라 오름차순 정렬)

In [18]:
for i in list(train.keys())[:5]:
    print(f"{i} : '{train[i]}'")

1 : '[710, 1033, 1705, 2241, 2416, 3033, 3062, 3176, 99, 423, 2437, 1269, 2563, 223, 921, 342, 212, 1289, 2297, 2039, 2765, 1303, 2774, 1345, 1480, 2213, 2931, 219, 319, 519, 905, 205, 478, 2612, 3236, 1514, 3, 2167, 2132, 935, 983, 1522, 1079, 1616, 805, 3103, 248, 931, 952, 1853, 2451, 353, 1654, 1692, 1774, 1841, 1872, 2388, 1786, 1824, 2956, 1738, 616, 872, 1909, 2961, 1609, 2920, 2921, 1019, 1021, 1037, 2289, 3223, 1020, 2513, 2622, 1715, 2192, 2518, 439, 1043, 1045, 1114, 3135, 108, 821, 1641, 2516, 2517, 1687, 6, 2071, 2155, 2559, 557, 2557, 2991, 3081, 1098, 1343, 2489, 302, 1219, 1929, 2050, 2154, 2922, 459, 1352, 405, 3161, 2248, 70, 660, 911, 1742, 3172, 2011, 801, 2553, 2713, 974, 2058, 1796, 556, 335, 597, 1731, 348, 1780, 2106, 2230, 1324, 1435, 2088, 2249, 528, 663, 894, 1189, 1775, 886, 1, 1105, 2355, 2477, 888, 895, 1733, 1818, 1820, 2595, 561, 605, 1759, 1771, 1777, 1822, 892, 887, 1797, 893, 1718, 3402, 50, 569, 944, 1036, 1382, 283, 1924, 2201, 1208, 1531, 2551, 138

In [19]:
for i in list(val.keys())[:5]:
    print(f"{i} : '{val[i]}'")

1 : '[1699]'
2 : '[7319]'
3 : '[4218]'
4 : '[4100]'
5 : '[286]'


In [20]:
for i in list(test.keys())[:5]:
    print(f"{i} : '{test[i]}'")

1 : '[2123]'
2 : '[7205]'
3 : '[2068]'
4 : '[3594]'
5 : '[453]'


In [21]:
for i in list(meta.keys())[:5]:
    print(f"{i} : '{meta[i]}'")

1 : 'Toy Story (1995)'
2 : 'Jumanji (1995)'
3 : 'Grumpier Old Men (1995)'
4 : 'Waiting to Exhale (1995)'
5 : 'Father of the Bride Part II (1995)'


In [22]:
for i in list(umap.keys())[:5]:
    print(f"{i} : '{umap[i]}'")

1 : '1'
2 : '2'
3 : '3'
4 : '4'
5 : '5'


In [23]:
for i in list(smap.keys())[:5]:
    print(f"{i} : '{smap[i]}'")

1 : '1'
2 : '2'
3 : '3'
4 : '4'
5 : '5'


# LRUDataLoader

In [32]:
import torch
import numpy as np

In [24]:
import easydict

# jupyter didn't support argparse. so, I use 'easydict' module
args = easydict.EasyDict({
    ################
    # Dataset
    ################
    'dataset_code': 'ml-100k', # ml-100k, beauty, games
    'min_rating': 0,  # default: 0
    'min_uc': 5,  # default: 5
    'min_sc': 5,  # default: 5
    'seed': 42,  # default: 42

    ################
    # Dataloader
    ################
    'train_batch_size': 64,  # default: 64
    'val_batch_size': 64,  # default: 64
    'test_batch_size': 64,  # default: 64
    'num_workers': 0,  # default: 8
    'sliding_window_size': 1.0,  # default: 1.0
    'negative_sample_size': 10,  # default: 10

    ################
    # Trainer
    ################
    # optimization #
    'device': 'cuda',  # default: 'cuda'  # choices: ['cpu', 'cuda']
    'num_epochs': 500,  # default: 500
    'optimizer': 'AdamW',  # default: 'AdamW'  # choices: ['AdamW', 'Adam']
    'weight_decay': 0.01,  # default: None
    'adam_epsilon': 1e-9,  # default: 1e-9
    'momentum': None,  # default: None
    'lr': 0.001,  # default: 0.001
    'max_grad_norm': 5.0,  # default: 5.0
    'enable_lr_schedule': True,  # default: True
    'decay_step': 10000,  # default: 10000
    'gamma': 1,  # default: 1
    'enable_lr_warmup': True,  # default: True
    'warmup_steps': 100,  # default: 100

    # evaluation #
    'val_strategy': 'iteration',  # default: 'iteration'  # choices: ['epoch', 'iteration']
    'val_iterations': 500,  # default: 500  # only for iteration val_strategy
    'early_stopping': True,  # default: True
    'early_stopping_patience': 20,  # default: 20
    'metric_ks': [1, 5, 10, 20, 50],  # default: [1, 5, 10, 20, 50]
    'rerank_metric_ks': [1, 5, 10],  # default: [1, 5, 10]
    'best_metric': 'Recall@10',  # default: 'Recall@10'
    'rerank_best_metric': 'NDCG@10',  # default: 'NDCG@10'
    'use_wandb': False,  # default: False

    ################
    # Retriever Model
    ################
    'model_code': 'lru',  # default: None
    'bert_max_len': 50,  # default: 50
    'bert_hidden_units': 64,  # default: 64
    'bert_num_blocks': 2,  # default: 2
    'bert_num_heads': 2,  # default: 2
    'bert_head_size': 32,  # default: 32
    'bert_dropout': 0.2,  # default: 0.2
    'bert_attn_dropout': 0.2,  # default: 0.2
    'bert_mask_prob': 0.25,  # default: 0.25

    ################
    # LLM Model
    ################
    'llm_base_model': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_base_tokenizer': 'meta-llama/Llama-2-7b-hf',  # default: 'meta-llama/Llama-2-7b-hf'
    'llm_max_title_len': 32,  # default: 32
    'llm_max_text_len': 1536,  # default: 1536
    'llm_max_history': 20,  # default: 20
    'llm_train_on_inputs': False,  # default: False
    'llm_negative_sample_size': 19,  # default: 19  # 19 negative & 1 positive
    'llm_system_template': "Given user history in chronological order, recommend an item from the candidate pool with its index letter.",  # default: "Given user history in chronological order, recommend an item from the candidate pool with its index letter."
    'llm_input_template': 'User history: {}; \n Candidate pool: {}',  # default: 'User history: {}; \n Candidate pool: {}'
    'llm_load_in_4bit': True,  # default: True
    'llm_retrieved_path': None,  # default: None
    'llm_cache_dir': None,  # default: None

    ################
    # Lora
    ################
    'lora_r': 8,  # default: 8
    'lora_alpha': 32,  # default: 32
    'lora_dropout': 0.05,  # default: 0.05
    'lora_target_modules': ['q_proj', 'v_proj'],  # default: ['q_proj', 'v_proj']
    'lora_num_epochs': 1,  # default: 1
    'lora_val_iterations': 100,  # default: 100
    'lora_early_stopping_patience': 20,  # default: 20
    'lora_lr': 1e-4,  # default: 1e-4
    'lora_micro_batch_size': 16,  # default: 16
})


In [26]:
train = dataset['train']
val = dataset['val']
test = dataset['test']
umap = dataset['umap']
smap = dataset['smap']
rng = np.random

user_count = len(umap)
item_count = len(smap)

num_users = user_count
num_items = item_count
max_len = args.bert_max_len
sliding_size = args.sliding_window_size

print(f"user_count : {user_count}")
print(f"item_count : {item_count}")
print(f"num_users : {num_users}")
print(f"num_items : {num_items}")
print(f"max_len : {max_len}")
print(f"sliding_size : {sliding_size}")

user_count : 610
item_count : 9724
num_users : 610
num_items : 9724
max_len : 50
sliding_size : 1.0


## Train Loader

## Test Loader

최종형태: Seq: train(N개) + val(1개) -> 총 50개(**max_len**) / target: test(1개)

In [27]:
u2seq = train
u2val = val
u2answer = test
users = [u for u in sorted(u2seq.keys()) if len(u2val[u]) > 0 and len(u2answer[u]) > 0]
max_len = max_len
rng = rng

In [28]:
index = 0

In [29]:
user = users[index]
# user에 대하여 train sequence 와 val sequence를 병합
seq = u2seq[user] + u2val[user]
answer = u2answer[user]

# 전체 시퀸스에서 max_len 까지만 추출
seq = seq[-max_len:]

# 패딩 적용
padding_len = max_len - len(seq)
seq = [0] * padding_len + seq

In [33]:
torch.LongTensor(seq)

tensor([2031,  835, 1031, 1976, 2527, 1042, 1163, 2252, 2260, 1081, 1101, 2097,
        2256, 2312, 2263,  947, 2869, 1390, 2719, 1404, 2795, 1985, 1088,  824,
        1049,  153, 1943, 1093, 1063, 1029, 3134, 2594,  558,   47, 1998, 2697,
         503, 4225,  945, 1047,  141, 2929,  689, 1121, 2613,  147, 1251,  529,
        2114, 1699])

In [34]:
torch.LongTensor(answer)

tensor([2123])

## Validation Loader

Test Loader 전처리 방식과 거의 동일