In [1]:
import os
import time
import numpy as np
import pandas as pd
import pickle
import zipfile
from tqdm import tqdm
from scipy import sparse as ssp
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

In [2]:
# 1. Load the wide rating table from CSV.
file_path = "data_ver1_final.csv"
wide_ratings = pd.read_csv(file_path)

# 2. Convert wide format -> long format with melt: the first column is kept as the
#    identifier and every other column header becomes a value in the 'wine' column.
id_column = wide_ratings.columns[0]
long_ratings = wide_ratings.melt(
    id_vars=[id_column],
    var_name='wine',
    value_name='rating',
)
long_ratings.columns = ['user', 'wine', 'rating']

# Keep only the rows without missing values (i.e. pairs that actually carry a rating).
df = long_ratings.dropna()

print(df.head())
print(df.shape)
df

             user                             wine  rating
11    Abright Pan  The Holy Trinity Red Blend 2018     4.2
95   Cheoljin Lee  The Holy Trinity Red Blend 2018     4.2
143       EVSTERS  The Holy Trinity Red Blend 2018     4.2
163  Frank Posega  The Holy Trinity Red Blend 2018     4.0
252           L F  The Holy Trinity Red Blend 2018     3.8
(342408, 3)


Unnamed: 0,user,wine,rating
11,Abright Pan,The Holy Trinity Red Blend 2018,4.2
95,Cheoljin Lee,The Holy Trinity Red Blend 2018,4.2
143,EVSTERS,The Holy Trinity Red Blend 2018,4.2
163,Frank Posega,The Holy Trinity Red Blend 2018,4.0
252,L F,The Holy Trinity Red Blend 2018,3.8
...,...,...,...
16839907,Brandon Martini,Aspaldi Crianza 2016,4.2
16839946,Cookbook Guy,Aspaldi Crianza 2016,3.8
16840633,Paul Sweeney,Berola 2015,3.8
16840635,Pavel Ivanov RWC,Berola 2015,3.5


In [3]:
# Assign each distinct user / wine an integer id, numbered in order of first appearance.
user2id = {user: i for i, user in enumerate(df['user'].unique())}
wine2id = {wine: i for i, wine in enumerate(df['wine'].unique())}

# Apply the mappings to the existing DataFrame.
df['user'] = df['user'].map(user2id)
df['wine'] = df['wine'].map(wine2id)

# Overwrite every rating with 1 so each observed (user, wine) pair becomes a positive label.
# (The previous `'rating' == ` selector was a syntax error; the column label alone is intended.)
df.loc[:, 'rating'] = 1
df.shape

(342408, 3)

In [4]:
# Hold out 20% of the rows, then cut that hold-out in half to get validation and test
# sets. Both splits use the same fixed random_state.
train, temp = train_test_split(df, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Report the size of each split.
for split in (train, val, test):
    print(split.shape)

(273926, 3)
(34241, 3)
(34241, 3)


In [5]:
# Candidate pool for negatives: every wine id that appears anywhere in df.
all_wines = df['wine'].unique()

users, wines, labels = [], [], []

# Observed (user, wine) pairs of the training split, stored as a set for fast membership tests.
user_wine_set = set(zip(train['user'], train['wine']))

num_neg = 4  # number of negatives drawn for each positive pair

# Dedicated seeded generator: the negatives no longer depend on the global NumPy random
# state, so re-running the cell yields the same draws (42 matches the split's random_state).
rng = np.random.default_rng(42)
num_wines = len(all_wines)

for u, i in tqdm(user_wine_set):
    # The observed pair itself is the positive example.
    users.append(u)
    wines.append(i)
    labels.append(1)
    for _ in range(num_neg):
        # Rejection sampling: redraw until the wine is not one this user has in the training set.
        # NOTE(review): this never terminates if a user is paired with every wine in all_wines.
        neg_item = all_wines[rng.integers(num_wines)]
        while (u, neg_item) in user_wine_set:
            neg_item = all_wines[rng.integers(num_wines)]
        users.append(u)
        wines.append(neg_item)
        labels.append(0)

100%|██████████| 273926/273926 [00:11<00:00, 23942.04it/s]


In [6]:
# Assemble the sampled positives and negatives into one training frame; the 0/1 labels
# are stored under the 'rating' column name.
train_neg = pd.DataFrame({
    'user': users,
    'wine': wines,
    'rating': labels,
})
print(train_neg.head())
print(train_neg.shape)

   user   wine  rating
0   322  12443       1
1   322  22496       0
2   322  17577       0
3   322  21851       0
4   322  26269       0
(1369630, 3)


In [7]:
# Preview the first rows of the validation split, then of the test split.
for held_out in (val, test):
    print(held_out.head())

          user   wine  rating
3931686     21   8437     1.0
13970172   165  29978     1.0
7139152     19  15320     1.0
11128119   320  23880     1.0
6712285     16  14404     1.0
          user   wine  rating
634715     363   1362     1.0
13812975   106  29641     1.0
156752     253    336     1.0
8770038    208  18819     1.0
2601361     60   5582     1.0


In [8]:
# Write each split to its own CSV file in the working directory, omitting the index column.
exports = {
    "train_data_ver1.csv": train_neg,
    "val_data_ver1.csv": val,
    "test_data_ver1.csv": test,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name, index=False)

In [None]:
# NOTE(review): shell command kept for reference, not executed by this notebook. Presumably it
# launches training on the "wine" dataset — TODO confirm the entry point, working directory,
# and arguments.
# python code/main.py --decay=1e-4 --lr=0.001 --layer=3 --seed=2025 --dataset="wine" --topks="[20]" --recdim=64 --epochs=10 --testbatch=46