In [1]:
!pip install tensorboardX

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [2]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd '/content/drive/MyDrive/Capstone/ai'

/content/drive/MyDrive/Capstone/ai


In [4]:
# Set the global git author name and e-mail for this runtime.
!git config --global user.name 'Yong'
!git config --global user.email 'whitasrgrey@gmail.com'

In [5]:
# Fetch the main branch from origin and integrate it into the working copy.
!git pull origin main

remote: Enumerating objects: 7, done.[K
remote: Counting objects:  14% (1/7)[Kremote: Counting objects:  28% (2/7)[Kremote: Counting objects:  42% (3/7)[Kremote: Counting objects:  57% (4/7)[Kremote: Counting objects:  71% (5/7)[Kremote: Counting objects:  85% (6/7)[Kremote: Counting objects: 100% (7/7)[Kremote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects:  50% (1/2)[Kremote: Compressing objects: 100% (2/2)[Kremote: Compressing objects: 100% (2/2), done.[K
remote: Total 4 (delta 2), reused 4 (delta 2), pack-reused 0[K
Unpacking objects: 100% (4/4), 453 bytes | 1024 bytes/s, done.
From https://github.com/Aggressive-3Back/ai
 * branch            main       -> FETCH_HEAD
   33cf6f6..c49f9d2  main       -> origin/main
Updating 33cf6f6..c49f9d2
Fast-forward
 data/ratings.csv | 10 [32m++++++++++[m
 1 file changed, 10 insertions(+)


# 데이터 증강

In [6]:
import pandas as pd
import numpy as np

# Tunable constants for the augmentation step.
NUM_ITEMS = 200        # item ids 1..NUM_ITEMS form the negative-sampling pool
NEG_PER_USER = 100     # upper bound on the negatives drawn for each user
AUGMENT_SEED = 42      # fixes the draw so every run writes the same CSV

# Load the raw ratings (semicolon-separated) and keep only the modelled columns.
data = pd.read_csv('./data/ratings.csv', sep=";")
new_column_order = ['user_id', 'item_id', 'rating']
data = data[new_column_order]

# Distinct users, in order of first appearance.
user_ids = data['user_id'].unique()

# Largest item id present in the raw data (informational only).
# NOTE(review): the pool below is fixed at 1..NUM_ITEMS, so an id above
# NUM_ITEMS is never drawn as a negative — confirm that this is intended.
max_item_id = max(data['item_id'].unique().tolist())
item_ids = list(range(1, NUM_ITEMS + 1))
item_pool = set(item_ids)

# Items every user has already rated, gathered in a single pass instead of
# re-filtering the whole frame once per user.
rated_by_user = data.groupby('user_id')['item_id'].apply(set).to_dict()

# A dedicated, seeded generator keeps the sampling reproducible and leaves
# NumPy's global random state untouched.
rng = np.random.default_rng(AUGMENT_SEED)

# For each user draw up to NEG_PER_USER unrated items and label them rating 0.
neg_samples = []
for user_id in user_ids:
    # Sorting gives the generator a stable candidate order to draw from.
    candidates = sorted(item_pool - rated_by_user.get(user_id, set()))
    if not candidates:
        # The user has rated every item in the pool; nothing to sample.
        continue
    n_draw = min(NEG_PER_USER, len(candidates))
    for item in rng.choice(candidates, size=n_draw, replace=False):
        neg_samples.append([user_id, int(item), 0])

# Turn the sampled negatives into a frame with the same columns as the ratings.
neg_samples_df = pd.DataFrame(neg_samples, columns=['user_id', 'item_id', 'rating'])

# Original ratings first, sampled negatives appended after them.
augmented_data = pd.concat([data, neg_samples_df], ignore_index=True)

# Persist the augmented data set; the training cell reads this file.
augmented_data.to_csv('./data/augmented_data.csv', index=False)

# 학습

In [None]:
import os
import time
import random
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tensorboardX import SummaryWriter
from sklearn.model_selection import train_test_split

# Filesystem locations used by the training run.
config = dict(
    model_path="./models/",                  # directory the best checkpoint is written to
    data_path="./data/augmented_data.csv",   # CSV read by load_data
)

# Hyperparameters and run options.
# NOTE(review): "gpu", "num_ng" and "num_ng_test" are not read by any code
# visible in this notebook section.
args = dict(
    batch_size=16,            # mini-batch size of both data loaders
    dropout=0,                # dropout probability inside the MLP tower
    epochs=10,                # number of passes over the training loader
    factor_num=8,             # width of the MF embeddings (MLP embeddings are twice this)
    gpu="0",
    layers=[64, 32, 16, 8],   # output sizes of the successive MLP layers
    lr=0.0001,                # Adam learning rate
    num_ng=4,
    num_ng_test=50,
    out=True,                 # save a checkpoint whenever HR improves
    seed=42,                  # seed for the RNGs and the train/test split
    top_k=10,                 # cut-off passed to the ranking metrics
)

# Seeding helper
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so setting it
        here only affects child processes, not the running kernel.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently does nothing when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and may pick different
    # algorithms each time, which would undo the determinism requested above.
    torch.backends.cudnn.benchmark = False

seed_everything(args['seed'])

# Dataset wrapper around the rating triples
class Rating_Dataset(torch.utils.data.Dataset):
    """Map-style dataset over three parallel sequences: users, items and ratings."""

    def __init__(self, user_list, item_list, rating_list):
        """Store the three sequences; position ``i`` of each forms one sample."""
        super().__init__()
        self.user_list, self.item_list, self.rating_list = user_list, item_list, rating_list

    def __len__(self):
        """Number of samples, taken from the length of the user sequence."""
        return len(self.user_list)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(user, item, rating)`` tuple of tensors.

        ``user`` and ``item`` are ``torch.long`` tensors, ``rating`` is a
        ``torch.float`` tensor.
        """
        columns = (
            (self.user_list, torch.long),
            (self.item_list, torch.long),
            (self.rating_list, torch.float),
        )
        return tuple(torch.tensor(values[idx], dtype=kind) for values, kind in columns)

# Data loading and preprocessing
def load_data(config, args):
    """Read the ratings CSV and build the train/test data loaders.

    Args:
        config: Mapping with key ``'data_path'`` (CSV with columns
            ``user_id``, ``item_id`` and ``rating``).
        args: Mapping with keys ``'seed'`` (split seed) and ``'batch_size'``.

    Returns:
        ``(train_loader, test_loader, user_num, item_num)``. The two counts
        are embedding-table sizes, i.e. largest id + 1: the model indexes its
        embeddings directly with the raw ids, so the table needs one row per
        possible id rather than one row per rating record.
    """
    ratings = pd.read_csv(config['data_path'])

    # Random 80/20 row split, reproducible through the configured seed.
    train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=args['seed'])

    def _make_loader(frame, shuffle):
        # Wrap one split in a Rating_Dataset and batch it.
        dataset = Rating_Dataset(
            frame['user_id'].tolist(),
            frame['item_id'].tolist(),
            frame['rating'].tolist(),
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=args['batch_size'], shuffle=shuffle, num_workers=2
        )

    train_loader = _make_loader(train_data, shuffle=True)
    test_loader = _make_loader(test_data, shuffle=False)

    # Sizes come from the full frame so ids that only occur in the test
    # split are still valid embedding indices.
    user_num = int(ratings['user_id'].max()) + 1
    item_num = int(ratings['item_id'].max()) + 1

    return train_loader, test_loader, user_num, item_num

train_loader, test_loader, user_num, item_num = load_data(config, args)


def get_hit_ratio(recommended_items, actual_item):
    """Return 1 if ``actual_item`` occurs in ``recommended_items``, otherwise 0."""
    return 1 if actual_item in recommended_items else 0

def get_ndcg(recommended_items, actual_item):
    """Score the position of ``actual_item`` inside ``recommended_items``.

    Returns ``log(2) / log(position + 2)`` for the first occurrence
    (1.0 at position 0, decreasing further down the list) and 0 when the
    item is not in the list.
    """
    if actual_item not in recommended_items:
        return 0
    position = recommended_items.index(actual_item)
    return math.log(2) / math.log(position + 2)

# NeuMF model definition
class NeuMF(nn.Module):
    """Two-branch recommender: an element-wise MF branch plus an MLP tower.

    Each branch has its own user and item embeddings. The MF branch multiplies
    its embeddings element-wise; the MLP branch concatenates its embeddings and
    passes them through a Linear/ReLU/Dropout stack. Both branch outputs are
    concatenated and projected to a single score by ``predict_layer``.
    """

    def __init__(self, user_num, item_num, factor_num, layers, dropout):
        """Create the embeddings, the MLP tower and the output projection.

        Args:
            user_num: Number of rows in the user embedding tables.
            item_num: Number of rows in the item embedding tables.
            factor_num: Width of the MF embeddings; MLP embeddings are twice as wide.
            layers: Output sizes of the successive MLP layers (must be non-empty).
            dropout: Dropout probability applied after every MLP layer.
        """
        super().__init__()
        mlp_embed_dim = 2 * factor_num

        # Modules are created in a fixed order: with a seeded generator that
        # order determines the initial weights.
        self.user_embedding_mf = nn.Embedding(user_num, factor_num)
        self.item_embedding_mf = nn.Embedding(item_num, factor_num)
        self.user_embedding_mlp = nn.Embedding(user_num, mlp_embed_dim)
        self.item_embedding_mlp = nn.Embedding(item_num, mlp_embed_dim)

        # The tower input is the concatenated user and item MLP embeddings.
        widths = [2 * mlp_embed_dim] + list(layers)
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        self.MLP_layers = nn.Sequential(*tower)

        self.predict_layer = nn.Linear(factor_num + layers[-1], 1)

    def forward(self, user, item):
        """Return one raw score per (user, item) pair as a tensor of shape ``(N, 1)``."""
        mf_branch = self.user_embedding_mf(user) * self.item_embedding_mf(item)

        mlp_input = torch.cat(
            (self.user_embedding_mlp(user), self.item_embedding_mlp(item)), dim=-1
        )
        mlp_branch = self.MLP_layers(mlp_input)

        score = self.predict_layer(torch.cat((mf_branch, mlp_branch), dim=-1))
        # Always hand back a single-column 2D tensor, whatever the batch size.
        return score.view(-1, 1)

# Batch-wise ranking metrics
def metrics(model, test_loader, top_k, device):
    """Compute the mean hit ratio and NDCG over all batches of ``test_loader``.

    For every batch the model scores each (user, item) pair, the ``top_k``
    highest-scoring items of that batch form the recommendation list, and
    every item of the batch is then checked against that list.

    Args:
        model: Callable mapping ``(user, item)`` tensors to one score per pair.
        test_loader: Iterable of ``(user, item, label)`` batches.
        top_k: Maximum length of the per-batch recommendation list.
        device: Device the batch tensors are moved to before scoring.

    Returns:
        ``(HR, NDCG)`` as Python floats; ``(0.0, 0.0)`` when the loader yields
        no samples (previously NaN plus a NumPy RuntimeWarning).

    NOTE(review): the label of each sample is discarded and the list is built
    from the batch itself, so the result depends on batch size and composition
    rather than on how held-out positives rank — confirm that this is the
    intended evaluation protocol.
    """
    HR, NDCG = [], []

    with torch.no_grad():
        for user, item, _ in test_loader:
            user = user.to(device)
            item = item.to(device)

            # Flatten to 1D whatever the batch size, so a batch holding a
            # single sample needs no special handling.
            prediction = model(user, item).reshape(-1)
            if prediction.numel() == 0:
                continue

            # A final, smaller batch may hold fewer than top_k samples.
            k = min(top_k, prediction.size(0))
            _, indices = torch.topk(prediction, k)
            recommended_items = item[indices].tolist()
            actual_items = item.tolist()

            for actual_item in actual_items:
                HR.append(get_hit_ratio(recommended_items, actual_item))
                NDCG.append(get_ndcg(recommended_items, actual_item))

    if not HR:
        # Nothing was evaluated; avoid taking the mean of an empty list.
        return 0.0, 0.0
    return float(np.mean(HR)), float(np.mean(NDCG))

# Model training and evaluation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuMF(user_num, item_num, args['factor_num'], args['layers'], args['dropout']).to(device)
optimizer = optim.Adam(model.parameters(), lr=args['lr'])
# NOTE(review): BCEWithLogitsLoss expects targets in [0, 1] — confirm that the
# 'rating' column is binary (the sampled negatives are written with rating 0).
criterion = nn.BCEWithLogitsLoss()
writer = SummaryWriter()

best_hr, best_ndcg, best_epoch = 0, 0, 0
# Running batch counter: gives every logged training loss its own x position
# instead of stacking all batches of an epoch on the same step.
global_step = 0

try:
    for epoch in range(args['epochs']):
        model.train()
        start_time = time.time()
        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            # Reshape the labels to (N, 1) so they match the model output.
            label = label.to(device).view(-1, 1)

            optimizer.zero_grad()
            prediction = model(user, item)
            loss = criterion(prediction, label)
            loss.backward()
            optimizer.step()
            writer.add_scalar('loss/Train_loss', loss.item(), global_step)
            global_step += 1

        # Evaluate once per epoch with dropout disabled.
        model.eval()
        HR, NDCG = metrics(model, test_loader, args['top_k'], device)
        writer.add_scalar('Performance/HR@10', HR, epoch)
        writer.add_scalar('Performance/nDCG@10', NDCG, epoch)

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +  time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print("HR: {:.3f}\tNDCG: {:.3f}".format(HR, NDCG))

        # Keep only the checkpoint of the best epoch so far, judged by HR.
        if HR > best_hr:
            best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
            if args['out']:
                # makedirs also creates missing parents and tolerates an existing directory.
                os.makedirs(config['model_path'], exist_ok=True)
                # The table sizes are stored next to the weights so the model
                # can be rebuilt when the checkpoint is loaded.
                torch.save(
                    {
                        'user_num': user_num,
                        'item_num': item_num,
                        'model_state_dict': model.state_dict(),
                    },
                    os.path.join(config['model_path'], 'NeuMF_state_dict.pth'),
                )
finally:
    # Flush and close the event file even if training is interrupted.
    writer.close()

In [None]:
# Stage every change under the current directory for the next commit.
!git add .

In [None]:
# Commit the staged changes and push them to the main branch on origin.
!git commit -m 'update system'
!git push origin main

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir=runs