# 이분 그래프 기반 추천 시스템 테스트

In [184]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# 예시 유저 데이터
interactions_pos = pd.DataFrame(columns=['user_id', 'movie_id'])
interactions_neg = pd.DataFrame(columns=['user_id', 'movie_id'])

# 새로운 사용자 데이터
new_users_pos = [
    [4604, 4845, 5419, 300, 4165, 4788, 4548],  # 사용자 11
    [4604, 4845, 5419, 300, 4108, 3799, 5543],  # 사용자 12
    [4604, 4845, 5419, 300, 5502, 3802, 3201],  # 사용자 13
    [4604, 4845, 5419, 300, 4165, 4788, 3928],  # 사용자 14
    [4769, 5077, 5778, 5210, 6072, 3973, 4817],  # 사용자 15
    [4769, 5077, 5778, 5210, 6072, 3973, 4817],  # 사용자 16
    [4769, 5077, 5778, 5210, 6072, 3973, 4817],  # 사용자 17
    [4769, 5077, 5778, 5210, 6072, 3973, 4817],  # 사용자 18
    [3649, 1124, 6072, 3973, 4817, 6636, 6364],  # 사용자 19
    [3649, 1124, 6072, 3973, 4817, 6636, 6364],  # 사용자 20
    [3649, 1124, 6072, 3973, 4817, 6636, 6364],  # 사용자 21
    [3649, 1124, 6072, 3973, 4817, 6636, 6364]   # 사용자 22
]

new_users_neg = [
    [4763, 5701, 4477, 3095, 5103, 2724],  # 사용자 11
    [4763, 5701, 4477, 3095, 5103, 2724],  # 사용자 12
    [4763, 5701, 4477, 3095, 5103, 2724],  # 사용자 13
    [4763, 5701, 4477, 3095, 5103, 2724],  # 사용자 14
    [5502, 3802, 4788, 4165, 5737, 4815],  # 사용자 15
    [5502, 3802, 4788, 4165, 5737, 4815],  # 사용자 16
    [5502, 3802, 4788, 4165, 5737, 4815],  # 사용자 17
    [5502, 3802, 4788, 4165, 5737, 4815],  # 사용자 18
    [4604, 4845, 5419, 300, 5502, 3802],  # 사용자 19
    [4604, 4845, 5419, 300, 5502, 3802],  # 사용자 20
    [4604, 4845, 5419, 300, 5502, 3802],  # 사용자 21
    [4604, 4845, 5419, 300, 5502, 3802]   # 사용자 22
]

# 기존 사용자 데이터
mixed1_pos = [5502, 3802, 5737, 4815, 4548, 3973, 4817]
mixed1_neg = [5103, 2724, 5778, 4769, 4845, 4604]

mixed2_pos = [4788, 4165, 105, 3008, 5543, 6072, 3649]
mixed2_neg = [96, 4754, 5077, 5210, 300, 5100]

mixed3_pos = [4108, 3799, 4894, 5210, 3928, 4945, 6636]
mixed3_neg = [4248, 4716, 5737, 4769, 5419, 4604]

mixed4_pos = [5502, 3802, 5737, 4815, 4548, 3973, 4817]
mixed4_neg = [5103, 2724, 5778, 4769, 4845, 4604]

mixed5_pos = [4788, 4165, 105, 3008, 5543, 6072, 3649]
mixed5_neg = [96, 4754, 5077, 5210, 300, 5100]

mixed6_pos = [4108, 3799, 4894, 5210, 3928, 4945, 6636]
mixed6_neg = [4248, 4716, 5737, 4769, 5419, 4604]

mixed7_pos = [4788, 4165, 5737, 4815, 4548, 3973, 4817]
mixed7_neg = [96, 4754, 5778, 4769, 300, 4604]

mixed8_pos = [4108, 3799, 105, 3008, 5543, 6072, 3649]
mixed8_neg = [4248, 4716, 5077, 5210, 5419, 5100]

mixed9_pos = [4788, 4165, 4894, 5210, 3928, 4945, 6636]
mixed9_neg = [96, 4754, 5737, 4769, 300, 4604]

mixed10_pos = [5502, 3802, 5737, 4815, 4548, 3973, 4817]
mixed10_neg = [5103, 2724, 5778, 4769, 4845, 4604]

# 사용자 추가
additional_user_id = 0  # 새로운 유저 ID

# 기존 유저 데이터 추가
all_pos_samples = [
    mixed1_pos, mixed2_pos, mixed3_pos, mixed4_pos, mixed5_pos,
    mixed6_pos, mixed7_pos, mixed8_pos, mixed9_pos, mixed10_pos
]

all_neg_samples = [
    mixed1_neg, mixed2_neg, mixed3_neg, mixed4_neg, mixed5_neg,
    mixed6_neg, mixed7_neg, mixed8_neg, mixed9_neg, mixed10_neg
]

# 새로운 사용자 데이터와 기존 사용자 데이터 합치기
all_pos_samples.extend(new_users_pos)
all_neg_samples.extend(new_users_neg)

# 각 유저에 대한 긍정적 샘플 추가
for i, pos_samples in enumerate(all_pos_samples):
    user_id = additional_user_id + i
    for movie_id in pos_samples:
        interactions_pos = pd.concat([interactions_pos, pd.DataFrame({'user_id': [user_id], 'movie_id': [movie_id]})], ignore_index=True)

# 각 유저에 대한 부정적 샘플 추가
for i, neg_samples in enumerate(all_neg_samples):
    user_id = additional_user_id + i
    for movie_id in neg_samples:
        interactions_neg = pd.concat([interactions_neg, pd.DataFrame({'user_id': [user_id], 'movie_id': [movie_id]})], ignore_index=True)

users = pd.DataFrame(columns=['user_id'])
users['user_id'] = interactions_pos['user_id'].unique()

num_users = users.shape[0]

# 영화 features
movie_features = torch.from_numpy(np.load("final_features_0528.npy")).float()

num_movies = movie_features.size(0)
num_in_features = movie_features.size(1)
num_out_features = movie_features.size(1)

# user indices, movie indices 생성
user_id_to_index = {user_id: idx for idx, user_id in enumerate(users['user_id'])}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(range(num_movies))}

# 유저 - 영화 간 상호작용 edge index로 변환
user_indices = interactions_pos['user_id'].apply(lambda x: user_id_to_index[x])
movie_indices = interactions_pos['movie_id'].apply(lambda x: num_users + movie_id_to_index[x])  # 유저 수만큼 offset 추가

edge_index = torch.tensor([user_indices.values, movie_indices.values], dtype=torch.long)

# 유저 초기 임베딩 생성
user_embedding = torch.zeros(num_users, num_in_features)
# 좋아하는 영화의 평균으로 유저 임베딩 생성
for user_id in users['user_id']:
    user_idx = user_id_to_index[user_id]
    movie_idxs = interactions_pos[interactions_pos['user_id'] == user_id]['movie_id'].apply(lambda x: movie_id_to_index[x])
    user_movie_features = movie_features[movie_idxs.values].mean(dim=0)
    user_embedding[user_idx] = user_movie_features
# # 유저 초기 임베딩을 xavier 초기화
# nn.init.xavier_uniform_(user_embedding)  # initialize user features with xavier initialization
# 영화 feature와 유저 feature 합치기
x = torch.cat([user_embedding, movie_features], dim=0)

data = Data(x=x, edge_index=edge_index)

class GCNLinkPredictor(nn.Module):
    def __init__(self, num_in_features, num_out_features=128, num_users=num_users):
        super().__init__()
        self.conv1 = GCNConv(num_in_features, num_out_features)
        self.conv2 = GCNConv(num_out_features, num_out_features)
        self.num_users = num_users

    def forward(self, x, edge_index):
        user_embedding = x[:self.num_users]
        movie_features = x[self.num_users:].detach()
        
        x = torch.cat([user_embedding, movie_features], dim=0)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        
        x[self.num_users:] = movie_features
        return x

# 유저 - 영화 간 edge를 입력받아 유저-영화 간 관계 예측
class LinkPredictor(nn.Module):
    # 비선형으로 개선
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim * 2, input_dim * 2)
        self.fc2 = nn.Linear(input_dim * 2, input_dim)
        self.fc3 = nn.Linear(input_dim, input_dim)
        self.fc4 = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, z, edge_index):
        row, col = edge_index
        z_row = z[row]
        z_col = z[col]
        
        # 임베딩 연결
        z_concat = torch.cat([z_row, z_col], dim=1)
        
        # 비선형 활성화 함수
        x = F.relu(self.fc1(z_concat))
        
        # 정규화를 위한 드롭아웃
        x = self.dropout(x)

        x = F.relu(self.fc2(x))
        x = self.dropout(x)

        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        
        # 두 번째 선형 계층 및 시그모이드 활성화 적용
        return torch.sigmoid(self.fc4(x))

# 모델 초기화
gcn_model = GCNLinkPredictor(num_in_features, num_out_features, num_users)
link_predictor = LinkPredictor(num_out_features)

# Negative sampling: 다른 유저와 연결된 영화를 negative sample로 선정
def negative_sampling(interactions, num_users):
    user_indices = interactions['user_id'].apply(lambda x: user_id_to_index[x])
    movie_indices = interactions['movie_id'].apply(lambda x: num_users + x)  # 유저 수만큼 offset 추가

    neg_edge_index = torch.tensor([user_indices.values, movie_indices.values], dtype=torch.long)

    return neg_edge_index

neg_edge_index = negative_sampling(interactions_neg, num_users)


# positive, negative samples 합치기
train_edge_index = torch.cat([edge_index, neg_edge_index], dim=1)
train_labels = torch.cat([torch.ones(edge_index.size(1)), torch.zeros(neg_edge_index.size(1))])

# Train 함수 정의
def train(model, predictor, data, train_edge_index, train_labels, optimizer, epochs=200, patience=100):
    model.train()
    predictor.train()
    criterion = nn.BCELoss()
    best_loss = float('inf')

    for epoch in range(epochs):
        optimizer.zero_grad()
        node_embeddings = model(data.x, data.edge_index)
        scores = predictor(node_embeddings, train_edge_index)
        loss = criterion(scores.squeeze(), train_labels)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')
        
        # Early stopping
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

# Initialize optimizer
optimizer = torch.optim.Adam(list(gcn_model.parameters()) + list(link_predictor.parameters()), lr=0.005)

# 모델 학습
train(gcn_model, link_predictor, data, train_edge_index, train_labels, optimizer, epochs=300)

# 학습된 모델을 사용하여 positive, negative 간선의 점수 확인
def check_positive_negative_edge_scores(model, predictor, data, edge_index, neg_edge_index):
    model.eval()
    predictor.eval()
    with torch.no_grad():
        node_embeddings = model(data.x, data.edge_index)
        pos_scores = predictor(node_embeddings, edge_index)
        neg_scores = predictor(node_embeddings, neg_edge_index)
        print("Scores for positive edges:")
        print(pos_scores.squeeze())
        print("Scores for negative edges:")
        print(neg_scores.squeeze())

check_positive_negative_edge_scores(gcn_model, link_predictor, data, edge_index, neg_edge_index)

Epoch 0, Loss: 0.6935786008834839
Epoch 10, Loss: 0.3409143388271332
Epoch 20, Loss: 0.0490550771355629
Epoch 30, Loss: 0.055342018604278564
Epoch 40, Loss: 0.13991595804691315
Epoch 50, Loss: 0.09257462620735168
Epoch 60, Loss: 0.02048284187912941
Epoch 70, Loss: 0.01268954761326313
Epoch 80, Loss: 0.03280670940876007
Epoch 90, Loss: 0.015053675509989262
Epoch 100, Loss: 0.05631186068058014
Epoch 110, Loss: 0.009037458337843418
Epoch 120, Loss: 0.0011791532160714269
Epoch 130, Loss: 0.07013938575983047
Epoch 140, Loss: 0.0015428961487486959
Epoch 150, Loss: 0.016584260389208794
Epoch 160, Loss: 0.004368618596345186
Epoch 170, Loss: 0.00032446198747493327
Epoch 180, Loss: 0.021462123841047287
Epoch 190, Loss: 0.02822306752204895
Epoch 200, Loss: 0.005328413564711809
Epoch 210, Loss: 0.028836410492658615
Epoch 220, Loss: 0.04935828596353531
Epoch 230, Loss: 0.04468010738492012
Early stopping at epoch 233
Scores for positive edges:
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 

In [185]:
edge_index, neg_edge_index

(tensor([[   0,    0,    0,    0,    0,    0,    0,    1,    1,    1,    1,    1,
             1,    1,    2,    2,    2,    2,    2,    2,    2,    3,    3,    3,
             3,    3,    3,    3,    4,    4,    4,    4,    4,    4,    4,    5,
             5,    5,    5,    5,    5,    5,    6,    6,    6,    6,    6,    6,
             6,    7,    7,    7,    7,    7,    7,    7,    8,    8,    8,    8,
             8,    8,    8,    9,    9,    9,    9,    9,    9,    9,   10,   10,
            10,   10,   10,   10,   10,   11,   11,   11,   11,   11,   11,   11,
            12,   12,   12,   12,   12,   12,   12,   13,   13,   13,   13,   13,
            13,   13,   14,   14,   14,   14,   14,   14,   14,   15,   15,   15,
            15,   15,   15,   15,   16,   16,   16,   16,   16,   16,   16,   17,
            17,   17,   17,   17,   17,   17,   18,   18,   18,   18,   18,   18,
            18,   19,   19,   19,   19,   19,   19,   19,   20,   20,   20,   20,
            20, 

# Test

In [3]:
movie_metadata = pd.read_excel("cinemate_data_0511.xlsx")
final_metadata = pd.read_excel("final_metadata_with_overview_embeddings_ver5_0511.xlsx")

In [187]:
# Function to recommend top 5 movies for a given user
def recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=5):
    user_idx = torch.tensor([user_id_to_index[user_id]])
    movie_indices = torch.arange(num_users, num_users + num_movies)
    
    # Generate all possible user-movie pairs
    pairs = torch.stack([user_idx.expand(num_movies), movie_indices], dim=0)
    
    # Check existing edges to avoid recommending already liked movies
    existing_edges = edge_index[:, (edge_index[0] == user_idx)]
    existing_movies = existing_edges[1] - num_users
    
    # Remove existing edges from pairs
    # mask = torch.ones(num_movies, dtype=torch.bool)
    # mask[existing_movies] = False
    # pairs = pairs[:, mask]
    
    # Get scores for the user-movie pairs
    scores = link_predictor(node_embeddings, pairs)
    print(scores[scores > 0.99])
    
    # Get top N recommendations
    _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
    # print(top_indices)
    # print(scores[top_indices])
    top_movie_indices = pairs[1][top_indices] - num_users
    
    return top_movie_indices

# 모델 학습 후 각 유저에 대한 추천 수행
node_embeddings = gcn_model(data.x, data.edge_index)
for i in range(num_users):
    user_id = users['user_id'][i]
    num_recommendations = 10
    top_movies = recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=num_recommendations)
    print(f"Top {num_recommendations} recommended movies for user {user_id}: {top_movies}")
    for idx in top_movies:
        print(final_metadata.iloc[int(idx)].genres)
        # print(movie_metadata.iloc[int(idx)].overview)

# crime = [4226, 4, 4769] # 마스터, 범죄도시4, 기술자들
# sci_fi = [5459, 6076, 9519]  # 인 타임, 점퍼, 메트로폴리스
# horror = [2060, 2104, 4268]  # 컨저링, 컨저링3, 컨저링2
# comedy = [4887, 5839, 9506]  # 7번 방의 선물, 세 얼간이, City Lights

tensor([1.0000, 1.0000, 1.0000,  ..., 1.0000, 0.9999, 0.9937],
       grad_fn=<IndexBackward0>)
Top 10 recommended movies for user 0: tensor([ 23,   9, 124, 105,  25,  59, 137,  98,  49,  34])
Action,Adventure,Drama
Action,Adventure,Drama
Action,Drama,Thriller
Drama,Romance
Comedy,Romance,Sci-Fi
Action,Adventure,Drama
Documentary
Thriller
Adventure,Animation,Comedy
Action,Animation,Comedy
tensor([1.0000, 0.9946, 0.9995,  ..., 1.0000, 1.0000, 0.9957],
       grad_fn=<IndexBackward0>)
Top 10 recommended movies for user 1: tensor([158,  98, 256, 255,  23, 165, 294, 227, 154, 105])
Comedy
Thriller
Comedy,Romance
Drama,Mystery
Action,Adventure,Drama
Horror,Thriller
Action,Thriller
Adventure,Animation,Comedy
Biography,Drama
Drama,Romance
tensor([0.9986, 0.9995, 0.9975,  ..., 0.9999, 0.9998, 0.9915],
       grad_fn=<IndexBackward0>)
Top 10 recommended movies for user 2: tensor([105,  49, 303, 294, 227, 158, 336, 332, 256,  98])
Drama,Romance
Adventure,Animation,Comedy
Drama
Action,Thriller
Ad

In [None]:
movie_metadata = pd.read_excel("cinemate_data_0511.xlsx")
movie_metadata

In [None]:
final_metadata = pd.read_excel("final_metadata_with_overview_embeddings_ver5_0511.xlsx")
final_metadata

# 학습 데이터에 없던 사용자에 대한 추천 테스트

In [190]:
import numpy as np

# 유저 임베딩 생성 함수
def create_new_user_embedding(movie_features, interacted_movie_indices):
    new_user_embedding = movie_features[interacted_movie_indices].mean(dim=0)
    # print("new user embedding shape:", new_user_embedding.shape)
    # print("new user embedding:", new_user_embedding)
    return new_user_embedding

# # 새로운 유저에 대한 영화 추천
# def recommend_movies_for_new_user(node_embeddings, num_users = 1, num_movies = 9525, num_recommendations=5):
#     movie_indices = torch.arange(num_users, num_users + num_movies) # 영화 인덱스 생성 (유저 수만큼 offset)
    
#     # user-movie pairs 생성
#     user_movie_pairs = torch.stack([torch.zeros(num_movies, dtype=torch.long), movie_indices], dim=0)
#     print(user_movie_pairs)
    
#     # user-movie pairs의 score 계산
#     scores = link_predictor(node_embeddings, user_movie_pairs)
#     # print(scores[scores > 0.9])
    
#     # top N 추천
#     _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
#     top_movie_indices = user_movie_pairs[1][top_indices] - num_users # 유저 수만큼 offset 재조정
    
#     return top_movie_indices
def recommend_movies_for_new_user(link_predictor, node_embeddings, edge_index, num_users = 1, num_movies = 9525, num_recommendations=5):
    movie_indices = torch.arange(num_users, num_users + num_movies) # 영화 인덱스 생성 (유저 수만큼 offset)
    
    # user-movie pairs 생성
    user_movie_pairs = torch.stack([torch.zeros(num_movies, dtype=torch.long), movie_indices], dim=0)

    # 기존 유저가 본 영화 제외
    already_seen_movies = edge_index[1] - num_users
    mask = torch.ones(num_movies, dtype=torch.bool)
    mask[already_seen_movies] = False
    user_movie_pairs = user_movie_pairs[:, mask]
    
    # user-movie pairs의 score 계산
    scores = link_predictor(node_embeddings, user_movie_pairs)
    
    # top N 추천
    _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
    top_movie_indices = user_movie_pairs[1][top_indices] - num_users # 유저 수만큼 offset 재조정
    
    return top_movie_indices


# 영화 features
movie_features = torch.from_numpy(np.load("final_features_0528.npy")).float()

new_user_interacted_movies = [3201, 5229, 6214]
new_user_embedding = create_new_user_embedding(movie_features, new_user_interacted_movies)

new_x = torch.cat([new_user_embedding.view(1, -1), movie_features], dim=0)
num_users = 1

# 유저 - 영화 간 상호작용 edge index로 변환
user_indices = [i for i in range(num_users)]
movie_indices = [i + num_users for i in new_user_interacted_movies]

edge_index = torch.tensor([user_indices * len(movie_indices), movie_indices], dtype=torch.long)

node_embeddings = gcn_model(new_x, edge_index)

# 새로운 유저에 대한 추천 수행
num_recommendations = 20
top_movies_for_new_user = recommend_movies_for_new_user(
    link_predictor=link_predictor,
    node_embeddings=node_embeddings, 
    edge_index=edge_index,
    num_users=num_users,
    num_movies=movie_features.shape[0],
    num_recommendations=num_recommendations)
print(f"Top {num_recommendations} recommended movies for the new user: {top_movies_for_new_user}")
for idx in top_movies_for_new_user:
    print(final_metadata.iloc[int(idx)].genres)
    # print(movie_metadata.iloc[int(idx)].overview)

Top 20 recommended movies for the new user: tensor([375, 256, 434, 227, 105, 387, 344, 679,  98, 435,  14, 154, 303, 158,
        613, 680, 658, 468, 294,  23])
Documentary
Comedy,Romance
Drama
Adventure,Animation,Comedy
Drama,Romance
Comedy,Drama
Action,Adventure
Drama
Thriller
Drama
Comedy,Drama,Sport
Biography,Drama
Drama
Comedy
Action
Drama,Horror,Romance
Drama
Drama,History
Action,Thriller
Action,Adventure,Drama


In [191]:
# save models
torch.save(gcn_model.state_dict(), 'gcn_model.pth')
torch.save(link_predictor.state_dict(), 'link_predictor.pth')

In [192]:
# load saved models
gcn_model = GCNLinkPredictor(num_in_features, num_out_features, num_users)
link_predictor = LinkPredictor(num_out_features)
gcn_model.load_state_dict(torch.load('gcn_model.pth'))
link_predictor.load_state_dict(torch.load('link_predictor.pth'))

# 모델을 evaluation 모드로 변경
gcn_model.eval()
link_predictor.eval()

LinkPredictor(
  (fc1): Linear(in_features=828, out_features=828, bias=True)
  (fc2): Linear(in_features=828, out_features=414, bias=True)
  (fc3): Linear(in_features=414, out_features=414, bias=True)
  (fc4): Linear(in_features=414, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# 장르별로 데이터셋을 구성하여 재학습

In [9]:
import random

# 영화 인덱스
genres = {
    "Action_Crime": [5502, 3802, 4788, 4165, 4108, 3799, 3843, 3830, 5414],
    "Action_History": [5103, 2724, 96, 4754, 4248, 4716, 4588, 1248],
    "Action_Disaster": [5737, 4815, 105, 3008, 4894],
    "Sports": [5778, 4769, 5077, 5210],
    "Sci_Fi": [4548, 4387, 5543, 3201, 3928],
    "Horror": [4845, 4604, 300, 5100, 5419],
    "Comedy": [4763, 5701, 4477, 3095],
    "Animation": [3649, 1124],
    "Musical": [3973, 6072, 4945, 3793, 6364],
    "Romance": [4817, 4732, 6636, 4486]
}

all_movies = [movie for genre_movies in genres.values() for movie in genre_movies]

# 각 장르만 좋아하는 사용자 1명 생성
users_pos = []
users_neg = []

for genre, movies in genres.items():
    count = 4
    if genre == "Animation":
        count = 2
    pos_samples = random.sample(movies, count)
    neg_samples = random.sample([m for m in all_movies if m not in movies], count)
    users_pos.append(pos_samples)
    users_neg.append(neg_samples)

# Musical, Romance, Animation을 좋아하고 다른 건 싫어하는 사용자 2명 생성
musical_romance_animation_pos = genres["Musical"][:2] + genres["Romance"][:1] + genres["Animation"][:1]
musical_romance_animation_neg = random.sample([m for m in all_movies if m not in musical_romance_animation_pos], 4)
for _ in range(2):
    users_pos.append(musical_romance_animation_pos)
    users_neg.append(musical_romance_animation_neg)

# Comedy, Sports를 좋아하고 다른 건 싫어하는 사용자 2명 생성
comedy_sports_pos = genres["Comedy"][:2] + genres["Sports"][:2]
comedy_sports_neg = random.sample([m for m in all_movies if m not in comedy_sports_pos], 4)
for _ in range(2):
    users_pos.append(comedy_sports_pos)
    users_neg.append(comedy_sports_neg)

# 결과 출력
new_users_pos = []
new_users_neg = []
for i, (pos, neg) in enumerate(zip(users_pos, users_neg), start=1):
    print(f"사용자 {i}:")
    print(f"  좋아하는 영화 인덱스: {pos}")
    print(f"  싫어하는 영화 인덱스: {neg}")
    print()
    new_users_pos.append(pos)
    new_users_neg.append(neg)


사용자 1:
  좋아하는 영화 인덱스: [3799, 4108, 3830, 3843]
  싫어하는 영화 인덱스: [4604, 5100, 5778, 5419]

사용자 2:
  좋아하는 영화 인덱스: [4588, 2724, 4248, 5103]
  싫어하는 영화 인덱스: [4763, 5737, 4769, 4548]

사용자 3:
  좋아하는 영화 인덱스: [3008, 105, 4894, 4815]
  싫어하는 영화 인덱스: [4604, 4817, 4754, 3843]

사용자 4:
  좋아하는 영화 인덱스: [4769, 5077, 5210, 5778]
  싫어하는 영화 인덱스: [3799, 5543, 5100, 3843]

사용자 5:
  좋아하는 영화 인덱스: [4548, 3928, 4387, 5543]
  싫어하는 영화 인덱스: [3649, 5210, 4248, 105]

사용자 6:
  좋아하는 영화 인덱스: [4845, 5100, 300, 4604]
  싫어하는 영화 인덱스: [4486, 3830, 4248, 4945]

사용자 7:
  좋아하는 영화 인덱스: [3095, 4763, 4477, 5701]
  싫어하는 영화 인덱스: [5419, 3973, 4769, 5778]

사용자 8:
  좋아하는 영화 인덱스: [1124, 3649]
  싫어하는 영화 인덱스: [4548, 5701]

사용자 9:
  좋아하는 영화 인덱스: [3973, 3793, 6072, 4945]
  싫어하는 영화 인덱스: [5077, 4769, 3928, 4486]

사용자 10:
  좋아하는 영화 인덱스: [6636, 4732, 4486, 4817]
  싫어하는 영화 인덱스: [5077, 4548, 5502, 4945]

사용자 11:
  좋아하는 영화 인덱스: [3973, 6072, 4817, 3649]
  싫어하는 영화 인덱스: [4604, 4945, 4477, 5103]

사용자 12:
  좋아하는 영화 인덱스: [3973, 6072, 4817, 3649]
  싫어하는 영화

In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# 예시 유저 데이터
interactions_pos = pd.DataFrame(columns=['user_id', 'movie_id'])
interactions_neg = pd.DataFrame(columns=['user_id', 'movie_id'])

# 새로운 사용자 데이터와 기존 사용자 데이터 합치기
all_pos_samples = new_users_pos
all_neg_samples = new_users_neg

# 각 유저에 대한 긍정적 샘플 추가
for i, pos_samples in enumerate(all_pos_samples):
    user_id = i
    for movie_id in pos_samples:
        interactions_pos = pd.concat([interactions_pos, pd.DataFrame({'user_id': [user_id], 'movie_id': [movie_id]})], ignore_index=True)

# 각 유저에 대한 부정적 샘플 추가
for i, neg_samples in enumerate(all_neg_samples):
    user_id = i
    for movie_id in neg_samples:
        interactions_neg = pd.concat([interactions_neg, pd.DataFrame({'user_id': [user_id], 'movie_id': [movie_id]})], ignore_index=True)

users = pd.DataFrame(columns=['user_id'])
users['user_id'] = interactions_pos['user_id'].unique()

num_users = users.shape[0]

# 영화 features
movie_features = torch.from_numpy(np.load("final_features_0528.npy")).float()

num_movies = movie_features.size(0)
num_in_features = movie_features.size(1)
num_out_features = movie_features.size(1)

# user indices, movie indices 생성
user_id_to_index = {user_id: idx for idx, user_id in enumerate(users['user_id'])}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(range(num_movies))}

# 유저 - 영화 간 상호작용 edge index로 변환
user_indices = interactions_pos['user_id'].apply(lambda x: user_id_to_index[x])
movie_indices = interactions_pos['movie_id'].apply(lambda x: num_users + movie_id_to_index[x])  # 유저 수만큼 offset 추가

edge_index = torch.tensor([user_indices.values, movie_indices.values], dtype=torch.long)

# 유저 초기 임베딩 생성
user_embedding = torch.zeros(num_users, num_in_features)
# 좋아하는 영화의 평균으로 유저 임베딩 생성
for user_id in users['user_id']:
    user_idx = user_id_to_index[user_id]
    movie_idxs = interactions_pos[interactions_pos['user_id'] == user_id]['movie_id'].apply(lambda x: movie_id_to_index[x])
    user_movie_features = movie_features[movie_idxs.values].mean(dim=0)
    user_embedding[user_idx] = user_movie_features

# 영화 feature와 유저 feature 합치기
x = torch.cat([user_embedding, movie_features], dim=0)

data = Data(x=x, edge_index=edge_index)

class GCNLinkPredictor(nn.Module):
    def __init__(self, num_in_features, num_out_features=128, num_users=num_users):
        super().__init__()
        self.conv1 = GCNConv(num_in_features, num_out_features)
        self.conv2 = GCNConv(num_out_features, num_out_features)
        self.num_users = num_users

    def forward(self, x, edge_index):
        user_embedding = x[:self.num_users]
        movie_features = x[self.num_users:].detach()
        
        x = torch.cat([user_embedding, movie_features], dim=0)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        
        x[self.num_users:] = movie_features
        return x

# 유저 - 영화 간 edge를 입력받아 유저-영화 간 관계 예측
class LinkPredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim * 2, input_dim * 2)
        self.fc2 = nn.Linear(input_dim * 2, input_dim)
        self.fc3 = nn.Linear(input_dim, 1)
        # self.fc3 = nn.Linear(input_dim, input_dim)
        # self.fc4 = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, z, edge_index):
        row, col = edge_index
        z_row = z[row]
        z_col = z[col]
        
        # 임베딩 연결
        z_concat = torch.cat([z_row, z_col], dim=1)
        
        # 비선형 활성화 함수
        x = F.relu(self.fc1(z_concat))
        
        # 정규화를 위한 드롭아웃
        x = self.dropout(x)

        x = F.relu(self.fc2(x))
        x = self.dropout(x)

        # x = F.relu(self.fc3(x))
        # x = self.dropout(x)
        
        # 두 번째 선형 계층 및 시그모이드 활성화 적용
        # return torch.sigmoid(self.fc4(x))
        return torch.sigmoid(self.fc3(x))

# 모델 초기화
gcn_model = GCNLinkPredictor(num_in_features, num_out_features, num_users)
link_predictor = LinkPredictor(num_out_features)

# Negative sampling: 다른 유저와 연결된 영화를 negative sample로 선정
def negative_sampling(interactions, num_users):
    user_indices = interactions['user_id'].apply(lambda x: user_id_to_index[x])
    movie_indices = interactions['movie_id'].apply(lambda x: num_users + x)  # 유저 수만큼 offset 추가

    neg_edge_index = torch.tensor([user_indices.values, movie_indices.values], dtype=torch.long)

    return neg_edge_index

neg_edge_index = negative_sampling(interactions_neg, num_users)

# positive, negative samples 합치기
train_edge_index = torch.cat([edge_index, neg_edge_index], dim=1)
train_labels = torch.cat([torch.ones(edge_index.size(1)), torch.zeros(neg_edge_index.size(1))])

# Train 함수 정의
def train(model, predictor, data, train_edge_index, train_labels, optimizer, epochs=200, patience=100):
    model.train()
    predictor.train()
    criterion = nn.BCELoss()
    best_loss = float('inf')

    for epoch in range(epochs):
        optimizer.zero_grad()
        node_embeddings = model(data.x, data.edge_index)
        scores = predictor(node_embeddings, train_edge_index)
        loss = criterion(scores.squeeze(), train_labels)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')
        
        # Early stopping
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

# Initialize optimizer
optimizer = torch.optim.Adam(list(gcn_model.parameters()) + list(link_predictor.parameters()), lr=0.005)

# 모델 학습
train(gcn_model, link_predictor, data, train_edge_index, train_labels, optimizer, epochs=300)

# 학습된 모델을 사용하여 positive, negative 간선의 점수 확인
def check_positive_negative_edge_scores(model, predictor, data, edge_index, neg_edge_index):
    model.eval()
    predictor.eval()
    with torch.no_grad():
        node_embeddings = model(data.x, data.edge_index)
        pos_scores = predictor(node_embeddings, edge_index)
        neg_scores = predictor(node_embeddings, neg_edge_index)
        print("Scores for positive edges:")
        print(pos_scores.squeeze())
        print("Scores for negative edges:")
        print(neg_scores.squeeze())

check_positive_negative_edge_scores(gcn_model, link_predictor, data, edge_index, neg_edge_index)


Epoch 0, Loss: 0.6968079805374146
Epoch 10, Loss: 0.5069767832756042
Epoch 20, Loss: 0.24979525804519653
Epoch 30, Loss: 0.19425076246261597
Epoch 40, Loss: 0.17261365056037903
Epoch 50, Loss: 0.09868376702070236
Epoch 60, Loss: 0.1127716451883316
Epoch 70, Loss: 0.05101766064763069
Epoch 80, Loss: 0.08523685485124588
Epoch 90, Loss: 0.06455976516008377
Epoch 100, Loss: 0.23924577236175537
Epoch 110, Loss: 0.1481323093175888
Epoch 120, Loss: 0.11476656794548035
Epoch 130, Loss: 0.06492843478918076
Epoch 140, Loss: 0.01211571879684925
Epoch 150, Loss: 0.03131544217467308
Epoch 160, Loss: 0.015558700077235699
Epoch 170, Loss: 0.10059851408004761
Epoch 180, Loss: 0.015073816291987896
Epoch 190, Loss: 0.031553998589515686
Epoch 200, Loss: 0.022982342168688774
Epoch 210, Loss: 0.1358097940683365
Epoch 220, Loss: 0.050412073731422424
Epoch 230, Loss: 0.04954433813691139
Epoch 240, Loss: 0.08767319470643997
Early stopping at epoch 249
Scores for positive edges:
tensor([0.9998, 1.0000, 1.0000,

In [5]:
movie_metadata = pd.read_excel("cinemate_data_only_eng_kor_0528.xlsx")
final_metadata = pd.read_excel("final_metadata_with_overview_embeddings_only_eng_kor_0528.xlsx")

In [19]:
# Function to recommend top 5 movies for a given user
def recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=5):
    user_idx = torch.tensor([user_id_to_index[user_id]])
    movie_indices = torch.arange(num_users, num_users + num_movies)
    
    # Generate all possible user-movie pairs
    pairs = torch.stack([user_idx.expand(num_movies), movie_indices], dim=0)
    
    # Check existing edges to avoid recommending already liked movies
    existing_edges = edge_index[:, (edge_index[0] == user_idx)]
    existing_movies = existing_edges[1] - num_users
    
    # Remove existing edges from pairs
    # mask = torch.ones(num_movies, dtype=torch.bool)
    # mask[existing_movies] = False
    # pairs = pairs[:, mask]
    
    # Get scores for the user-movie pairs
    scores = link_predictor(node_embeddings, pairs)
    # print(scores[scores > 0.99])
    
    # Get top N recommendations
    _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
    # print(top_indices)
    # print(scores[top_indices])
    top_movie_indices = pairs[1][top_indices] - num_users
    
    return top_movie_indices

# 모델 학습 후 각 유저에 대한 추천 수행
node_embeddings = gcn_model(data.x, data.edge_index)
for i in range(num_users):
    user_id = users['user_id'][i]
    num_recommendations = 10
    top_movies = recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=num_recommendations)
    print(f"Top {num_recommendations} recommended movies for user {user_id}: {top_movies}")
    for idx in top_movies:
        print(final_metadata.iloc[int(idx)].genres)
        # print(movie_metadata.iloc[int(idx)].overview)

# crime = [4226, 4, 4769] # 마스터, 범죄도시4, 기술자들
# sci_fi = [5459, 6076, 9519]  # 인 타임, 점퍼, 메트로폴리스
# horror = [2060, 2104, 4268]  # 컨저링, 컨저링3, 컨저링2
# comedy = [4887, 5839, 9506]  # 7번 방의 선물, 세 얼간이, City Lights
    # [5502, 3802, 4788, 4165],  # Action / Crime
    # [5103, 2724, 96, 4754],  # Action / History
    # [5737, 4815, 105, 3008],  # Action / Disaster
    # [5778, 4769, 5077, 5210],  # Sports
    # [4548, 4387, 5543, 3201],  # Sci-Fi
    # [4845, 4604, 300, 5100],  # Horror
    # [4763, 5701, 4477, 3095],  # Comedy
    # [3649, 1124],  # Animation
    # [3973, 6072, 4945, 3793],  # Musical
    # [4817, 4732, 6636, 4486],  # Romance
    # [3973, 6072, 4945, 4817],  # Musical, Romance, Animation 1
    # [3973, 6072, 4945, 4817],  # Musical, Romance, Animation 2
    # [4763, 5778, 4769, 5077],  # Comedy, Sports 1
    # [4763, 5778, 4769, 5077]   # Comedy, Sports 2

Top 10 recommended movies for user 0: tensor([6067, 3095, 3830, 4749, 8227, 5738, 5572,  994, 7777, 3486])
Action,Comedy,Crime
Action,Comedy,Crime
Action,Crime
Action,Crime,Drama
Action,Comedy,Crime
Action,Biography,Crime
Action,Crime,Drama
Action,Crime
Action,Comedy,Crime
Action,Crime,Drama
Top 10 recommended movies for user 1: tensor([  96,  105, 3027, 1782, 1360,  710, 3838, 3579, 3344, 2724])
Action,Drama,History
Action,Adventure,Drama
Action,Drama,War
Action,Drama,Thriller
Action,Drama,History
Action,Crime,Thriller
Action,Drama,Thriller
Action,Drama,History
Drama,Thriller
Drama,History,Thriller
Top 10 recommended movies for user 2: tensor([3008, 4815, 7370, 9155, 8064, 3680, 4849,  137, 5153, 4895])
Action,Adventure,Sci-Fi
Action,Drama,Sci-Fi
Action,Adventure,Sci-Fi
Action,Horror,Sci-Fi
Action,Adventure,Sci-Fi
Action,Sci-Fi,Thriller
Action,Adventure,Sci-Fi
Action,Horror,Sci-Fi
Action,Animation,Sci-Fi
Action,Adventure,Sci-Fi
Top 10 recommended movies for user 3: tensor([6072, 5778,

In [23]:
# save models
torch.save(gcn_model.state_dict(), 'gcn_model_0601_light.pth')
torch.save(link_predictor.state_dict(), 'link_predictor_0601_light.pth')

In [24]:
# load saved models
gcn_model = GCNLinkPredictor(num_in_features, num_out_features, num_users)
link_predictor = LinkPredictor(num_out_features)
gcn_model.load_state_dict(torch.load('gcn_model_0601_light.pth'))
link_predictor.load_state_dict(torch.load('link_predictor_0601_light.pth'))

# 모델을 evaluation 모드로 변경
gcn_model.eval()
link_predictor.eval()

LinkPredictor(
  (fc1): Linear(in_features=828, out_features=828, bias=True)
  (fc2): Linear(in_features=828, out_features=414, bias=True)
  (fc3): Linear(in_features=414, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [22]:
import numpy as np

# 유저 임베딩 생성 함수
def create_new_user_embedding(movie_features, interacted_movie_indices):
    new_user_embedding = movie_features[interacted_movie_indices].mean(dim=0)
    return new_user_embedding

def recommend_movies_for_new_user(link_predictor, node_embeddings, edge_index, num_users = 1, num_movies = 9525, num_recommendations=5):
    movie_indices = torch.arange(num_users, num_users + num_movies) # 영화 인덱스 생성 (유저 수만큼 offset)
    
    # user-movie pairs 생성
    user_movie_pairs = torch.stack([torch.zeros(num_movies, dtype=torch.long), movie_indices], dim=0)

    # 기존 유저가 본 영화 제외
    already_seen_movies = edge_index[1] - num_users
    mask = torch.ones(num_movies, dtype=torch.bool)
    mask[already_seen_movies] = False
    user_movie_pairs = user_movie_pairs[:, mask]
    
    # user-movie pairs의 score 계산
    scores = link_predictor(node_embeddings, user_movie_pairs)
    
    # top N 추천
    _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
    top_movie_indices = user_movie_pairs[1][top_indices] - num_users # 유저 수만큼 offset 재조정
    
    return top_movie_indices


# 영화 features
movie_features = torch.from_numpy(np.load("final_features_0528.npy")).float()

new_user_interacted_movies = [4817]
new_user_embedding = create_new_user_embedding(movie_features, new_user_interacted_movies)

new_x = torch.cat([new_user_embedding.view(1, -1), movie_features], dim=0)
num_users = 1

# 유저 - 영화 간 상호작용 edge index로 변환
user_indices = [i for i in range(num_users)]
movie_indices = [i + num_users for i in new_user_interacted_movies]

edge_index = torch.tensor([user_indices * len(movie_indices), movie_indices], dtype=torch.long)

node_embeddings = gcn_model(new_x, edge_index)

# 새로운 유저에 대한 추천 수행
num_recommendations = 20
top_movies_for_new_user = recommend_movies_for_new_user(
    link_predictor=link_predictor,
    node_embeddings=node_embeddings, 
    edge_index=edge_index,
    num_users=num_users,
    num_movies=movie_features.shape[0],
    num_recommendations=num_recommendations)
print(f"Top {num_recommendations} recommended movies for the new user: {top_movies_for_new_user}")
for idx in top_movies_for_new_user:
    print(final_metadata.iloc[int(idx)].genres)
    # print(movie_metadata.iloc[int(idx)].overview)

Top 20 recommended movies for the new user: tensor([3649, 1768, 5279, 8052, 2428, 6683, 1864, 1883, 4641, 6861, 4486, 5882,
        7019, 6718, 6977, 8638, 1124, 3917, 6012, 7071])
Adventure,Animation,Drama,Family
Adventure,Fantasy,Romance
Adventure,Family,Fantasy
Comedy,Fantasy,Romance
Comedy,Family,Fantasy
Adventure,Drama,Fantasy
Comedy,Fantasy,Romance
Adventure,Animation,Comedy
Drama,Fantasy,Romance
Adventure,Family,Fantasy
Drama,Fantasy,Romance
Drama,Fantasy,Romance
Comedy,Family,Fantasy
Adventure,Family,Fantasy
Adventure,Family,Fantasy
Adventure,Family,Romance
Adventure,Animation,Comedy
Adventure,Animation,Fantasy
Adventure,Comedy,Fantasy
Comedy,Fantasy,Romance
