# 이분 그래프 기반 추천 시스템 테스트

In [113]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv
from torch_geometric.data import Data

# Example interaction data: four users, each of whom likes three movies of a single genre.
interactions = pd.DataFrame(columns=['user_id', 'movie_id'])
crime = [4226, 4, 4769]  # Master, The Roundup: Punishment, The Con Artists
sci_fi = [5459, 6076, 9519]  # In Time, Jumper, Metropolis
horror = [2060, 2104, 4268]  # The Conjuring, The Conjuring 3, The Conjuring 2
comedy = [4887, 5839, 9506]  # Miracle in Cell No. 7, 3 Idiots, City Lights
interactions['user_id'] = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
interactions['movie_id'] = crime + sci_fi + horror + comedy
users = pd.DataFrame(columns=['user_id'])
users['user_id'] = interactions.user_id.unique()

num_users = users.shape[0]

# Movie features: one row per movie, loaded from disk.
movie_features = torch.from_numpy(np.load("final_features_without_directors_0518.npy")).float()

num_movies = movie_features.size(0)
num_in_features = movie_features.size(1)
# The output width is kept equal to the input width because GCNLinkPredictor.forward
# writes the raw movie features back into the movie rows of its output.
num_out_features = movie_features.size(1)

# Index lookups. Users are numbered in order of first appearance; a movie id is
# mapped to itself, i.e. it is used directly as a row position in movie_features.
user_id_to_index = {user_id: idx for idx, user_id in enumerate(users['user_id'])}
movie_id_to_index = {movie_id: movie_id for movie_id in range(num_movies)}

# Convert the user-movie interactions into an edge index.
user_indices = interactions['user_id'].apply(lambda x: user_id_to_index[x])
# Movie nodes come after the user nodes, so shift them by the number of users.
movie_indices = interactions['movie_id'].apply(lambda x: num_users + movie_id_to_index[x])

# Stack in NumPy first: building a tensor from a Python list of ndarrays goes
# through a slow element-wise path and triggers a PyTorch warning.
edge_index = torch.from_numpy(
    np.vstack([user_indices.to_numpy(), movie_indices.to_numpy()]).astype(np.int64)
)

# Initial user embeddings: the mean feature vector of the movies each user likes.
# (Alternative that was tried: nn.init.xavier_uniform_(user_embedding).)
user_embedding = torch.zeros(num_users, num_in_features)
for user_id in users['user_id']:
    user_idx = user_id_to_index[user_id]
    movie_idxs = interactions[interactions['user_id'] == user_id]['movie_id'].apply(lambda x: movie_id_to_index[x])
    user_movie_features = movie_features[movie_idxs.values].mean(dim=0)
    user_embedding[user_idx] = user_movie_features

# Node feature matrix: user rows first, then movie rows.
x = torch.cat([user_embedding, movie_features], dim=0)

data = Data(x=x, edge_index=edge_index)

class GCNLinkPredictor(nn.Module):
    """Two-layer GCN encoder over a joint user/movie node feature matrix.

    The node feature matrix is expected to hold the user rows first, followed
    by the movie rows. Only the user rows of the output come from the GCN; the
    movie rows are replaced by the (detached) input movie features, so the
    output width must equal the input width.

    Args:
        num_in_features: Width of the input node features.
        num_out_features: Width of both GCN layers' outputs.
        num_users: Number of leading rows of ``x`` that are users. The default
            is the module-level ``num_users`` value at class-definition time.

    NOTE(review): GCNConv presumably propagates from ``edge_index[0]`` to
    ``edge_index[1]``; with user->movie edges only, user rows would receive
    just their own self-loop contribution. Confirm whether the edges should
    be made bidirectional.
    """

    def __init__(self, num_in_features, num_out_features=128, num_users=num_users):
        super().__init__()
        self.conv1 = GCNConv(num_in_features, num_out_features)
        self.conv2 = GCNConv(num_out_features, num_out_features)
        self.num_users = num_users

    def forward(self, x, edge_index, num_users=None):
        """Encode the graph and return user embeddings stacked on raw movie features.

        Args:
            x: Node features, user rows first and movie rows after.
            edge_index: Edge index tensor passed to both GCN layers.
            num_users: Number of user rows in ``x``. Defaults to the value given
                at construction; pass it explicitly when running on a graph
                with a different number of users (e.g. a single unseen user).

        Returns:
            Tensor with the same number of rows as ``x``: GCN outputs for the
            user rows, followed by the unchanged movie feature rows.
        """
        if num_users is None:
            num_users = self.num_users

        user_embedding = x[:num_users]
        # Movie features are fixed inputs: no gradient flows back into them.
        movie_features = x[num_users:].detach()

        h = torch.cat([user_embedding, movie_features], dim=0)
        h = self.conv1(h, edge_index)
        h = F.elu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.conv2(h, edge_index)
        h = F.elu(h)
        h = F.dropout(h, p=0.5, training=self.training)

        # Build the result with cat instead of overwriting rows of `h` in place,
        # so no tensor that autograd may still need is mutated.
        return torch.cat([h[:num_users], movie_features], dim=0)

# Scores user-movie edges: given node embeddings and an edge list, predict
# how likely each (user, movie) pair is to be linked.
class LinkPredictor(nn.Module):
    """Two-layer MLP that scores an edge from its two endpoint embeddings.

    The endpoint embeddings are concatenated, passed through a hidden layer
    with ReLU and dropout, and squashed to a value in [0, 1] with a sigmoid.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden layer consumes the concatenation of both endpoint embeddings.
        self.fc1 = nn.Linear(input_dim * 2, input_dim)
        self.fc2 = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, z, edge_index):
        """Return one score per column of ``edge_index``, shaped ``(num_edges, 1)``."""
        src, dst = edge_index

        # Concatenate the embeddings of both endpoints of every edge.
        pair_features = torch.cat((z[src], z[dst]), dim=1)

        # Non-linear hidden layer, regularised with dropout.
        hidden = self.dropout(torch.relu(self.fc1(pair_features)))

        # Final linear layer followed by a sigmoid.
        logits = self.fc2(hidden)
        return logits.sigmoid()

# Instantiate the GCN encoder and the edge-scoring head
gcn_model = GCNLinkPredictor(
    num_in_features=num_in_features,
    num_out_features=num_out_features,
    num_users=num_users,
)
link_predictor = LinkPredictor(input_dim=num_out_features)

# Negative sampling: movies liked only by *other* users serve as a user's negatives.
def negative_sampling(interactions, num_users, num_neg_samples,
                      user_index_map=None, movie_index_map=None):
    """Sample negative (user, movie) edges for every user in ``interactions``.

    For each user the candidates are the movies that appear in ``interactions``
    but were not interacted with by that user. Up to ``num_neg_samples`` of
    them are drawn without replacement via ``np.random.choice``; when there are
    not more candidates than requested, all of them are used.

    Args:
        interactions: DataFrame with ``user_id`` and ``movie_id`` columns.
        num_users: Offset added to movie indices to obtain movie node indices.
        num_neg_samples: Maximum number of negative edges per user.
        user_index_map: Mapping from user id to user node index. Defaults to
            the module-level ``user_id_to_index``.
        movie_index_map: Mapping from movie id to movie index. Defaults to the
            module-level ``movie_id_to_index``.

    Returns:
        Long tensor of shape ``(2, num_negative_edges)`` holding user node
        indices in row 0 and movie node indices in row 1. The shape is
        ``(2, 0)`` when no negative edge can be formed.
    """
    if user_index_map is None:
        user_index_map = user_id_to_index
    if movie_index_map is None:
        movie_index_map = movie_id_to_index

    # Every movie a user has not seen necessarily belongs to some other user,
    # so "all movies minus own movies" equals "other users' movies minus own".
    all_movies = set(interactions['movie_id'].values)

    neg_edges = []
    for user_id, liked in interactions.groupby('user_id', sort=False)['movie_id']:
        # Sorted so that the draw depends only on the NumPy random state.
        candidates = sorted(all_movies - set(liked.values))
        if len(candidates) > num_neg_samples:
            candidates = np.random.choice(candidates, num_neg_samples, replace=False)

        user_idx = int(user_index_map[user_id])
        neg_edges.extend(
            (user_idx, int(num_users + movie_index_map[movie_id]))
            for movie_id in candidates
        )

    if not neg_edges:
        # Keep the (2, E) layout so the result can still be concatenated
        # with a positive edge index along dim=1.
        return torch.empty((2, 0), dtype=torch.long)
    return torch.tensor(neg_edges, dtype=torch.long).t()

# Number of negative samples to draw for each user
num_neg_samples = 3
neg_edge_index = negative_sampling(interactions, num_users, num_neg_samples)

# Training set: positive edges (label 1) followed by negative edges (label 0)
train_edge_index = torch.cat((edge_index, neg_edge_index), dim=1)
train_labels = torch.cat(
    (torch.ones(edge_index.shape[1]), torch.zeros(neg_edge_index.shape[1])),
    dim=0,
)

# Training loop
def train(model, predictor, data, train_edge_index, train_labels, optimizer, epochs=200, patience=20):
    """Jointly train the encoder and the edge scorer with binary cross-entropy.

    Each epoch runs one full-batch step: encode ``data.x`` / ``data.edge_index``
    with ``model``, score ``train_edge_index`` with ``predictor`` and minimise
    ``nn.BCELoss`` against ``train_labels``. The loss is printed every 10 epochs.

    Early stopping is driven by the *training* loss: training ends once the
    loss has failed to improve on its best value for ``patience`` consecutive
    epochs.

    Args:
        model: Module called as ``model(data.x, data.edge_index)``.
        predictor: Module called as ``predictor(node_embeddings, train_edge_index)``;
            its outputs must lie in [0, 1] as required by ``nn.BCELoss``.
        data: Object exposing ``x`` and ``edge_index`` attributes.
        train_edge_index: Edges to score, positives and negatives together.
        train_labels: Float target per edge, shape ``(num_edges,)``.
        optimizer: Optimizer holding the parameters to update.
        epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        The lowest loss observed, or ``float('inf')`` if no epoch produced a
        loss below infinity (e.g. the loss was NaN).
    """
    model.train()
    predictor.train()
    criterion = nn.BCELoss()
    best_loss = float('inf')
    # Must exist before the first comparison: a NaN first loss takes the
    # "no improvement" branch straight away.
    patience_counter = 0

    for epoch in range(epochs):
        optimizer.zero_grad()
        node_embeddings = model(data.x, data.edge_index)
        scores = predictor(node_embeddings, train_edge_index)
        # reshape(-1) keeps a 1-D shape even for a single edge, whereas
        # squeeze() would collapse it to 0-d and mismatch the label shape.
        loss = criterion(scores.reshape(-1), train_labels)
        loss.backward()
        optimizer.step()

        current_loss = loss.item()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {current_loss}')

        # Early stopping bookkeeping
        if current_loss < best_loss:
            best_loss = current_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

    return best_loss

# One Adam optimizer over the parameters of both the encoder and the scorer
optimizer = torch.optim.Adam(
    [*gcn_model.parameters(), *link_predictor.parameters()],
    lr=0.005,
)

# Train the models
train(
    model=gcn_model,
    predictor=link_predictor,
    data=data,
    train_edge_index=train_edge_index,
    train_labels=train_labels,
    optimizer=optimizer,
    epochs=300,
)

# Print the trained model's scores for the positive and the negative edges
def check_positive_negative_edge_scores(model, predictor, data, edge_index, neg_edge_index):
    """Print predictor scores for positive and negative edges.

    Both modules are switched to evaluation mode (and left in it), and the
    scores are computed without gradient tracking.
    """
    for module in (model, predictor):
        module.eval()

    with torch.no_grad():
        embeddings = model(data.x, data.edge_index)
        # Score both edge sets first, then report them in order.
        labelled_scores = [
            ("Scores for positive edges:", predictor(embeddings, edge_index)),
            ("Scores for negative edges:", predictor(embeddings, neg_edge_index)),
        ]
        for heading, scores in labelled_scores:
            print(heading)
            print(scores.squeeze())

check_positive_negative_edge_scores(
    model=gcn_model,
    predictor=link_predictor,
    data=data,
    edge_index=edge_index,
    neg_edge_index=neg_edge_index,
)

Epoch 0, Loss: 0.6960185170173645
Epoch 10, Loss: 0.39519020915031433
Epoch 20, Loss: 0.17541120946407318
Epoch 30, Loss: 0.24620454013347626
Epoch 40, Loss: 0.08727201074361801
Epoch 50, Loss: 0.2779368758201599
Epoch 60, Loss: 0.1979672908782959
Early stopping at epoch 69
Scores for positive edges:
tensor([0.9999, 0.9995, 0.9994, 0.9963, 0.9957, 0.8636, 1.0000, 1.0000, 1.0000,
        0.9171, 0.8837, 0.6143])
Scores for negative edges:
tensor([9.3940e-04, 7.6912e-04, 4.2132e-02, 9.8333e-04, 4.5855e-04, 1.7437e-04,
        9.3040e-02, 2.7539e-03, 3.9238e-04, 6.4300e-05, 2.5772e-05, 1.3061e-05])


In [114]:
# Display the positive and negative edge index tensors as the cell output
edge_index, neg_edge_index

(tensor([[   0,    0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    3],
         [4230,    8, 4773, 5463, 6080, 9523, 2064, 2108, 4272, 4891, 5843, 9510]]),
 tensor([[   0,    0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    3],
         [6080, 2108, 9510, 4891, 4773,    8, 9523, 4773,    8, 5463, 2108, 4272]]))

# Test

In [4]:
# Load the raw movie metadata and the processed metadata table
movie_metadata = pd.read_excel(io="cinemate_data_0511.xlsx")
final_metadata = pd.read_excel(io="final_metadata_with_overview_embeddings_ver5_0511.xlsx")

In [161]:
# Recommend the top-N not-yet-liked movies for a user of the training graph
def recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=5):
    """Return the indices of the highest-scoring unseen movies for ``user_id``.

    Every movie that is not already connected to the user in ``edge_index`` is
    scored with the module-level ``link_predictor``; the best
    ``num_recommendations`` are returned. Relies on the module-level
    ``user_id_to_index``, ``num_users`` and ``num_movies``.

    Args:
        user_id: Id of a user present in ``user_id_to_index``.
        node_embeddings: Node embeddings, user rows first and movie rows after.
        edge_index: Existing user-movie edges, used to skip seen movies.
        num_recommendations: Maximum number of movies to return.

    Returns:
        1-D long tensor of movie indices (without the user offset), best
        first. It is shorter than ``num_recommendations`` when fewer unseen
        movies exist, and empty when there are none.
    """
    user_idx = user_id_to_index[user_id]
    movie_nodes = torch.arange(num_users, num_users + num_movies)

    # Movies already linked to this user must not be recommended again.
    seen_movies = edge_index[1][edge_index[0] == user_idx] - num_users
    unseen = torch.ones(num_movies, dtype=torch.bool)
    unseen[seen_movies] = False
    candidate_nodes = movie_nodes[unseen]
    if candidate_nodes.numel() == 0:
        return torch.empty(0, dtype=torch.long)

    # One (user, movie) pair per remaining candidate.
    pairs = torch.stack(
        [torch.full_like(candidate_nodes, user_idx), candidate_nodes], dim=0
    )

    # Ranking only needs the values; skip building an autograd graph.
    with torch.no_grad():
        scores = link_predictor(node_embeddings, pairs).reshape(-1)

    # topk raises if k exceeds the number of candidates, so clamp it.
    k = min(num_recommendations, scores.numel())
    _, top_indices = torch.topk(scores, k)
    top_movie_indices = candidate_nodes[top_indices] - num_users

    return top_movie_indices

# After training, produce recommendations for every known user.
# Switch to evaluation mode explicitly (dropout off) rather than relying on an
# earlier cell having done so, and skip gradient tracking during inference.
gcn_model.eval()
link_predictor.eval()
with torch.no_grad():
    node_embeddings = gcn_model(data.x, data.edge_index)

num_recommendations = 10
# Iterate over the ids directly instead of label-indexing the column by position.
for user_id in users['user_id']:
    top_movies = recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=num_recommendations)
    print(f"Top {num_recommendations} recommended movies for user {user_id}: {top_movies}")
    # Print the genres of each recommended movie for a quick sanity check.
    for idx in top_movies:
        print(final_metadata.iloc[int(idx)].genres)

Top 10 recommended movies for user 1: tensor([5667, 5804, 5276, 2260, 5340,   55, 2898, 6529, 5708, 3675])
Action,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Action,Comedy,Crime
Action,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Top 10 recommended movies for user 2: tensor([4268, 7362, 4717, 3779, 8142, 5869, 3764,  687, 5019, 7134])
Horror,Mystery,Thriller
Horror,Mystery,Sci-Fi
Horror,Mystery
Horror,Mystery,Thriller
Horror,Sci-Fi
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Adventure,Horror,Sci-Fi
Horror,Mystery,Thriller
Top 10 recommended movies for user 3: tensor([3764, 4971, 5869, 2892, 3779, 3365, 5137, 4726, 2004, 3651])
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Horror,Mystery,Thriller
Drama,Horror,Mystery
Horror,Mystery,Thriller
Top 10 recommended movies for us

In [None]:
# Load the raw movie metadata and display it as the cell output
movie_metadata = pd.read_excel(io="cinemate_data_0511.xlsx")
movie_metadata

In [None]:
# Load the processed metadata table and display it as the cell output
final_metadata = pd.read_excel(io="final_metadata_with_overview_embeddings_ver5_0511.xlsx")
final_metadata

# 학습 데이터에 없던 사용자에 대한 추천 테스트

In [164]:
import numpy as np

# Build a user embedding from the movies the user interacted with
def create_new_user_embedding(movie_features, interacted_movie_indices):
    """Return the mean feature vector of the rows selected by ``interacted_movie_indices``."""
    watched_features = movie_features[interacted_movie_indices]
    return watched_features.mean(dim=0)

# Movie recommendation for a new (unseen) user
def recommend_movies_for_new_user(node_embeddings, num_users = 1, num_movies = 9525, num_recommendations=5,
                                  exclude_movie_indices=None, predictor=None):
    """Return the indices of the highest-scoring movies for the user at node 0.

    Pairs node 0 with every movie node, scores the pairs and returns the best
    ``num_recommendations`` movies. Only node 0 is ever used as the user,
    whatever ``num_users`` is; ``num_users`` only sets the movie node offset.

    Args:
        node_embeddings: Node embeddings, user rows first and movie rows after.
        num_users: Number of user rows preceding the movie rows.
        num_movies: Number of movie rows to consider.
        num_recommendations: Maximum number of movies to return.
        exclude_movie_indices: Optional movie indices (without the user offset)
            to leave out, e.g. the movies the user has already watched.
        predictor: Callable ``predictor(node_embeddings, pairs)`` returning one
            score per pair. Defaults to the module-level ``link_predictor``.

    Returns:
        1-D long tensor of movie indices (without the user offset), best
        first. It is shorter than ``num_recommendations`` when fewer
        candidates exist, and empty when there are none.
    """
    if predictor is None:
        predictor = link_predictor

    # Movie node indices, shifted by the number of users.
    movie_indices = torch.arange(num_users, num_users + num_movies)

    # Build the (user, movie) pairs; the user is always node 0.
    user_movie_pairs = torch.stack([torch.zeros(num_movies, dtype=torch.long), movie_indices], dim=0)
    print(user_movie_pairs)

    if exclude_movie_indices is not None:
        keep = torch.ones(num_movies, dtype=torch.bool)
        keep[torch.as_tensor(exclude_movie_indices, dtype=torch.long)] = False
        user_movie_pairs = user_movie_pairs[:, keep]

    if user_movie_pairs.size(1) == 0:
        return torch.empty(0, dtype=torch.long)

    # Score the pairs; ranking only needs values, so skip autograd.
    with torch.no_grad():
        scores = predictor(node_embeddings, user_movie_pairs).reshape(-1)

    # Top-N selection; topk raises if k exceeds the number of candidates.
    k = min(num_recommendations, scores.numel())
    _, top_indices = torch.topk(scores, k)
    top_movie_indices = user_movie_pairs[1][top_indices] - num_users  # undo the user offset

    return top_movie_indices


# Movie features
movie_features = torch.from_numpy(np.load("final_features_without_directors_0518.npy")).float()

# Assume the new user has watched the movies below.
# new_user_interacted_movies = [6076, 9519] # Jumper, Metropolis
# new_user_interacted_movies = [4887, 5839] # Miracle in Cell No. 7, 3 Idiots
new_user_interacted_movies = [6, 10, 14] # Citizen of a Kind, It's Okay!, Deurim (comedies)
new_user_embedding = create_new_user_embedding(movie_features, new_user_interacted_movies)

# Node features of the inference graph: the single new user, then all movies.
new_x = torch.cat([new_user_embedding.view(1, -1), movie_features], dim=0)
# NOTE: this rebinds the module-level num_users that the training cells also use.
num_users = 1

# Convert the user-movie interactions into an edge index.
user_indices = list(range(num_users))
movie_indices = [i + num_users for i in new_user_interacted_movies]

edge_index = torch.tensor([user_indices * len(movie_indices), movie_indices], dtype=torch.long)

# The encoder splits users from movies using its own `num_users` attribute, which
# still holds the user count it was constructed with for training. Point it at
# this one-user graph for the forward pass, then restore the original value.
trained_num_users = gcn_model.num_users
gcn_model.num_users = num_users
gcn_model.eval()
link_predictor.eval()
try:
    with torch.no_grad():
        node_embeddings = gcn_model(new_x, edge_index)
finally:
    gcn_model.num_users = trained_num_users

# Recommend for the new user, using the actual user and movie counts rather
# than the function's hard-coded defaults.
num_recommendations = 20
top_movies_for_new_user = recommend_movies_for_new_user(
    node_embeddings,
    num_users=num_users,
    num_movies=movie_features.size(0),
    num_recommendations=num_recommendations,
)
print(f"Top {num_recommendations} recommended movies for the new user: {top_movies_for_new_user}")
for idx in top_movies_for_new_user:
    print(final_metadata.iloc[int(idx)].genres)
    # print(movie_metadata.iloc[int(idx)].overview)

tensor([[   0,    0,    0,  ...,    0,    0,    0],
        [   1,    2,    3,  ..., 9523, 9524, 9525]])
Top 20 recommended movies for the new user: tensor([4226, 5667, 1111,   55, 5804, 4737, 5276, 2260, 7599, 6529, 7706, 3675,
        4769, 5340, 3504, 1862, 9212, 8410, 5787, 6697])
Action,Crime
Action,Crime,Drama
Comedy,Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Comedy,Drama,Fantasy
Action,Crime,Drama
Action,Crime,Drama
Crime,Drama
Action,Crime,Drama
Crime,Drama
Action,Crime,Drama
Action,Crime,Drama
Action,Comedy,Crime
Comedy,Crime,Drama
Comedy,Crime,Drama
Crime,Drama
Comedy,Crime,Drama
Comedy,Crime,Drama
Comedy,Crime,Drama


In [158]:
# Persist the weights (state dicts) of both models to disk
torch.save(obj=gcn_model.state_dict(), f='gcn_model.pth')
torch.save(obj=link_predictor.state_dict(), f='link_predictor.pth')

In [None]:
# Rebuild the encoder and restore its saved weights
gcn_model = GCNLinkPredictor(
    num_in_features=num_in_features,
    num_out_features=num_out_features,
    num_users=num_users,
)
gcn_model.load_state_dict(state_dict=torch.load(f='gcn_model.pth'))

# Rebuild the edge scorer and restore its saved weights
link_predictor = LinkPredictor(input_dim=num_out_features)
link_predictor.load_state_dict(state_dict=torch.load(f='link_predictor.pth'))

# Put both models into evaluation mode
gcn_model.eval()
link_predictor.eval()