# 이분 그래프 기반 추천 시스템 테스트

In [84]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, GCNConv
from torch_geometric.data import Data

# 예시 유저 데이터
interactions = pd.DataFrame(columns=['user_id', 'movie_id'])
# interactions['user_id'] = [1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4]
# interactions['movie_id'] = [4, 4769, 5208, 5459, 6076, 2060, 2104, 4268, 4887, 5839, 9506]
# interactions['user_id'] = [1, 1, 1]
# interactions['movie_id'] = [3, 4, 4769] # 파묘, 범죄도시4, 
# interactions['user_id'] = [1, 1]
# interactions['movie_id'] = [5459, 6076]  # 인 타임, 점퍼
# interactions['user_id'] = [1, 1, 1]
# interactions['movie_id'] = [2060, 2104, 4268]  # 컨저링, 컨저링3, 컨저링2
interactions['user_id'] = [1, 1, 1]
interactions['movie_id'] = [4887, 5839, 9506]  # 7번 방의 선물, 세 얼간이, City Lights
users = pd.DataFrame(columns=['user_id'])
users['user_id'] = interactions.user_id.unique()

num_users = users.shape[0]
# num_of_heads = 1

# 영화 features
movie_features = torch.from_numpy(np.load("final_features_without_directors_0518.npy")).float()

num_movies = movie_features.size(0)
num_in_features = movie_features.size(1)
num_out_features = movie_features.size(1)

# user indices, movie indices 생성
user_id_to_index = {user_id: idx for idx, user_id in enumerate(users['user_id'])}
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(range(num_movies))}

# 유저 - 영화 간 상호작용 edge index로 변환
user_indices = interactions['user_id'].apply(lambda x: user_id_to_index[x])
movie_indices = interactions['movie_id'].apply(lambda x: num_users + movie_id_to_index[x])  # 유저 수만큼 offset 추가

edge_index = torch.tensor([user_indices.values, movie_indices.values], dtype=torch.long)

# 유저 초기 임베딩 생성
user_features = torch.zeros(num_users, num_in_features)
# 좋아하는 영화의 평균으로 유저 임베딩 생성
for user_id in users['user_id']:
    user_idx = user_id_to_index[user_id]
    movie_idxs = interactions[interactions['user_id'] == user_id]['movie_id'].apply(lambda x: movie_id_to_index[x])
    user_movie_features = movie_features[movie_idxs.values].mean(dim=0)
    user_features[user_idx] = user_movie_features
# # 유저 초기 임베딩을 xavier 초기화
# nn.init.xavier_uniform_(user_features)  # initialize user features with xavier initialization
# 영화 feature와 유저 feature 합치기
x = torch.cat([user_features, movie_features], dim=0)

data = Data(x=x, edge_index=edge_index)

# 유저, 영화의 임베딩 생성용 GAT (영화 특징 고정)
# class GATLinkPredictor(nn.Module):
#     def __init__(self, num_in_features, num_out_features=128, num_of_heads=1, num_users=num_users):
#         super().__init__()
#         self.conv1 = GATConv(num_in_features, num_out_features, heads=num_of_heads)
#         self.conv2 = GATConv(num_out_features * num_of_heads, num_out_features, heads=num_of_heads)
#         self.num_users = num_users

#     def forward(self, x, edge_index):
#         user_features = x[:self.num_users]
#         movie_features = x[self.num_users:].detach()  # 영화 특징을 detach()하여 고정
        
#         x = torch.cat([user_features, movie_features], dim=0)
#         x = self.conv1(x, edge_index)
#         x = F.elu(x)
#         x = F.dropout(x, p=0.5, training=self.training)
#         x = self.conv2(x, edge_index)
#         x = F.elu(x)
#         x = F.dropout(x, p=0.5, training=self.training)
        
#         # 영화 특징을 원래대로 유지
#         x[self.num_users:] = movie_features
#         return x
class GCNLinkPredictor(nn.Module):
    def __init__(self, num_in_features, num_out_features=128, num_users=num_users):
        super().__init__()
        self.conv1 = GCNConv(num_in_features, num_out_features)
        self.conv2 = GCNConv(num_out_features, num_out_features)
        self.num_users = num_users

    def forward(self, x, edge_index):
        user_features = x[:self.num_users]
        movie_features = x[self.num_users:].detach()
        
        x = torch.cat([user_features, movie_features], dim=0)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        
        x[self.num_users:] = movie_features
        return x

# 유저 - 영화 간 edge를 입력받아 유저-영화 간 관계 예측
class LinkPredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim * 2, 1)

    def forward(self, z, edge_index):
        row, col = edge_index
        z_row = z[row]
        z_col = z[col]
        z_concat = torch.cat([z_row, z_col], dim=1)
        return torch.sigmoid(self.linear(z_concat))

# 모델 초기화
gcn_model = GCNLinkPredictor(num_in_features, num_out_features, num_users)
link_predictor = LinkPredictor(num_out_features)  # * num_of_heads)

# Negative sampling
def negative_sampling(edge_index, num_users, num_movies, num_neg_samples):
    existing_edges = set(zip(edge_index[0].tolist(), edge_index[1].tolist()))
    neg_edge_index = set()
    
    while len(neg_edge_index) < num_neg_samples:
        i = torch.randint(0, num_users, (1,)).item()
        j = torch.randint(0, num_movies, (1,)).item() + num_users  # Add offset to movie indices
        if (i, j) not in existing_edges and (i, j) not in neg_edge_index:
            neg_edge_index.add((i, j))
    
    neg_edge_index = torch.tensor(list(neg_edge_index)).t()
    return neg_edge_index

# negative samples
num_neg_samples = edge_index.size(1) * 2  # Increase the number of negative samples
neg_edge_index = negative_sampling(edge_index, num_users, num_movies, num_neg_samples)

# positive, negative samples 합치기
train_edge_index = torch.cat([edge_index, neg_edge_index], dim=1)
train_labels = torch.cat([torch.ones(edge_index.size(1)), torch.zeros(neg_edge_index.size(1))])

# Train 함수 정의
def train(model, predictor, data, train_edge_index, train_labels, optimizer, epochs=200, patience=10):
    model.train()
    predictor.train()
    criterion = nn.BCELoss()
    best_loss = float('inf')

    for epoch in range(epochs):
        optimizer.zero_grad()
        node_embeddings = model(data.x, data.edge_index)
        scores = predictor(node_embeddings, train_edge_index)
        loss = criterion(scores.squeeze(), train_labels)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')
        
        # Early stopping
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

# Initialize optimizer
optimizer = torch.optim.Adam(list(gcn_model.parameters()) + list(link_predictor.parameters()), lr=0.005)

# 모델 학습
train(gcn_model, link_predictor, data, train_edge_index, train_labels, optimizer, epochs=300)

# 학습된 모델을 사용하여 positive 간선의 점수 확인
def check_positive_edge_scores(model, predictor, data, edge_index):
    model.eval()
    predictor.eval()
    with torch.no_grad():
        node_embeddings = model(data.x, data.edge_index)
        scores = predictor(node_embeddings, edge_index)
        print("Scores for positive edges:")
        print(scores.squeeze())

check_positive_edge_scores(gcn_model, link_predictor, data, edge_index)

Epoch 0, Loss: 0.7120834589004517
Epoch 10, Loss: 0.5307788252830505
Epoch 20, Loss: 0.42413127422332764
Epoch 30, Loss: 0.5174251198768616
Epoch 40, Loss: 0.5328844785690308
Epoch 50, Loss: 0.23114416003227234
Epoch 60, Loss: 0.20014739036560059
Epoch 70, Loss: 0.1537685990333557
Epoch 80, Loss: 0.17490652203559875
Epoch 90, Loss: 0.10739453881978989
Epoch 100, Loss: 0.10739800333976746
Epoch 110, Loss: 0.0832342803478241
Epoch 120, Loss: 0.07813926041126251
Epoch 130, Loss: 0.06757330894470215
Epoch 140, Loss: 0.1297936886548996
Epoch 150, Loss: 0.08263488113880157
Epoch 160, Loss: 0.08906222134828568
Epoch 170, Loss: 0.04553953558206558
Epoch 180, Loss: 0.06431152671575546
Epoch 190, Loss: 0.039976153522729874
Epoch 200, Loss: 0.03770154342055321
Epoch 210, Loss: 0.03888051211833954
Epoch 220, Loss: 0.03917520493268967
Epoch 230, Loss: 0.032731667160987854
Epoch 240, Loss: 0.029976649209856987
Epoch 250, Loss: 0.03297916054725647
Epoch 260, Loss: 0.03127758949995041
Epoch 270, Loss:

# Test

In [4]:
movie_metadata = pd.read_excel("cinemate_data_0511.xlsx")
final_metadata = pd.read_excel("final_metadata_with_overview_embeddings_ver5_0511.xlsx")

In [85]:
# Function to recommend top 5 movies for a given user
def recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=5):
    user_idx = torch.tensor([user_id_to_index[user_id]])
    movie_indices = torch.arange(num_users, num_users + num_movies)
    
    # Generate all possible user-movie pairs
    pairs = torch.stack([user_idx.expand(num_movies), movie_indices], dim=0)
    
    # Check existing edges to avoid recommending already liked movies
    existing_edges = edge_index[:, (edge_index[0] == user_idx)]
    existing_movies = existing_edges[1] - num_users
    
    # Remove existing edges from pairs
    mask = torch.ones(num_movies, dtype=torch.bool)
    mask[existing_movies] = False
    pairs = pairs[:, mask]
    
    # Get scores for the user-movie pairs
    scores = link_predictor(node_embeddings, pairs)
    print(scores[scores > 0.6])
    
    # Get top N recommendations
    _, top_indices = torch.topk(scores.squeeze(), num_recommendations)
    # print(top_indices)
    # print(scores[top_indices])
    top_movie_indices = pairs[1][top_indices] - num_users
    
    return top_movie_indices

# 모델 학습 후 각 유저에 대한 추천 수행
node_embeddings = gcn_model(data.x, data.edge_index)
for i in range(num_users):
    user_id = users['user_id'][i]
    num_recommendations = 10
    top_movies = recommend_movies_for_user(user_id, node_embeddings, edge_index, num_recommendations=num_recommendations)
    print(f"Top {num_recommendations} recommended movies for user {user_id}: {top_movies}")
    for idx in top_movies:
        print(final_metadata.iloc[int(idx)].genres)
        print(movie_metadata.iloc[int(idx)].overview)

tensor([0.6172, 0.7367, 0.6757, 0.7023, 0.6279, 0.7052, 0.6340, 0.7142, 0.6603,
        0.6258, 0.8210, 0.6879, 0.6805, 0.6147, 0.7297, 0.6443, 0.6106, 0.6494,
        0.6806, 0.7369, 0.6209, 0.6671, 0.6251, 0.6775, 0.8317, 0.6267, 0.6281,
        0.7259, 0.6233, 0.6691, 0.6924, 0.7173, 0.6166, 0.7630, 0.6146, 0.7016,
        0.7179, 0.7986, 0.6072, 0.6114, 0.6453, 0.6271, 0.7771, 0.7376, 0.7353,
        0.6326, 0.6204, 0.7317, 0.6729, 0.7914, 0.8165, 0.6868, 0.6931, 0.6193,
        0.6034, 0.7544, 0.7396, 0.6564, 0.7414, 0.7111, 0.6472, 0.7112, 0.6232,
        0.6001, 0.6442, 0.6058, 0.6298, 0.7102, 0.7467, 0.6367, 0.6016, 0.6556,
        0.6898, 0.6504, 0.6341, 0.6293, 0.6093, 0.8349, 0.6271, 0.6862, 0.8223,
        0.6668, 0.6120, 0.7482, 0.6653, 0.6078, 0.7520, 0.7353, 0.7767, 0.6442,
        0.6681, 0.6510, 0.6852, 0.6428, 0.8365, 0.7155, 0.7640, 0.6464, 0.7753,
        0.7745, 0.6337, 0.6982, 0.6325, 0.7552, 0.6200, 0.6186, 0.6928, 0.7129,
        0.7781, 0.6101, 0.6871, 0.7301, 

# Check

In [21]:
node_embeddings[3:]

tensor([[-0.6534,  0.2353,  0.0287,  ...,  0.0176,  0.0353, -0.0232],
        [ 0.6367,  1.3329,  0.1633,  ...,  0.0743, -0.0190,  0.0559],
        [-0.3094, -1.2843, -0.0157,  ...,  0.0528,  0.0359,  0.0256],
        ...,
        [-0.9974,  0.5730, -0.0165,  ..., -0.0368,  0.0734, -0.1050],
        [-2.2876, -0.2712, -0.0148,  ..., -0.0415,  0.0264,  0.0238],
        [ 1.7119,  0.9107, -0.0148,  ..., -0.0054, -0.0053, -0.0255]],
       grad_fn=<SliceBackward0>)

In [12]:
movie_features

tensor([[-0.6534,  0.2353,  0.0287,  ...,  0.0176,  0.0353, -0.0232],
        [ 0.6367,  1.3329,  0.1633,  ...,  0.0743, -0.0190,  0.0559],
        [-0.3094, -1.2843, -0.0157,  ...,  0.0528,  0.0359,  0.0256],
        ...,
        [-0.9974,  0.5730, -0.0165,  ..., -0.0368,  0.0734, -0.1050],
        [-2.2876, -0.2712, -0.0148,  ..., -0.0415,  0.0264,  0.0238],
        [ 1.7119,  0.9107, -0.0148,  ..., -0.0054, -0.0053, -0.0255]])

In [531]:
user_features[0]

tensor([ 0.0443, -0.0766, -0.0697,  0.1090,  0.0415, -0.1025,  0.0254,  0.1204,
         0.0348, -0.1034, -0.0655, -0.1003, -0.0579,  0.1080, -0.0087, -0.0213,
        -0.1205, -0.1147,  0.0620,  0.0763,  0.0769,  0.0513, -0.0993,  0.0610,
         0.0512, -0.0684, -0.1093,  0.0374,  0.1097, -0.0587,  0.0609, -0.0500,
        -0.1125, -0.0621, -0.0809, -0.0171,  0.0656, -0.1043, -0.0094, -0.0952,
        -0.1208, -0.0725, -0.0350,  0.0215,  0.1011,  0.0770,  0.0816, -0.0513,
        -0.0707,  0.0492, -0.1211, -0.0108,  0.1005,  0.0506,  0.1115,  0.1201,
         0.1015,  0.0165, -0.0948, -0.0975, -0.0481,  0.0499,  0.0175, -0.0142,
         0.0952, -0.0037,  0.0113, -0.0018,  0.1098, -0.0606,  0.1175, -0.0019,
        -0.0130, -0.0738,  0.0501,  0.0023,  0.0584,  0.0042, -0.0485,  0.0430,
         0.0365, -0.0178,  0.0243, -0.0648,  0.0163, -0.0700,  0.0463, -0.0911,
         0.0116,  0.0021,  0.0484,  0.0606, -0.0959, -0.0966,  0.0188, -0.0030,
         0.0328,  0.0280, -0.0286,  0.08

In [None]:
movie_metadata = pd.read_excel("cinemate_data_0511.xlsx")
movie_metadata

In [None]:
final_metadata = pd.read_excel("final_metadata_with_overview_embeddings_ver5_0511.xlsx")
final_metadata

In [30]:
# Top 5 recommended movies for user 1: tensor([130,  16, 184,  92,   7])
# Top 5 recommended movies for user 2: tensor([121,   4, 161, 109, 107])
# Top 5 recommended movies for user 3: tensor([ 16,  71, 130, 109,   4])
# [504, 529, 628, 616, 307]
# tensor([9148, 9207, 3478, 2557, 8102])
# ([ 2,  7, 31, 25, 23])  [5459, 6076]
# ([8684, 5259, 6462, 1362, 8183])
# ([5321, 6482, 9439, 3483, 4080])
# [ 899,  672, 2028, 1151, 1233]
# tensor([3355, 2098, 3049,    5, 8588])
# Top 5 recommended movies for user 1: tensor([2365, 1473, 2560, 2336, 1536])
# Top 5 recommended movies for user 2: tensor([2560,    4, 9376, 5951,    3])
# Top 5 recommended movies for user 1: tensor([6076, 5459, 1172, 9263, 1963])
# Top 5 recommended movies for user 2: tensor([5951,    4,    3, 1172, 9263])
# Top 5 recommended movies for user 1: tensor([5459, 6076, 3890, 1172, 1477])
# Top 5 recommended movies for user 2: tensor([5951,    4,    3, 3890, 1172])
final_metadata.iloc[9506].genres

'Comedy,Drama,Romance'

In [29]:
movie_metadata.iloc[9506].overview

'일자리가 없어 도시를 배회하는 떠돌이는 어느날 아침 산책길에서 꽃 파는 눈먼 소녀(를 만난다. 떠돌이는 마지막 동전을 털어서 꽃을 사주고, 육중한 차문 닫히는 소리에 소녀는 그를 부자로 오인한다. 소녀에게 애정을 느낀 떠돌이는 부자 행세를 하며 가깝게 지내고, 그녀의 눈을 수술할 비용을 마련해 주기로 약속한다. 어느 날 술에 취해 물에 빠진 백만장자를 구해준 떠돌이는 그와 친구가 되는데, 백만장자는 술에 취했을 때만 그를 알아보고 술이 깨면 그를 도둑으로 오인한다. 백만장자가 술에 취했을 때 소녀의 수술비를 얻어낸 떠돌이는 그가 술이 깨기 전에 달아나 소녀에게 돈을 전해주고 사라지는데...'

In [None]:
movie_metadata.iloc[5951]

In [None]:
movie_metadata.iloc[8102].overview

In [None]:
movie_metadata.iloc[9209]

In [None]:
final_metadata.iloc[8954]

In [None]:
# [3, 4, 5951]를 좋아하는 사용자에게 추천하는 영화
# array([9209, 2630, 8562, 8954, 5206]) 첫번째 추천 결과
# array([8562, 5206, 9209, 5723, 8954]) 두번째 추천 결과
# [8562 8711 5723 7577 9209]

In [None]:
final_metadata.iloc[326].genres

In [None]:
movie_metadata.iloc[326].overview

In [None]:
final_metadata.iloc[5951].genres

In [None]:
movie_metadata.iloc[5957].overview

In [None]:
movie_metadata.iloc[5951]