In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -qn /content/drive/MyDrive/open.zip -d /content/

In [3]:
!pip install torch torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_

In [4]:
!pip install janome

Collecting janome
  Downloading Janome-0.5.0-py2.py3-none-any.whl (19.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.7/19.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: janome
Successfully installed janome-0.5.0


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# 필요한 NLTK 데이터 다운로드
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 데이터 로드
view_log_train = pd.read_csv('view_log.csv')
article_info = pd.read_csv('article_info.csv')
submission = pd.read_csv('sample_submission.csv')

# 결측치 처리
article_info['userCountry'].fillna('Unknown', inplace=True)
article_info['userRegion'].fillna('Unknown', inplace=True)

# 불용어 로드
stop_words_dict = {
    'en': stopwords.words('english'),
    'pt': stopwords.words('portuguese'),
    'la': ['et', 'in', 'de'],
    'es': stopwords.words('spanish')
}

# 일본어 불용어 직접 정의
japanese_stop_words = ['これ', 'それ', 'あれ', 'この', 'その', 'あの', 'ここ', 'そこ', 'あそこ', 'こちら', 'どこ', 'だれ', 'なに', 'なん']

# 전처리 함수 정의
def preprocess_text(text, language):
    # URL 제거
    text = re.sub(r'https?://\S+', '', text)
    if language == 'en':
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    elif language == 'pt':
        text = re.sub(r'[^a-zA-Z0-9áéíóúâêîôûãõçÇ\s]', '', text)
    elif language == 'la':
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    elif language == 'ja':
        text = re.sub(r'[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\s]', '', text)
    elif language == 'es':
        text = re.sub(r'[^a-zA-Z0-9áéíóúñÑ\s]', '', text)

    # 소문자 변환
    text = text.lower()

    # 토큰화 및 불용어 제거
    if language == 'ja':
        tokenizer = Tokenizer()
        tokens = [token.surface for token in tokenizer.tokenize(text)]
        tokens = [token for token in tokens if token not in japanese_stop_words]
    else:
        tokens = nltk.word_tokenize(text)
        stop_words = stop_words_dict.get(language, [])
        tokens = [token for token in tokens if token not in stop_words]

    return ' '.join(tokens)

# 모든 기사에 대해 전처리 적용
article_info['ProcessedContent'] = article_info.apply(lambda row: preprocess_text(row['Content'], row['Language']), axis=1)

# TF-IDF 벡터화 (차원 축소 추가)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(article_info['ProcessedContent'])

# TruncatedSVD를 사용하여 TF-IDF 벡터의 차원을 축소
n_components = 64
svd = TruncatedSVD(n_components=n_components)
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

# 유사도 행렬 계산
content_similarity = cosine_similarity(reduced_tfidf_matrix)

# 사용자-기사 행렬 생성
user_article_matrix = view_log_train.groupby(['userID', 'articleID']).size().unstack(fill_value=0)

# User-Article Matrix 크기 정의
num_users, num_articles = user_article_matrix.shape

# 그래프 데이터 준비
user_ids = user_article_matrix.index.tolist()
article_ids = user_article_matrix.columns.tolist()
edge_index = []
for user_idx, user_id in enumerate(user_ids):
    for article_idx, article_id in enumerate(article_ids):
        if user_article_matrix.iloc[user_idx, article_idx] > 0:
            edge_index.append([user_idx, len(user_ids) + article_idx])
            edge_index.append([len(user_ids) + article_idx, user_idx])
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# 노드 특성 준비
user_features = torch.randn(num_users, n_components)
article_features = torch.tensor(reduced_tfidf_matrix, dtype=torch.float)
x = torch.cat([user_features, article_features], dim=0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
class HybridGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_users, num_articles):
        super(HybridGNN, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.user_proj = torch.nn.Linear(hidden_channels, num_articles)
        self.article_proj = torch.nn.Linear(hidden_channels, num_users)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        user_emb = self.user_proj(x[:num_users])
        article_emb = self.article_proj(x[num_users:])
        return user_emb, article_emb.t()

# 모델 초기화 및 학습
model = HybridGNN(in_channels=x.size(1), hidden_channels=64, num_users=num_users, num_articles=num_articles)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    user_emb, article_emb = model(x, edge_index)
    pred = user_emb  # user_emb는 이미 (num_users, num_articles) 크기입니다
    target = torch.tensor(user_article_matrix.values, dtype=torch.float)
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')


model.eval()
with torch.no_grad():
    user_emb, article_emb = model(x, edge_index)
    gnn_similarity = user_emb  # user_emb는 이미 (num_users, num_articles) 크기입니다

    # content_similarity의 크기를 확인하고 필요한 경우 조정
    if content_similarity.shape != (num_users, num_articles):
        print("Warning: content_similarity shape mismatch. Adjusting...")
        content_similarity_adj = content_similarity[:num_users, :num_articles]
    else:
        content_similarity_adj = content_similarity

    final_scores = 0.44 * gnn_similarity.numpy() + 0.28 * user_article_matrix.values + 0.28 * content_similarity_adj

# 추천 생성
recommendations = []
for idx, user in enumerate(user_ids):
    sorted_indices = final_scores[idx].argsort()[::-1]
    top5recommend = [article_ids[i] for i in sorted_indices[:5]]
    recommendations.extend([[user, article] for article in top5recommend])

# DataFrame 생성 및 제출 파일 저장
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])
submission['articleID'] = top_recommendations['articleID']
submission.to_csv('hybrid_gnn_recommendation_submission.csv', index=False)

Epoch 0, Loss: 0.05307191237807274
Epoch 10, Loss: 0.02470482513308525
Epoch 20, Loss: 0.024088649079203606
Epoch 30, Loss: 0.023858824744820595
Epoch 40, Loss: 0.02352830395102501
Epoch 50, Loss: 0.02300015278160572
Epoch 60, Loss: 0.022325007244944572
Epoch 70, Loss: 0.021598927676677704
Epoch 80, Loss: 0.02094309590756893
Epoch 90, Loss: 0.020459502935409546
Epoch 100, Loss: 0.01980516128242016
Epoch 110, Loss: 0.019279710948467255
Epoch 120, Loss: 0.018789708614349365
Epoch 130, Loss: 0.018357520923018456
Epoch 140, Loss: 0.01796562410891056
Epoch 150, Loss: 0.017597565427422523
Epoch 160, Loss: 0.017252517864108086
Epoch 170, Loss: 0.01693050004541874
Epoch 180, Loss: 0.01662309654057026
Epoch 190, Loss: 0.016346637159585953
Epoch 200, Loss: 0.01607615500688553
Epoch 210, Loss: 0.0158180370926857
Epoch 220, Loss: 0.015548937022686005
Epoch 230, Loss: 0.015283344313502312
Epoch 240, Loss: 0.015030354261398315
Epoch 250, Loss: 0.014789517968893051
Epoch 260, Loss: 0.0145784532651305