In [None]:
# !pip install janome  # Japanese morphological analyzer

In [3]:
# 라이브러리 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Download the NLTK resources used by this notebook.
# 'stopwords' backs stopwords.words(...). word_tokenize loads 'punkt' on older
# NLTK releases and 'punkt_tab' on recent ones (3.9+), so both are requested to
# keep the notebook runnable on either.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kti08\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kti08\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the view log, the article metadata and the sample submission, in that order.
view_log_train, article_info, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './open/view_log.csv',
        './open/article_info.csv',
        './open/sample_submission.csv',
    )
)

In [6]:
# Fill missing location values with the placeholder 'Unknown'.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained in-place mutation, which operates on a temporary
# object under pandas Copy-on-Write and would leave article_info unchanged.
location_columns = ['userCountry', 'userRegion']
article_info[location_columns] = article_info[location_columns].fillna('Unknown')

In [7]:
# Build the user x article matrix: one row per userID, one column per articleID,
# each cell holding how many view-log rows exist for that pair (0 when none).
view_counts = view_log_train.groupby(['userID', 'articleID']).size()
user_article_matrix = view_counts.unstack(level='articleID', fill_value=0)

In [8]:
# Stop words per language code. English, Portuguese and Spanish come from the
# NLTK corpus; the Latin list is a small hand-written one.
latin_stop_words = ['et', 'in', 'de']
stop_words_dict = dict(
    en=stopwords.words('english'),
    pt=stopwords.words('portuguese'),
    la=latin_stop_words,
    es=stopwords.words('spanish'),
)

# Japanese stop words, defined by hand.
japanese_stop_words = [
    'これ', 'それ', 'あれ',
    'この', 'その', 'あの',
    'ここ', 'そこ', 'あそこ',
    'こちら', 'どこ', 'だれ',
    'なに', 'なん',
]

In [10]:
# Text preprocessing

# Characters to delete, per language code. The text is lowercased before these
# are applied, so only lowercase letters need to be listed.
_DISALLOWED_CHARS = {
    'en': re.compile(r'[^a-z0-9\s]'),
    'pt': re.compile(r'[^a-z0-9áàéíóúâêîôûãõç\s]'),
    'la': re.compile(r'[^a-z0-9\s]'),
    # Hiragana, katakana and the CJK unified ideograph block.
    'ja': re.compile(r'[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\s]'),
    'es': re.compile(r'[^a-z0-9áéíóúüñ\s]'),
}


def _get_japanese_tokenizer():
    """Return one shared janome Tokenizer, created on first use."""
    if not hasattr(_get_japanese_tokenizer, 'instance'):
        _get_japanese_tokenizer.instance = Tokenizer()
    return _get_japanese_tokenizer.instance


def preprocess_text(text, language):
    """Normalise one article body into a space-separated token string.

    Steps, in order: remove URLs, lowercase, delete characters outside the
    language's allowed set, tokenize, and drop stop words.

    Args:
        text: Raw article text. Any non-string value (for example the NaN
            pandas produces for an empty CSV cell) is treated as empty text.
        language: Language code. 'en', 'pt', 'la', 'ja' and 'es' have a
            character filter; for any other code the filter is skipped and no
            stop words are removed. 'ja' is tokenized with janome, everything
            else with nltk.word_tokenize.

    Returns:
        str: The remaining tokens joined by single spaces ('' for non-string
        input).
    """
    if not isinstance(text, str):
        return ''

    # Remove URLs first, while the scheme still has its original casing.
    text = re.sub(r'https?://\S+', '', text)

    # Lowercase BEFORE filtering so uppercase accented letters are kept as
    # their lowercase form instead of being deleted by the character filter.
    text = text.lower()

    disallowed = _DISALLOWED_CHARS.get(language)
    if disallowed is not None:
        text = disallowed.sub('', text)

    # Tokenize and remove stop words.
    if language == 'ja':
        # Building a Tokenizer per call is wasteful, so one instance is reused.
        tokenizer = _get_japanese_tokenizer()
        tokens = [token.surface for token in tokenizer.tokenize(text)]
        stop_words = set(japanese_stop_words)
    else:
        tokens = nltk.word_tokenize(text)
        stop_words = set(stop_words_dict.get(language, ()))

    return ' '.join(token for token in tokens if token not in stop_words)

In [11]:
# Preprocess every article body using the language code stored on the same row.
processed_contents = [
    preprocess_text(content, language)
    for content, language in zip(article_info['Content'], article_info['Language'])
]
article_info['ProcessedContent'] = processed_contents

# TF-IDF vectorisation of the preprocessed text.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(article_info['ProcessedContent'])

# Article-to-article cosine similarity; rows and columns follow article_info's row order.
similarity_matrix = cosine_similarity(tfidf_matrix)

# User-to-user cosine similarity computed from the view-count rows.
user_similarity = cosine_similarity(user_article_matrix)

# User-based collaborative-filtering score: similarity-weighted sum of every
# user's view counts, divided by that user's total absolute similarity.
similarity_totals = np.abs(user_similarity).sum(axis=1, keepdims=True)
weighted_views = user_similarity.dot(user_article_matrix.to_numpy())
user_predicted_scores = weighted_views / similarity_totals


# Content-based score per user: for every candidate article, the mean TF-IDF
# similarity to the articles that user has already viewed.

# Total number of articles in article_info.
num_articles = len(article_info)

# similarity_matrix follows the row order of article_info, while the columns of
# user_article_matrix are the viewed articleIDs. Column positions must therefore
# be translated to article_info row positions before indexing the similarity
# matrix; using the column position directly compares the wrong articles.
# NOTE(review): assumes article_info has an 'articleID' column holding the same
# identifiers as view_log's articleID - confirm against the CSV schema.
if 'articleID' not in article_info.columns:
    raise KeyError("article_info must contain an 'articleID' column to align similarities")

article_row_by_id = pd.Series(
    np.arange(num_articles), index=article_info['articleID'].to_numpy()
)
# Keep the first row when an articleID appears more than once.
article_row_by_id = article_row_by_id[~article_row_by_id.index.duplicated()]

# NaN marks a viewed article that has no row (and thus no content) in article_info.
column_rows = article_row_by_id.reindex(user_article_matrix.columns).to_numpy(dtype=float)
has_content = ~np.isnan(column_rows)
if not has_content.any():
    raise ValueError('No articleID in the view log matches article_info; check ID dtypes')
known_rows = column_rows[has_content].astype(int)

# Similarity re-ordered to the column order of user_article_matrix; articles
# without content keep a similarity of zero to everything.
n_columns = user_article_matrix.shape[1]
aligned_similarity = np.zeros((n_columns, n_columns))
aligned_similarity[np.ix_(has_content, has_content)] = similarity_matrix[
    np.ix_(known_rows, known_rows)
]

# Mean over viewed articles = (viewed-mask @ similarity) / number of viewed articles.
viewed = ((user_article_matrix.to_numpy() > 0) & has_content).astype(float)
viewed_counts = viewed.sum(axis=1, keepdims=True)
content_based_scores = np.divide(
    viewed @ aligned_similarity,
    viewed_counts,
    out=np.zeros(user_article_matrix.shape),
    where=viewed_counts > 0,  # users with no usable views keep a score of zero
)


In [12]:
# Blend the collaborative-filtering and content-based scores into the final score.
CF_WEIGHT, CONTENT_WEIGHT = 0.42, 0.58
final_scores = CF_WEIGHT * user_predicted_scores + CONTENT_WEIGHT * content_based_scores

# Top-5 articles per user. Articles the user has already viewed are kept as candidates.
recommendations = []
for user_row, user in enumerate(user_article_matrix.index):
    ranked_columns = np.argsort(final_scores[user_row])[::-1]
    top_articles = user_article_matrix.columns[ranked_columns[:5]]
    recommendations.extend([user, article] for article in top_articles)

In [13]:
# Build the recommendation frame and write the submission file.
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

# The articleID column is copied row by row, so the two frames must describe the
# same rows in the same order; otherwise articles would silently be written
# against the wrong user (or as NaN when the lengths differ).
if len(top_recommendations) != len(submission):
    raise ValueError(
        f'Expected {len(submission)} recommendation rows, got {len(top_recommendations)}'
    )
if 'userID' in submission.columns:
    same_users = submission['userID'].to_numpy() == top_recommendations['userID'].to_numpy()
    if not same_users.all():
        raise ValueError('submission userID order does not match the generated recommendations')

submission['articleID'] = top_recommendations['articleID'].to_numpy()
submission.to_csv('hybrid_recommendation_submission.csv', index=False)