In [1]:
import nltk
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

tqdm.pandas()
warnings.filterwarnings("ignore")



In [2]:
nltk.download('punkt_tab')
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asas4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
data = pd.read_csv('Database/article_info.csv').fillna('NAN')
view_log_df = pd.read_csv('Database/view_log.csv').drop_duplicates().reset_index(drop=True)
view_log_df = pd.concat([view_log_df, pd.DataFrame([{'userID': 'USER_9999', 'articleID': 'ARTICLE_0001'}])],
                        ignore_index=True)

In [4]:
pred_embeddings = model.encode(data.Language + " " + data.userCountry + " " + data.userRegion + " " + data.Title)
pred_embeddings2 = model.encode(data.Content)

kw_model = KeyBERT(model=model)
keywords = kw_model.extract_keywords(docs=data.Title, top_n=1)
keywords[1849] = [('.', 0.0)]
data["Content_Keyword"] = [pair[0] for sub_lst in keywords for pair in sub_lst]

In [5]:
def compute_similarity(article_id):
    """
    주어진 article_id에 대해 title 및 content의 코사인 유사도를 계산
    data, pred_embeddings, pred_embeddings2는 전역변수
    """
    matching_indices = data.loc[data['articleID'] == article_id].index
    if len(matching_indices) == 0:
        return None, None  # 매칭이 없는 경우

    title_embedding = pred_embeddings[matching_indices]
    content_embedding = pred_embeddings2[matching_indices]

    # 벡터 연산을 이용한 Cosine Similarity 계산
    cosine_sim = cosine_similarity(title_embedding, pred_embeddings)
    cosine_sim2 = cosine_similarity(content_embedding, pred_embeddings2)

    return cosine_sim, cosine_sim2

In [6]:
def sb_calculation(data_list):
    df = pd.DataFrame(data_list, columns=['userID', 'values'])

    # 배열이 아닌 경우를 대비하여 NumPy 배열로 변환
    df['values'] = df['values'].apply(lambda x: np.asarray(x) if isinstance(x, (list, np.ndarray)) else np.array([x]))

    # 각 userID별 평균 계산 (벡터 연산)
    grouped = df.groupby('userID')['values'].agg(lambda x: np.mean(np.vstack(x), axis=0))

    # NumPy 배열로 변환 (메모리 최적화)
    return np.vstack(grouped.values).astype(np.float32)

In [7]:
# view_log_df[['cosine_sim', 'cosine_sim2']] = pd.DataFrame(
#     view_log_df['articleID'].progress_apply(lambda x: compute_similarity(x)).to_list(),
#     index=view_log_df.index
# )
# 
# view_log_df[['cosine_sim', 'cosine_sim2']]=view_log_df[['cosine_sim', 'cosine_sim2']].applymap(lambda x: x[0])

In [8]:
# df_0 = view_log_df.copy()
# df_0.to_parquet('File/view_log_df.parquet')

df_0 = pd.read_parquet('File/view_log_df.parquet')

In [9]:
unique_articles = set(data["articleID"].unique()) - set(df_0["articleID"].unique())
new_data = pd.DataFrame({'userID': 'USER_9999', 'articleID': list(unique_articles)})
df_0 = pd.concat([df_0, new_data], ignore_index=True)

In [10]:
data_list = df_0[['userID', 'cosine_sim']].dropna().values.tolist()
data_list0 = df_0[['userID', 'cosine_sim2']].dropna().values.tolist()

In [11]:
user_article_matrix = df_0.groupby(['userID', 'articleID']).size().unstack(fill_value=0)
user_article_matrix.loc[:, :] = sb_calculation(data_list)

In [12]:
user_article_matrix0 = df_0.groupby(['userID', 'articleID']).size().unstack(fill_value=0)
user_article_matrix0.loc[:, :] = sb_calculation(data_list0)

In [13]:
df_0 = df_0.drop_duplicates(subset=['userID', 'articleID'], keep='first')

In [14]:
top5_columns_per_row = np.argsort(user_article_matrix, axis=1)[:, :5]
top5_column_names_per_row = np.array(user_article_matrix.columns)[top5_columns_per_row]

In [15]:
top5_column_names_per_row

array([['ARTICLE_1366', 'ARTICLE_0446', 'ARTICLE_0534', 'ARTICLE_1121',
        'ARTICLE_2241'],
       ['ARTICLE_0534', 'ARTICLE_1322', 'ARTICLE_0446', 'ARTICLE_1889',
        'ARTICLE_0612'],
       ['ARTICLE_0534', 'ARTICLE_1028', 'ARTICLE_1408', 'ARTICLE_1394',
        'ARTICLE_0809'],
       ...,
       ['ARTICLE_1707', 'ARTICLE_1220', 'ARTICLE_2075', 'ARTICLE_2330',
        'ARTICLE_0424'],
       ['ARTICLE_0534', 'ARTICLE_2011', 'ARTICLE_2041', 'ARTICLE_0364',
        'ARTICLE_1743'],
       ['ARTICLE_1366', 'ARTICLE_1743', 'ARTICLE_2241', 'ARTICLE_0241',
        'ARTICLE_1500']], dtype=object)