In [1]:
import pickle
import warnings

import nltk
import numpy as np
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from Module.trainer import *

# Silence every Python warning for the rest of the session. Note that this is a
# blanket filter: warnings raised by any library used in the cells below are hidden too.
warnings.filterwarnings("ignore")

Namespace(lr=0.001, epochs=300, device='cuda', patience=10, batch_size=32)


# Preprocessing

In [2]:
# Fetch the NLTK 'punkt_tab' resource (a no-op when it is already up to date).
nltk.download('punkt_tab')

# Sentence-embedding model; it is handed to KeyBERT as its backend further down.
embedding_model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v2'
model = SentenceTransformer(embedding_model_name)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asas4\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Article metadata; every missing cell is replaced by the literal string 'NAN'.
data = (
    pd.read_csv('Database/article_info.csv')
    .fillna('NAN')
)

# View log: drop exact duplicate rows and renumber the index from 0 ...
view_log_df = (
    pd.read_csv('Database/view_log.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)
# ... then append one extra (USER_9999, ARTICLE_0001) row at the end.
extra_view_row = pd.DataFrame([{'userID': 'USER_9999', 'articleID': 'ARTICLE_0001'}])
view_log_df = pd.concat([view_log_df, extra_view_row], ignore_index=True)

# Frame stored earlier as parquet; it is the base table for the merge below.
df_0 = pd.read_parquet('File/view_log_df.parquet')

In [4]:
# Extract the single best keyword (top_n=1) of every article title with KeyBERT,
# using the sentence-transformer `model` as embedding backend.
kw_model = KeyBERT(model=model)
keywords = kw_model.extract_keywords(docs=data.Title, top_n=1)

# A title can come back with an empty keyword list (the notebook previously patched
# row 1849 by hand for exactly that reason). Replace *every* empty result with the
# same ('.', 0.0) placeholder instead of relying on one hard-coded row index, so the
# cell keeps working when the article table changes.
keywords = [doc_keywords if doc_keywords else [('.', 0.0)] for doc_keywords in keywords]

# Exactly one keyword string per article row; taking the first pair of each entry
# guarantees the list length always matches len(data).
data["Content_Keyword"] = [doc_keywords[0][0] for doc_keywords in keywords]

In [5]:
def _vectors_to_frame(matrix, prefix):
    """Wrap a 2-D array in a DataFrame whose columns are named '<prefix>_<i>'."""
    column_names = [f'{prefix}_{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=column_names)


# Attach the article metadata to df_0; a left merge keeps every row of df_0.
df_1 = pd.merge(df_0, data, how="left", on="articleID")

# Stack the per-row entries of each similarity column into one NumPy array (vectorised).
cosine_sim_array = np.vstack(df_1['cosine_sim'].to_numpy())
cosine_sim2_array = np.vstack(df_1['cosine_sim2'].to_numpy())

# Turn the stacked arrays into DataFrames with one column per vector component.
cosine_sim_expanded = _vectors_to_frame(cosine_sim_array, 'cosine_sim')
cosine_sim2_expanded = _vectors_to_frame(cosine_sim2_array, 'cosine_sim2')

# cosine_sim may be needed later, so its expanded columns are appended here
# (cosine_sim2_expanded is built but not appended). Rows with any missing value are dropped.
df_1 = pd.concat([df_1, cosine_sim_expanded], axis=1).dropna()

# Columns that are kept for the encoding step.
input_lst = [
    'userID_x', 'articleID', 'userRegion_x', 'userCountry_x', 'Format', 'Language',
    'userID_y', 'userCountry_y', 'userRegion_y',
]

In [6]:
# Select the model inputs and drop incomplete rows. The explicit copy makes df_2 an
# independent frame, so the column assignments below never write into a view of df_1.
df_2 = df_1[input_lst].dropna().copy()

# One fitted LabelEncoder per primary column, keyed by column name.
encoder_dict = {}

for col in ['userID_x', 'articleID', 'userRegion_x', 'userCountry_x', 'Format', 'Language']:
    encoder_dict[col] = LabelEncoder()
    df_2[col] = encoder_dict[col].fit_transform(df_2[col])

# The '_y' columns reuse the encoder of their '_x' counterpart, so the same value
# receives the same integer code in both columns. Values that only occur in the
# '_y' column are added to that encoder first.
for col in ['userID_y', 'userCountry_y', 'userRegion_y']:
    parent_col = col[:-1] + 'x'

    # Encoder fitted on the matching '_x' column.
    encoder = encoder_dict[parent_col]

    # Values the encoder has not seen yet.
    unseen_values = set(df_2[col].unique()) - set(encoder.classes_)

    if unseen_values:
        # Append the new classes in sorted order. Iterating a set directly gives an
        # order that can change between interpreter runs, which would make the integer
        # codes of these values non-reproducible.
        # NOTE(review): appending leaves classes_ not globally sorted; this is assumed
        # to be safe because the labels are strings - confirm if a numeric column is added.
        new_classes = np.append(encoder.classes_, sorted(unseen_values))
        encoder.classes_ = new_classes  # extend the fitted encoder in place

    # Encode the '_y' column with the (possibly extended) encoder.
    df_2[col] = encoder.transform(df_2[col])

In [7]:
# Every row of df_2 is labelled as a positive example.
df_3 = df_2.copy()
df_3['Ground Truth'] = 1

# Number of positive rows per user (ascending).
user_counts = df_3['userID_x'].value_counts().sort_values()

# Set of articles that appear in each user's positive rows.
seen_articles = df_3.groupby('userID_x')['articleID'].agg(set)

# One seeded generator for the whole loop so the negative set is reproducible
# while successive users still receive different draws.
rng = np.random.RandomState(42)

# Collected negative rows, one frame per user.
negative_samples = []

for user_id, count in user_counts.items():
    # Candidate rows: only articles this user has NOT interacted with. This excludes the
    # user's own rows and, unlike filtering on userID_x alone, also prevents drawing an
    # article the user did view, which would label the same (user, article) pair both 1 and 0.
    candidate_pool = df_3[~df_3['articleID'].isin(seen_articles.loc[user_id])]

    # As many negatives as positives, capped by the pool size; sampled without replacement.
    candidate_samples = candidate_pool.sample(
        n=min(count, len(candidate_pool)), replace=False, random_state=rng
    )

    # Re-assign the sampled rows to this user and label them 0.
    # NOTE(review): the remaining columns (e.g. userRegion_x / userCountry_x) still hold
    # the values of the row's original user - confirm that this is intended.
    candidate_samples = candidate_samples.assign(**{'userID_x': user_id, 'Ground Truth': 0})

    negative_samples.append(candidate_samples)

# Single frame with all negative rows, ordered by user.
negative_df = pd.concat(negative_samples, ignore_index=True)
negative_df = negative_df.sort_values(by='userID_x').reset_index(drop=True)

# Positives + negatives, ordered by user and then label.
df_3 = pd.concat([df_3, negative_df]).reset_index(drop=True)
df_3 = df_3.sort_values(by=['userID_x', 'Ground Truth']).reset_index(drop=True)

In [12]:
label_col = 'Ground Truth'

# First split: hold out 10% of the rows as the test set.
df_4, test = train_test_split(df_3, test_size=0.1, random_state=42)
df_4 = df_4.reset_index(drop=True)
test = test.reset_index(drop=True)

# Second split: 15% of the remaining rows become the validation set.
x = df_4.drop(columns=label_col)
y = df_4[label_col]
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=42)

# Features / labels of the held-out test set.
x_test = test.drop(columns=label_col)
y_test = test[label_col]

In [20]:
# Bundle the six split objects in one dict, keyed by their variable names.
train_val_test_split = dict(
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
)

# Destination path; the bundle is written there as a pickle file.
file_path = "Database/train_val_test.pkl"
with open(file_path, mode="wb") as f:
    pickle.dump(train_val_test_split, f)