In [31]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
import re

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from transformers import BertTokenizer, BertModel
import torch

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
view_log_train = pd.read_csv('view_log.csv')
article_info = pd.read_csv('article_info.csv')
submission = pd.read_csv('sample_submission.csv')

view_log_train

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US
...,...,...,...,...
42712,USER_1420,ARTICLE_0682,SP,BR
42713,USER_1420,ARTICLE_2179,SP,BR
42714,USER_1420,ARTICLE_1848,SP,BR
42715,USER_1420,ARTICLE_0030,SP,BR


In [2]:
be_forms = {"am", "is", "are", "was", "were", "be", "been", "being"}

def preprocess_text(text):
    # 공백 제거 및 소문자 변환
    text = text.lower().strip()
    # 구두점 제거
    text = re.sub(r'[^\w\s]', '', text)
    return text

# # 텍스트에서 be 동사를 제거하는 함수
# def remove_be_verbs(text):
#     # 텍스트 전처리
#     text = preprocess_text(text)
#     # 텍스트를 토큰화
#     words = word_tokenize(text)
#     # POS 태깅
#     tagged_words = pos_tag(words)
#     # be 동사를 제외한 단어들만 선택
#     filtered_words = [word for word, tag in tagged_words if word not in be_forms]
#     # 필터링된 단어들을 다시 문장으로 결합
#     filtered_text = ' '.join(filtered_words)
#     return filtered_text

# 관사, 전치사, 접속사만 제거하는 함수
def remove_specific_pos(text):
    # 전처리
    text = preprocess_text(text)
    # 토큰화
    words = word_tokenize(text)
    
    # 품사 태깅
    tagged_words = pos_tag(words)
    # 제거할 품사 태그
    remove_pos = {'DT',  # 관사
                  'IN',  # 전치사
                  'CC'}  # 접속사
    # 필터링된 단어들만 선택
    filtered_words = [word for word, tag in tagged_words if tag not in remove_pos]
    # 필터링된 단어들을 다시 문장으로 결합
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [3]:
article_title = article_info[['articleID','Title']]
article_Content = article_info[['articleID','Content']]
article_info

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,
...,...,...,...,...,...,...,...,...
3003,ARTICLE_3003,Como consumir conteúdo de qualidade em iOS - C...,"Quando iniciei minha jornada em Swift, saindo ...",HTML,pt,USER_0882,BR,MG
3004,ARTICLE_3004,Aurelia 1.0 is Here!!!,It's been an amazing journey to get here and i...,HTML,en,USER_0220,,
3005,ARTICLE_3005,Lessons from converting an app to 100% Kotlin ...,This is part one in a series of posts about Ko...,HTML,en,USER_1010,BR,SP
3006,ARTICLE_3006,ITA está oferecendo 10 cursos gratuitos a dist...,"O Instituto Tecnológico de Aeronáutica (ITA) ,...",HTML,pt,USER_1210,,


In [4]:
article_info.loc[:,'Title'][1]

'Intel buys computer vision startup Itseez to improve navigation in self-driving cars'

In [32]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# TinyBERT 모델과 토크나이저 로드
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# 예제 텍스트
# text = "This is a sample sentence for TinyBERT embedding."

# 최대 시퀀스 길이 설정
max_length = 256
embeded = []
for text in tqdm(article_info.loc[:,'Title']):
    # 텍스트를 토큰화하고 시퀀스 길이를 고정
    text = preprocess_text(text)
    text = remove_specific_pos(text)
    inputs = tokenizer(text, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

    # 모델을 사용하여 임베딩 추출
    with torch.no_grad():
        outputs = model(**inputs)
    embeded.append(outputs.last_hidden_state[:, 0, :].squeeze().numpy())

article_info['embeded_title'] = embeded

# TinyBERT 모델과 토크나이저 로드
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# 예제 텍스트
# text = "This is a sample sentence for TinyBERT embedding."

# 최대 시퀀스 길이 설정
max_length = 256
embeded = []
for text in tqdm(article_info.loc[:,'Content']):
    # 텍스트를 토큰화하고 시퀀스 길이를 고정
    text = preprocess_text(text)
    text = remove_specific_pos(text)
    inputs = tokenizer(text, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')

    # 모델을 사용하여 임베딩 추출
    with torch.no_grad():
        outputs = model(**inputs)
    embeded.append(outputs.last_hidden_state[:, 0, :].squeeze().numpy())
    
article_info['embeded_content'] = embeded

Some weights of the model checkpoint at huawei-noah/TinyBERT_General_4L_312D were not used when initializing BertModel: ['fit_denses.0.weight', 'fit_denses.4.weight', 'fit_denses.3.bias', 'cls.predictions.transform.dense.bias', 'fit_denses.2.bias', 'cls.predictions.bias', 'fit_denses.2.weight', 'fit_denses.0.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'fit_denses.3.weight', 'cls.seq_relationship.weight', 'fit_denses.4.bias', 'fit_denses.1.bias', 'cls.seq_relationship.bias', 'fit_denses.1.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing

In [59]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt


# data = pd.DataFrame(list(article_info['embeded_title']))
# for i in tqdm(range(len(data))):
#     data.loc[i,:] = normalize(np.array(data.loc[i,:]).reshape(-1,1),norm='l2') 
# # article_info['embeded_title'] = [normalize(i,norm='l2') for i in article_info['embeded_title']]
# # article_info['embeded_content'] = [normalize(i,norm='l2') for i in article_info['embeded_content']]

# pca = PCA(n_components=2)
# reduced_embeddings = pca.fit_transform(data)

# # 시각화
# plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1],s=5)
# # for i, text in enumerate(texts):
# #     plt.annotate(f"Text {i+1}", (reduced_embeddings[i, 0], reduced_embeddings[i, 1]))
# # plt.title('PCA of Average-Pooled BERT Embeddings')
# # plt.xlabel('PCA Component 1')
# # plt.ylabel('PCA Component 2')
# # plt.show()

In [66]:
submission

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0000
1,USER_0000,ARTICLE_0001
2,USER_0000,ARTICLE_0002
3,USER_0000,ARTICLE_0003
4,USER_0000,ARTICLE_0004
...,...,...
7070,USER_1420,ARTICLE_0000
7071,USER_1420,ARTICLE_0001
7072,USER_1420,ARTICLE_0002
7073,USER_1420,ARTICLE_0003


In [71]:
target = 'embeded_content'
answer = []
for j in tqdm(range(len(article_info))):
    tp = [[cosine_similarity([article_info[target][j]],[article_info[target][i]]).squeeze(),i] for i in range(len(article_info))]
    tp.sort(reverse=True)
    answer.extend(tp[1:6])

  0%|                                                                                         | 0/3008 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3008/3008 [35:14<00:00,  1.42it/s][A


In [84]:
answer

[[array(0.9915251, dtype=float32), 2996],
 [array(0.9913184, dtype=float32), 1301],
 [array(0.9913129, dtype=float32), 810],
 [array(0.99092525, dtype=float32), 165],
 [array(0.9907521, dtype=float32), 1819],
 [array(0.9919008, dtype=float32), 818],
 [array(0.99123824, dtype=float32), 2965],
 [array(0.9911199, dtype=float32), 1487],
 [array(0.99071926, dtype=float32), 2236],
 [array(0.99038476, dtype=float32), 2788],
 [array(0.98994684, dtype=float32), 2069],
 [array(0.9896178, dtype=float32), 1735],
 [array(0.9894323, dtype=float32), 463],
 [array(0.9893867, dtype=float32), 1418],
 [array(0.98923224, dtype=float32), 1892],
 [array(0.9865364, dtype=float32), 1193],
 [array(0.98550004, dtype=float32), 414],
 [array(0.9852503, dtype=float32), 779],
 [array(0.98496306, dtype=float32), 2756],
 [array(0.98482907, dtype=float32), 1379],
 [array(1., dtype=float32), 4],
 [array(0.99263316, dtype=float32), 648],
 [array(0.9920974, dtype=float32), 1782],
 [array(0.99204224, dtype=float32), 1257]

In [88]:
art_nums = []
counter_nums = []
for i in range(len(article_info)):
    nums = str(i)
    tar = 'ARTICLE_'+'0'*(4-len(nums)) + nums
    for _ in range(5):
        art_nums.append(tar)

for i in [j[1] for j in answer]:
    nums = str(i)
    tar = 'ARTICLE_'+'0'*(4-len(nums)) + nums
    counter_nums.append(tar)        
        
content_relation = pd.DataFrame()
content_relation['articleID'] = art_nums
content_relation['corr_articleID'] = counter_nums
content_relation['cosine'] = [j[0] for j in answer]
content_relation.to_csv('./contents_relation.csv')

In [89]:
target = 'embeded_title'
answer = []
for j in tqdm(range(len(article_info))):
    tp = [[cosine_similarity([article_info[target][j]],[article_info[target][i]]).squeeze(),i] for i in range(len(article_info))]
    tp.sort(reverse=True)
    answer.extend(tp[1:6])

100%|██████████████████████████████████████████████████████████████████████████████| 3008/3008 [35:02<00:00,  1.43it/s]


In [91]:
art_nums = []
counter_nums = []
for i in range(len(article_info)):
    nums = str(i)
    tar = 'ARTICLE_'+'0'*(4-len(nums)) + nums
    for _ in range(5):
        art_nums.append(tar)

for i in [j[1] for j in answer]:
    nums = str(i)
    tar = 'ARTICLE_'+'0'*(4-len(nums)) + nums
    counter_nums.append(tar)        
        
content_relation = pd.DataFrame()
content_relation['articleID'] = art_nums
content_relation['corr_articleID'] = counter_nums
content_relation['cosine'] = [j[0] for j in answer]
content_relation.to_csv('./titles_relation.csv')
content_relation

Unnamed: 0,articleID,corr_articleID,cosine
0,ARTICLE_0000,ARTICLE_2692,0.9864081
1,ARTICLE_0000,ARTICLE_0768,0.98538667
2,ARTICLE_0000,ARTICLE_2943,0.983638
3,ARTICLE_0000,ARTICLE_1387,0.98353374
4,ARTICLE_0000,ARTICLE_0300,0.98278296
...,...,...,...
15035,ARTICLE_3007,ARTICLE_2511,0.99004513
15036,ARTICLE_3007,ARTICLE_0109,0.98913294
15037,ARTICLE_3007,ARTICLE_2325,0.98908
15038,ARTICLE_3007,ARTICLE_0201,0.9879881


In [93]:
pd.read_csv('./contents_relation.csv',index_col=0)

Unnamed: 0,articleID,corr_articleID,cosine
0,ARTICLE_0000,ARTICLE_2996,0.991525
1,ARTICLE_0000,ARTICLE_1301,0.991318
2,ARTICLE_0000,ARTICLE_0810,0.991313
3,ARTICLE_0000,ARTICLE_0165,0.990925
4,ARTICLE_0000,ARTICLE_1819,0.990752
...,...,...,...
15035,ARTICLE_3007,ARTICLE_2805,0.992927
15036,ARTICLE_3007,ARTICLE_1174,0.991779
15037,ARTICLE_3007,ARTICLE_0098,0.991699
15038,ARTICLE_3007,ARTICLE_2382,0.991646


In [30]:
remain = view_log_train.articleID.value_counts().keys()
remain = set(remain)

for i in tqdm(range(len(article_info))):
    if article_info.loc[i,'articleID'] not in remain:
        article_info = article_info.drop(i)
article_info

100%|███████████████████████████████████████████████████████████████████████████| 3008/3008 [00:00<00:00, 31017.44it/s]


Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,
...,...,...,...,...,...,...,...,...
3003,ARTICLE_3003,Como consumir conteúdo de qualidade em iOS - C...,"Quando iniciei minha jornada em Swift, saindo ...",HTML,pt,USER_0882,BR,MG
3004,ARTICLE_3004,Aurelia 1.0 is Here!!!,It's been an amazing journey to get here and i...,HTML,en,USER_0220,,
3005,ARTICLE_3005,Lessons from converting an app to 100% Kotlin ...,This is part one in a series of posts about Ko...,HTML,en,USER_1010,BR,SP
3006,ARTICLE_3006,ITA está oferecendo 10 cursos gratuitos a dist...,"O Instituto Tecnológico de Aeronáutica (ITA) ,...",HTML,pt,USER_1210,,


In [19]:
article_info[article_info['Format'] != 'HTML']['articleID']

17      ARTICLE_0017
67      ARTICLE_0067
209     ARTICLE_0209
288     ARTICLE_0288
531     ARTICLE_0531
761     ARTICLE_0761
793     ARTICLE_0793
1100    ARTICLE_1100
1105    ARTICLE_1105
1199    ARTICLE_1199
1256    ARTICLE_1256
1631    ARTICLE_1631
1688    ARTICLE_1688
1878    ARTICLE_1878
1975    ARTICLE_1975
2014    ARTICLE_2014
2249    ARTICLE_2249
2853    ARTICLE_2853
2860    ARTICLE_2860
2988    ARTICLE_2988
Name: articleID, dtype: object

In [113]:
content_relation = pd.read_csv('./contents_relation.csv',index_col=0)
print(content_relation[content_relation['articleID']=='ARTICLE_2255'])
print(content_relation[content_relation['articleID']=='ARTICLE_0411'])

          articleID corr_articleID    cosine
11275  ARTICLE_2255   ARTICLE_2355  0.986928
11276  ARTICLE_2255   ARTICLE_0084  0.986795
11277  ARTICLE_2255   ARTICLE_0016  0.986580
11278  ARTICLE_2255   ARTICLE_2432  0.986204
11279  ARTICLE_2255   ARTICLE_1756  0.985792
         articleID corr_articleID    cosine
2055  ARTICLE_0411   ARTICLE_2804  0.990893
2056  ARTICLE_0411   ARTICLE_2071  0.988654
2057  ARTICLE_0411   ARTICLE_1730  0.987954
2058  ARTICLE_0411   ARTICLE_0146  0.987264
2059  ARTICLE_0411   ARTICLE_0236  0.986673


In [142]:
view_log_train['userID'].values

array(['USER_0000', 'USER_0000', 'USER_0000', ..., 'USER_1420',
       'USER_1420', 'USER_1420'], dtype=object)

In [152]:
sub = []

for val in tqdm(list(set(view_log_train['userID'].values))):
    tp = pd.DataFrame()
    keys = view_log_train[view_log_train['userID'] == val]['articleID'].value_counts().keys()
    for k in keys:
        tp = pd.concat([tp,content_relation[content_relation['articleID']==k]])
    sub.extend(tp['corr_articleID'].value_counts()[:5].keys())
    
sub

100%|██████████████████████████████████████████████████████████████████████████████| 1415/1415 [02:35<00:00,  9.13it/s]


['ARTICLE_1316',
 'ARTICLE_2120',
 'ARTICLE_1466',
 'ARTICLE_2747',
 'ARTICLE_0890',
 'ARTICLE_1106',
 'ARTICLE_2077',
 'ARTICLE_2506',
 'ARTICLE_0137',
 'ARTICLE_2956',
 'ARTICLE_0208',
 'ARTICLE_2428',
 'ARTICLE_1466',
 'ARTICLE_0859',
 'ARTICLE_0679',
 'ARTICLE_1302',
 'ARTICLE_1370',
 'ARTICLE_2183',
 'ARTICLE_0228',
 'ARTICLE_1064',
 'ARTICLE_1612',
 'ARTICLE_0165',
 'ARTICLE_1141',
 'ARTICLE_2959',
 'ARTICLE_0491',
 'ARTICLE_2110',
 'ARTICLE_1294',
 'ARTICLE_1671',
 'ARTICLE_1349',
 'ARTICLE_0902',
 'ARTICLE_2117',
 'ARTICLE_2193',
 'ARTICLE_1879',
 'ARTICLE_0629',
 'ARTICLE_1259',
 'ARTICLE_2608',
 'ARTICLE_0092',
 'ARTICLE_2735',
 'ARTICLE_0020',
 'ARTICLE_0244',
 'ARTICLE_0089',
 'ARTICLE_1122',
 'ARTICLE_3000',
 'ARTICLE_2762',
 'ARTICLE_2732',
 'ARTICLE_1436',
 'ARTICLE_2209',
 'ARTICLE_2593',
 'ARTICLE_0509',
 'ARTICLE_0616',
 'ARTICLE_1544',
 'ARTICLE_1187',
 'ARTICLE_1651',
 'ARTICLE_2983',
 'ARTICLE_2307',
 'ARTICLE_0652',
 'ARTICLE_0606',
 'ARTICLE_1040',
 'ARTICLE_2399

In [154]:
submission['articleID'] = sub
submission.to_csv('baseline_submission.csv', index=False)

In [124]:
title_relation = pd.read_csv('./titles_relation.csv',index_col=0)
tp = pd.DataFrame()
keys = view_log_train[view_log_train['userID'] == 'USER_0000']['articleID'].value_counts().keys()
for k in keys:
    tp = pd.concat([tp,title_relation[title_relation['articleID']==k]])
tp['corr_articleID'].value_counts()[:20]

ARTICLE_0001    2
ARTICLE_2478    2
ARTICLE_0434    1
ARTICLE_2975    1
ARTICLE_1544    1
ARTICLE_2719    1
ARTICLE_1433    1
ARTICLE_1424    1
ARTICLE_1445    1
ARTICLE_2511    1
ARTICLE_2970    1
ARTICLE_0289    1
ARTICLE_0014    1
ARTICLE_0815    1
ARTICLE_2347    1
ARTICLE_2140    1
ARTICLE_0548    1
ARTICLE_0019    1
ARTICLE_1305    1
ARTICLE_0919    1
Name: corr_articleID, dtype: int64

In [94]:
user_article_matrix = view_log_train.groupby(['userID', 'articleID']).size().unstack(fill_value=0)
user_article_matrix

articleID,ARTICLE_0000,ARTICLE_0001,ARTICLE_0002,ARTICLE_0003,ARTICLE_0004,ARTICLE_0005,ARTICLE_0006,ARTICLE_0007,ARTICLE_0008,ARTICLE_0009,...,ARTICLE_2998,ARTICLE_2999,ARTICLE_3000,ARTICLE_3001,ARTICLE_3002,ARTICLE_3003,ARTICLE_3004,ARTICLE_3005,ARTICLE_3006,ARTICLE_3007
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
USER_0000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_0004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USER_1416,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1417,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USER_1419,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
# 사용자-기사 행렬 생성
user_article_matrix = view_log_train.groupby(['userID', 'articleID']).size().unstack(fill_value=0)

# 사용자 간의 유사성 계산
user_similarity = cosine_similarity(user_article_matrix)

# 추천 점수 계산
user_predicted_scores = user_similarity.dot(user_article_matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T


In [97]:
user_predicted_scores.shape

(1415, 2879)

In [102]:
# 이미 조회한 기사 포함해서 추천
recommendations = []
for idx, user in enumerate(user_article_matrix.index):    
    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_indices = user_predicted_scores[idx].argsort()[::-1]
    top5recommend = [article for article in user_article_matrix.columns[sorted_indices]][1:6]
    
    for article in top5recommend:
        recommendations.append([user, article])
        
# sample_submission.csv 형태로 DataFrame 생성
top_recommendations = pd.DataFrame(recommendations, columns=['userID', 'articleID'])

submission['articleID'] = top_recommendations['articleID']

submission.to_csv('baseline_submission.csv', index=False)
