# **사용 라이브러리**

In [2]:
#필요한 라이브러리
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

# **데이터**

In [None]:
# Load the book data (the data was prepared/refined using the Aladin API).
# The path is relative, so 'books.csv' is resolved against the current
# working directory.
file_path = 'books.csv'
df_books = pd.read_csv(file_path)

In [None]:
# Handle missing values: replace missing descriptions with an empty string so
# every entry in the 'description' column is filled in.
df_books['description'] = df_books['description'].fillna('')

# **텍스트 전처리**

In [None]:
# Compiled once; the same patterns are applied to every description.
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_NON_HANGUL_PATTERN = re.compile(r'[^가-힣\s]')
# Shared Okt tagger, created lazily on the first call that actually needs it.
_okt_tagger = None


def preprocess_text_with_nouns(text):
  """Return the Korean nouns found in *text*, joined by single spaces.

  HTML tags are removed first, then every character that is neither a Hangul
  syllable nor whitespace, and finally the nouns are extracted with KoNLPy's
  Okt tagger.

  Args:
    text: Raw description text. Non-string values (e.g. NaN or None coming
      from a DataFrame) are treated as empty.

  Returns:
    A space-separated string of nouns; '' when there is nothing to analyse.
  """
  global _okt_tagger

  if not isinstance(text, str):
    return ''

  text = _HTML_TAG_PATTERN.sub('', text)  # strip HTML tags
  text = _NON_HANGUL_PATTERN.sub('', text)  # drop everything except Hangul/whitespace
  if not text.strip():
    # Nothing left to tag, so skip starting the tagger altogether.
    return ''

  if _okt_tagger is None:
    # Imported here so konlpy is only required once tagging is really needed;
    # the instance is reused because constructing Okt for every row is costly.
    from konlpy.tag import Okt
    _okt_tagger = Okt()

  return ' '.join(_okt_tagger.nouns(text))

# **TF-IDF 벡터화**

In [None]:
# TF-IDF vectorizer capped at 1000 features. The class is TfidfVectorizer and
# the variable is named `vectorizer`, the name the recommendation code reads.
vectorizer = TfidfVectorizer(max_features = 1000)

# **BERT 임베딩 설정**

In [None]:
# Load the tokenizer and the model from the "monologg/kobert" checkpoint.
# Both are module-level objects that get_embedding() uses.
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
model = AutoModel.from_pretrained("monologg/kobert")

def get_embedding(text):
  """Embed *text* with the module-level tokenizer and model.

  The text is tokenized (truncated if too long), passed through the model, and
  the model's ``last_hidden_state`` is averaged over dimension 1.

  Args:
    text: The text to embed.

  Returns:
    A NumPy array holding the averaged hidden state. For a single input
    string this is presumably one row per input, i.e. shape (1, hidden_size);
    the caller stacks these rows with ``np.vstack``.
  """
  inputs = tokenizer(text, return_tensors = "pt", truncation = True, padding = True)
  # Inference only: disabling autograd avoids building a graph for every
  # description, which saves memory and time.
  with torch.no_grad():
    outputs = model(**inputs)
  return outputs.last_hidden_state.mean(dim = 1).numpy()

# **추천 시스템 함수**

In [None]:
def recommend_books_v2(category_filter, top_n = 5, method = 'tfidf'):
  """Recommend books of one category that resemble that category's first book.

  Rows of ``df_books`` whose ``category_name`` equals ``category_filter`` are
  selected, their descriptions are turned into vectors, and the books are
  ranked by cosine similarity to the first selected row (the reference book).

  Args:
    category_filter: Exact ``category_name`` value to select.
    top_n: Maximum number of recommendations to return.
    method: ``'tfidf'`` for TF-IDF over the nouns of each description, or
      ``'bert'`` for embeddings produced by ``get_embedding``.

  Returns:
    A list of ``{'title': ..., 'score': ...}`` dicts ordered by descending
    similarity (the reference book itself is excluded), or a message string
    when the category has no books or ``method`` is not supported.
  """
  filtered_books = df_books[df_books['category_name'] == category_filter]
  if filtered_books.empty:
    return "선택한 카테고리에 해당하는 책이 없습니다."

  # Renumber rows from 0 so positions in the score vector match .iloc below.
  filtered_books = filtered_books.reset_index(drop = True)

  if method == 'tfidf':
    processed = filtered_books['description'].apply(preprocess_text_with_nouns)
    # A vectorizer is built per call: the vocabulary is fitted on the selected
    # category only, so nothing needs to be shared between calls.
    features = TfidfVectorizer(max_features = 1000).fit_transform(processed)
  elif method == 'bert':
    features = np.vstack([get_embedding(desc) for desc in filtered_books['description']])
  else:
    return "지원하지 않는 추천 방식입니다. 'tfidf' 또는 'bert'를 선택하세요."

  # Only the reference row is ever read, so compare row 0 against all rows
  # instead of computing the full pairwise matrix.
  reference_scores = cosine_similarity(features[0:1], features)[0]

  # Exclude the reference book by index rather than assuming it sorts first;
  # ties or an all-zero vector would otherwise make that assumption fragile.
  candidates = [(idx, score) for idx, score in enumerate(reference_scores) if idx != 0]
  candidates.sort(key = lambda pair: pair[1], reverse = True)

  return [
    {'title': filtered_books.iloc[idx]['title'], 'score': score}
    for idx, score in candidates[:top_n]
  ]

In [None]:
# Run the recommendation: pick a category, how many results to return and
# which similarity method to use ('bert' here).
category_filter = '소설/시/희곡'
top_n = 5
method = 'bert'

recommendations = recommend_books_v2(category_filter, top_n, method)
# Bare expression: as the last line of a notebook cell it displays the result.
recommendations