## 요약 데이터 임베딩

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
news_df=pd.read_csv('./Data/news.csv')

news_df.dropna(axis=0,inplace=True)

nltk.download('stopwords')
nltk.download('puntk')
nltk.download('punkt_tab')


# 영어, 소문자로 통일, 불용어 제거, 숫자 제거(x), 특수문자 제거, 마지막에 내용이 줄임표시 되어있는 것들이 있음(...)

stop_words=set(stopwords.words('english'))  


def clean_text(text):
    text=text.lower()
    #text=re.sub(r'\d+','',text)
    text=re.sub(r'[^\w\s]','',text) # 특수문자 제거
    text=re.sub(r'\s+',' ',text) # 연속된 공백 하나의 공백으로
    tokens=word_tokenize(text)
    for token in tokens:
        if token in stop_words:
            tokens.remove(token)    
    text=' '.join(tokens)   
    
    return text
    
news_df['cleaned_abstract']=news_df['abstract'].apply(clean_text)
news_df[['cleaned_abstract','abstract']].head(10)
news_df.shape


[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading puntk: Package 'puntk' not found in index
[nltk_data] Downloading package punkt_tab to /Users/mac/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


(48612, 9)

In [6]:
pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence_transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
Using cached transformers-4.49.0-py3-none-any.whl (10.0 MB)
Using cac

In [8]:
pip install tf-keras


Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading ml_dtypes-0.5.1-cp39-cp39-macosx_10_9_universal2.whl.metadata (21 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.19.0-cp39-cp39-macosx_12_0_arm64.whl (252.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.5/252.5 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading 

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence Transformer 모델 로드 (임베딩 생성)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

#뉴스 요약 데이터 벡터화
embeddings = model.encode(news_df["cleaned_abstract"].tolist(), convert_to_numpy=True)

# FAISS Index 생성 (L2 거리 기반 검색)
dim = embeddings.shape[1]  # 벡터 차원 크기
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print(f"FAISS에 저장된 벡터 개수: {index.ntotal}")  


  from .autonotebook import tqdm as notebook_tqdm


FAISS에 저장된 벡터 개수: 48612


In [9]:
# 저장

faiss.write_index(index, "./Data/news_faiss.index")


In [6]:
def search_news(query, top_k=5):
    """
    사용자의 검색어(query)와 가장 유사한 뉴스 요약을 찾음
    """
    query_embedding = model.encode([query]).astype(np.float32)  # 검색어 벡터화
    distances, indices = index.search(query_embedding, top_k)  # FAISS에서 검색

    print(f"검색어: {query}\n")
    for i, idx in enumerate(indices[0]):
        print(f" Top-{i+1} (유사도 점수: {distances[0][i]:.4f})")
        print(f"   {news_df.iloc[idx]['abstract']}\n")

# 테스트 실행
search_news("climate change impact")


검색어: climate change impact

 Top-1 (유사도 점수: 0.5713)
   The economic effects of global warming may arrive sooner and with a bigger impact than previously thought, according to Oxford Economics in a report that compares recent scientific research with the economic literature on the costs of climate change.

 Top-2 (유사도 점수: 0.6713)
   It aims to draw attention to climate change

 Top-3 (유사도 점수: 0.6948)

 Top-4 (유사도 점수: 0.7305)
   Humans are affecting Earth's climate, and there's a wealth of scientific evidence to support that fact. We've already begun to see some seriously troubling trends that may be associated with our altering of the climate, including mass die-offs of ocean coral, widespread drought, and increasingly powerful and unpredictable storm systems. With all that in mind, scientists from Colorado State University are doing what they can to preserve a record...

 Top-5 (유사도 점수: 0.7333)
   How many scientists does it take to convince the world to take climate change seriously?
