In [1]:
import json
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# Load the raw JSON records from disk.
with open("Json_Data/augmented_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Split the records into two parallel columns. `.get` is used so that a record
# missing a key yields None (and is dropped by the filter below) instead of
# aborting the whole load with a KeyError.
contexts = [entry.get("context") for entry in data]
responses = [entry.get("response") for entry in data]

# Fail early when the file contained no records at all.
if not contexts or not responses:
    raise ValueError("No valid context or response data found in the JSON file.")

dataset = Dataset.from_dict({"context": contexts, "response": responses})


def _has_text(value):
    """Return True when `value` is a string containing non-whitespace characters."""
    return isinstance(value, str) and value.strip() != ""


# Keep only the rows where both the context and the response carry real text.
filtered_dataset = dataset.filter(
    lambda example: _has_text(example["context"]) and _has_text(example["response"])
)

# Extract the filtered columns as plain Python lists so that later cells
# (embedding, pickling) work with ordinary lists rather than whatever column
# wrapper the installed `datasets` version returns.
contexts = list(filtered_dataset["context"])
responses = list(filtered_dataset["response"])

# The filter may have removed every row; stop here rather than letting the
# embedding step fail later with a less obvious error.
if not contexts:
    raise ValueError("No valid context or response data found in the JSON file.")

# Sanity check: both columns come from the same filtered rows, so their
# lengths must match.
if len(contexts) != len(responses):
    raise ValueError("Filtered contexts and responses are not synchronized.")

Filter:   0%|          | 0/27960 [00:00<?, ? examples/s]

In [2]:
# Text embedding model (chosen as a cost-efficient option).
embedder = SentenceTransformer('all-MiniLM-L12-v2')

# Embed the context strings. The column is materialized to a plain list first
# so the encoder receives an ordinary sequence of strings.
context_texts = list(contexts)
embeddings = embedder.encode(context_texts, show_progress_bar=True)

# Refuse to build an index from an empty or missing embedding result.
if embeddings is None or len(embeddings) == 0:
    raise ValueError("Embedding generation failed. The embeddings array is empty or invalid.")

# Build a flat L2 FAISS index sized to the embedding width and add all vectors.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Row i of the index must correspond to response i in the pickle. Verify this
# BEFORE writing anything, so a mismatch never leaves misaligned artifacts on disk.
response_texts = list(responses)
if index.ntotal != len(response_texts):
    raise ValueError(
        "Index size and response count do not match; refusing to save misaligned artifacts."
    )

# Persist the index.
faiss.write_index(index, "megumin_context_index.faiss")

# Persist the responses as a plain list of strings.
with open("megumin_responses.pkl", "wb") as f:
    pickle.dump(response_texts, f)

print("Index and response data successfully saved!")

Batches:   0%|          | 0/871 [00:00<?, ?it/s]

Index and response data successfully saved!
