In [1]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_upstage import UpstageDocumentParseLoader
from langchain_upstage import UpstageEmbeddings
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [2]:
# Populate the process environment from a .env file; this has to run before
# the API-key lookup below.
load_dotenv()

# Upstage embedding model used to vectorize text.
# NOTE(review): "embedding-query" reads like the query-side model name, yet
# this same object is later used to embed the documents being indexed --
# confirm that this is intended.
embedding_upstage = UpstageEmbeddings(model="embedding-query")

# Pinecone client, authenticated with the key taken from the environment.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index to write to, and the source text file name.
index_name = "samsung"
txt_path = "output.txt"

# Create the index only when it does not exist yet; an existing index is
# left untouched.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # presumably the output size of the embedding model above -- verify
        dimension=4096,
        metric="cosine",
        spec=serverless_spec,
    )

In [10]:
from langchain.schema import Document

# 1. Read the whole source text file into memory. The path comes from
#    `txt_path` (set in the configuration cell) so the file name is defined in
#    one place instead of being repeated here as a literal.
with open(txt_path, "r", encoding="utf-8") as file:
    text_content = file.read()

# 2. Wrap the text in a single Document and expose it as a list of documents,
#    which is what the splitting step consumes.
docs = [Document(page_content=text_content)]

In [6]:
# Splitter that cuts documents into chunks of at most 1000 units with a
# 100-unit overlap between neighbouring chunks (units are characters unless a
# custom length function is configured -- none is passed here).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)

In [11]:
# Break the loaded documents into overlapping chunks.
splits = text_splitter.split_documents(docs)

# Give every chunk a stable ID derived from the source file name and the
# chunk position. Without explicit IDs the vector store assigns new ones on
# every call, so re-running this cell would insert the same chunks again as
# duplicates; with stable IDs a re-run overwrites the existing vectors.
# NOTE(review): if the file shrinks, chunks with higher positions from an
# earlier run stay in the index -- delete them separately if that matters.
chunk_ids = [f"{txt_path}-{position}" for position in range(len(splits))]

# Embed the chunks and upsert them into the Pinecone index. The returned
# store is kept so later cells can query it without rebuilding it.
vectorstore = PineconeVectorStore.from_documents(
    splits,
    embedding_upstage,
    index_name=index_name,
    ids=chunk_ids,
)
vectorstore

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x288a082cb30>