In [1]:
# python-dotenv: populate the process environment from a local .env file so
# the os.getenv() lookups in the following cells can find their values.
from dotenv import load_dotenv

# Being the last expression in the cell, the call's return value is displayed.
load_dotenv()

True

In [2]:
import os

# Credentials, model names and index settings come from the environment.
# os.getenv() returns None for any variable that is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL")
OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_REGION = os.getenv("PINECONE_INDEX_REGION")
PINECONE_INDEX_CLOUD = os.getenv("PINECONE_INDEX_CLOUD")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_METRIC = os.getenv("PINECONE_INDEX_METRIC")

# The dimension is the only value converted here. Validate it explicitly so a
# missing or malformed variable produces a message naming the variable instead
# of the opaque "int() argument must be ... not 'NoneType'" TypeError.
_raw_index_dimension = os.getenv("PINECONE_INDEX_DIMENSION")
if _raw_index_dimension is None or not _raw_index_dimension.strip():
    raise ValueError(
        "PINECONE_INDEX_DIMENSION is not set; define it in the environment or .env file"
    )
try:
    PINECONE_INDEX_DIMENSION = int(_raw_index_dimension)
except ValueError as exc:
    raise ValueError(
        f"PINECONE_INDEX_DIMENSION must be an integer, got {_raw_index_dimension!r}"
    ) from exc
if PINECONE_INDEX_DIMENSION <= 0:
    raise ValueError(
        f"PINECONE_INDEX_DIMENSION must be positive, got {PINECONE_INDEX_DIMENSION}"
    )


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Client used by the rest of the notebook to manage and open indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Creating an index that already exists is rejected by Pinecone, which would
# make this cell fail on every run after the first. Only create it when it is
# missing so the notebook survives Restart Kernel -> Run All.
if not pc.has_index(PINECONE_INDEX_NAME):
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=PINECONE_INDEX_DIMENSION,
        metric=PINECONE_INDEX_METRIC,
        spec=ServerlessSpec(
            region=PINECONE_INDEX_REGION,
            cloud=PINECONE_INDEX_CLOUD,
        ),
    )

In [6]:
# Open a handle to the configured index, then display its current statistics
# (the last expression of the cell is rendered as the cell output).
wine_index = pc.Index(name=PINECONE_INDEX_NAME)
wine_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [7]:
from langchain_community.document_loaders import CSVLoader

# Read the wine review CSV; the loader yields one Document per CSV row.
loader = CSVLoader(
    file_path="./winemag-data-130k-v2.csv",
    encoding="utf-8",
)
docs = loader.load()

# Show the first document as a sanity check of the loaded content.
docs[0]

Document(metadata={'source': './winemag-data-130k-v2.csv', 'row': 0}, page_content=": 0\ncountry: Italy\ndescription: Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.\ndesignation: Vulkà Bianco\npoints: 87\nprice: \nprovince: Sicily & Sardinia\nregion_1: Etna\nregion_2: \ntaster_name: Kerin O’Keefe\ntaster_twitter_handle: @kerinokeefe\ntitle: Nicosia 2013 Vulkà Bianco  (Etna)\nvariety: White Blend\nwinery: Nicosia")

In [9]:
# Number of loaded documents, followed by the character length of the
# longest page_content among them.
print(len(docs))
print(max(map(lambda doc: len(doc.page_content), docs)))

129971
1115


In [10]:
from langchain_openai import OpenAIEmbeddings

# Embedding client configured with the model name and API key read from the
# environment in the configuration cell.
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=OPENAI_EMBEDDING_MODEL,
)

In [None]:
from langchain_pinecone import PineconeVectorStore

BATCH_SIZE = 300

# Connect to the existing index once and reuse the store for every batch,
# instead of building a new vector store (and index connection) per batch.
# A connection/configuration problem now surfaces here, once, rather than as
# one swallowed error per batch.
vector_store = PineconeVectorStore(
    index_name=PINECONE_INDEX_NAME,
    embedding=embedding,
)

# (first, last) document positions of every batch that could not be indexed.
failed_batches = []

for i in range(0, len(docs), BATCH_SIZE):
    batch = docs[i:i + BATCH_SIZE]

    # Deterministic IDs derived from the source file and CSV row recorded in
    # each document's metadata. Without explicit IDs the store generates random
    # ones, so re-running this cell (e.g. after an interruption) would insert
    # duplicate vectors; with stable IDs a re-run overwrites the same records.
    batch_ids = [
        f"{doc.metadata['source']}#{doc.metadata['row']}" for doc in batch
    ]

    try:
        vector_store.add_documents(documents=batch, ids=batch_ids)

        print(f"{i}~{i+len(batch)-1} documents indexed")
    except Exception as e:
        # Best effort: keep going with the remaining batches, but remember
        # which range was lost so it can be retried instead of silently missing.
        failed_batches.append((i, i + len(batch) - 1))
        print(f"Error indexing documents {i}~{i+len(batch)-1}: {e}")

if failed_batches:
    print(f"{len(failed_batches)} batch(es) were NOT indexed: {failed_batches}")


0~299 documents indexed
300~599 documents indexed
600~899 documents indexed
900~1199 documents indexed
1200~1499 documents indexed
1500~1799 documents indexed
1800~2099 documents indexed
2100~2399 documents indexed
2400~2699 documents indexed
2700~2999 documents indexed
3000~3299 documents indexed
3300~3599 documents indexed
3600~3899 documents indexed
3900~4199 documents indexed
4200~4499 documents indexed
4500~4799 documents indexed
4800~5099 documents indexed
5100~5399 documents indexed
