In [1]:
import pickle
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [2]:
# Load the preprocessed document chunks from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only run this against a preprocessed_docs.pkl file that you trust.
with open("preprocessed_docs.pkl", "rb") as f:
    split_docs = pickle.load(f)

print(f"Loaded {len(split_docs)} text chunks!")
# Guard the preview: indexing [0] on an empty list would raise IndexError.
if split_docs:
    print(f"Sample Chunk:\n{split_docs[0].page_content[:500]}")
else:
    print("No chunks were loaded; nothing to preview.")

Loaded 1116 text chunks!
Sample Chunk:
hide_table_of_contents: true import People from "@theme/People"; People There are some incredible humans from all over the world who have been instrumental in helping the LangChain community flourish 🌐! This page highlights a few of those folks who have dedicated their time to the open-source repo in the form of direct contributions and reviews. Top reviewers As LangChain has grown, the amount of surface area that maintainers cover has grown as well. Thank you to the following folks who have gon


In [3]:
# Load variables from .env file so the os.getenv() lookups that follow can
# read them. The call's return value is shown as this cell's output.
load_dotenv()

True

In [4]:
# Read the service credentials/settings from the environment.
# Each lookup evaluates to None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # unused: OpenAI calls failed with insufficient_quota
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")  # used as the serverless region when the index is created

In [5]:
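# OpenAI embeddings are currently disabled (the API returned insufficient_quota).
# The original setup is kept below for reference. To re-enable it, make sure the
# OpenAI API key is set in the environment variables:
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Then initialize the OpenAI embedding model:
# embedding_model = OpenAIEmbeddings()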

In [6]:
# Embedding model used for the rest of the notebook (in place of the
# commented-out OpenAI model). The Pinecone index in this notebook is created
# with dimension=384, which must match this model's output size.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [7]:
# Embed every chunk with a single batched call. Calling embed_documents once
# per chunk (with a one-element list) pays the per-call overhead for every
# document and prevents the model from batching the texts.
chunk_texts = [doc.page_content for doc in split_docs]
# Skip the model call entirely when there is nothing to embed.
chunk_vectors = embedding_model.embed_documents(chunk_texts) if chunk_texts else []

# One vector is expected per input text, in input order; fail loudly if that
# does not hold so a record is never paired with the wrong vector.
assert len(chunk_vectors) == len(split_docs), "embedding count does not match chunk count"

# Build one Pinecone-ready record (id / values / metadata) per chunk.
embedded_docs = [
    {
        "id": str(i),  # Unique ID for Pinecone (position of the chunk in split_docs)
        "values": vector,  # Embedding vector for this chunk
        "metadata": {
            "text": doc.page_content,  # Store full chunk text
            "source": doc.metadata.get("source", "unknown"),  # Store source info
        },
    }
    for i, (doc, vector) in enumerate(zip(split_docs, chunk_vectors))
]

print(f"Generated {len(embedded_docs)} embeddings!")


Generated 1116 embeddings!


In [8]:
# Persist the generated records to embeddings.pkl first, so a local copy is
# on disk BEFORE the upload to Pinecone runs.
with open("embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(embedded_docs, embeddings_file)

In [9]:

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index that holds the document embeddings.
index_name = "langchain-docs"

# Create the index only when the account does not already have one by this name.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(
        cloud="aws",
        region=PINECONE_ENV,  # Ensure this matches your Pinecone account region
    )
    pc.create_index(
        name=index_name,
        dimension=384,  # Must match embedding model output size
        metric="euclidean",
        spec=serverless_spec,
    )

# Handle used by later cells to write to the index.
index = pc.Index(index_name)

print(f"Successfully connected to Pinecone index: {index_name}")

Successfully connected to Pinecone index: langchain-docs


In [10]:
# Spot-check: display the chunk text stored in the first record's metadata.
embedded_docs[0]["metadata"]['text']

'hide_table_of_contents: true import People from "@theme/People"; People There are some incredible humans from all over the world who have been instrumental in helping the LangChain community flourish 🌐! This page highlights a few of those folks who have dedicated their time to the open-source repo in the form of direct contributions and reviews. Top reviewers As LangChain has grown, the amount of surface area that maintainers cover has grown as well. Thank you to the following folks who have gone above and beyond in reviewing incoming PRs 🙏! Top recent contributors The list below contains contributors who have had the most PRs merged in the last three months, weighted (imperfectly) by impact. Thank you all so much for your time and efforts in making LangChain better ❤️! Core maintainers Hello there 👋! We\'re LangChain\'s core maintainers. If you\'ve spent time in the community, you\'ve probably crossed paths with at least one of us already. Top all-time contributors And finally, this 

In [11]:
# Upload the records to Pinecone in fixed-size batches.
batch_size = 100
total_records = len(embedded_docs)

for batch_start in range(0, total_records, batch_size):
    batch = embedded_docs[batch_start : batch_start + batch_size]

    # Each record is sent as an (id, values, metadata) tuple.
    vectors = [
        (record["id"], record["values"], record["metadata"])
        for record in batch
    ]
    index.upsert(vectors=vectors)

print("Successfully stored embeddings in Pinecone!")


Successfully stored embeddings in Pinecone!
