# Upload data to Vectore Store Database

In [1]:
!pip install --quiet langchain pydantic unstructured nest_asyncio openai tiktoken


In [2]:
# fixes a bug with asyncio and jupyter
import nest_asyncio

nest_asyncio.apply()

In [3]:
import json
from langchain.schema.document import Document
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
docs = [Document(page_content=doc_dict['content'], metadata=doc_dict['metadata']) for doc_dict in data]
print(docs)

[Document(page_content='Record a 1-2 minute video introducing yourself to your peers, and upload it by August 25th at 8:00 p.m. Use the recording option in this thread. State your name, where you are from, where you are, and what major you are interested in. Then share two true pieces of information about yourself and one lie. Your peers will guess which pieces of info are true and which one is the lie.', metadata={'course': 'ENC 1101', 'title': 'Week 1 - Introduction Post', 'description': 'First week introduction post on presenting yourself to classmates.'}), Document(page_content='After you complete this reading on strategies to approach an assignment, identify five main ideas that you think are the most important and useful takeaways for you. For each of the five, write one sentence explaining the point and why you think it is important and useful for you and your process. Be sure to use complete sentences and reference the point directly from the reading. Upload your five sentences

In [4]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)

docs_Chunks = text_splitter.split_documents(docs)
docs_Chunks

[Document(page_content='Record a 1-2 minute video introducing yourself to your peers, and upload it by August 25th at 8:00 p.m. Use the recording option in this thread. State your name, where you are from, where you are, and what major you are interested in. Then share two true pieces of information about yourself and one lie. Your peers will guess which pieces of info are true and which one is the lie.', metadata={'course': 'ENC 1101', 'title': 'Week 1 - Introduction Post', 'description': 'First week introduction post on presenting yourself to classmates.'}),
 Document(page_content='After you complete this reading on strategies to approach an assignment, identify five main ideas that you think are the most important and useful takeaways for you. For each of the five, write one sentence explaining the point and why you think it is important and useful for you and your process. Be sure to use complete sentences and reference the point directly from the reading. Upload your five sentence

In [5]:
# from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

In [9]:
# model_name = "BAAI/bge-small-en-v1.5"
# # model_kwargs = {'device': 'cpu'}
# model_kwargs = {'device': 'cuda'}
# encode_kwargs = {'normalize_embeddings': True}
# embeddings = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )
import os
from dotenv import load_dotenv
load_dotenv()
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [10]:
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False)

In [11]:
!pip install --quiet pinecone-client

In [12]:
import pinecone

# initialize pinecone
pinecone.init(
    api_key="58bd5618-e5fe-44e5-8585-5060c19c7a0a",
    environment="gcp-starter"
)

  from tqdm.autonotebook import tqdm


In [13]:
index = pinecone.Index("classbot")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [14]:
for idx, doc in enumerate(docs_Chunks):
    metadata_size = len(str(docs_Chunks[idx].page_content).encode('utf-8'))
    if metadata_size > 30500:  # 40960 bytes is the limit
        print(f"Document at index {idx} has metadata size: {metadata_size} bytes")
        # docs_Chunks.pop(idx)
        print(f"Removed document at index {idx}")
len(docs_Chunks)

6

In [14]:
from langchain.vectorstores import Pinecone

index_name = "classbot"

# create a new index
docsearch = Pinecone.from_documents(docs_Chunks, embeddings, index_name = index_name)


# if you already have an index, you can load it like this
docsearch = Pinecone.from_existing_index(index_name, embeddings)

In [21]:
index = pinecone.Index("classbot")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10}