In [39]:
from dotenv import load_dotenv
from data_ingestion.preprocessor import Preprocessor

# Load key/value pairs from a local .env file into the process environment so
# later cells can read secrets (e.g. PINECONE_API_KEY) from os.environ.
# The call's return value is displayed as this cell's output.
load_dotenv()

True

In [40]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used to vectorise the paper chunks. The Pinecone index in
# this notebook is created with dimension=1536, which has to agree with the
# vector size this model produces.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [41]:
import os
from pinecone import Pinecone, ServerlessSpec
# The key is read from the environment (never hard-coded); .get() yields None
# when the variable is absent, and that value is passed through unchanged.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [42]:
import time

index_name = "research-assistant-agent-index"

# Vector size of the index; must agree with the embedding model that fills it.
index_dimension = 1536
# Upper bound (seconds) on how long to poll for readiness before giving up.
index_ready_timeout_s = 300

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only on first run; later runs reuse the existing one.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=index_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Wait for readiness whether or not the index was created just now: if an
# earlier run was interrupted mid-wait, the index already exists but may still
# be initialising. The deadline keeps a failed provisioning from hanging the
# kernel forever.
ready_deadline = time.monotonic() + index_ready_timeout_s
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Index {index_name!r} was not ready after {index_ready_timeout_s} seconds"
        )
    print("Waiting for index to be ready, please wait a while...")
    time.sleep(1)

index = pc.Index(index_name)


In [43]:
from langchain_pinecone import PineconeVectorStore

# Reuse the `embeddings` client built earlier in the notebook rather than
# constructing a second OpenAIEmbeddings instance with identical arguments.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [44]:
from data_ingestion.preprocessor import res
from langchain.schema import Document

In [45]:
# Preview the first five chunk rows before turning them into documents.
res.head(n=5)

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prev_chunk_id,next_chunk_id
0,9308101v1_0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,Journal of Arti/cial In telligence Researc h ...,9308101v1_,9308101v1_1
1,9308101v1_1,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"problem/. In this pap er/, w e presen t a meth...",9308101v1_0,9308101v1_2
2,9308101v1_2,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,b y earlier approac hes/.\n/1/. In tro duction...,9308101v1_1,9308101v1_3
3,9308101v1_3,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,remaining problem in t w o/. W e no w b egin t...,9308101v1_2,9308101v1_4
4,9308101v1_4,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,there is no p oin t in w asting time completin...,9308101v1_3,9308101v1_5


In [46]:
# One vector ID per row, taken from the `id` column in row order.
vector_store_ids = res["id"].tolist()


def _row_to_document(row):
    """Wrap one row of `res` as a Document: chunk text plus lookup metadata."""
    # NOTE(review): 'chunk' repeats the text already carried by page_content;
    # confirm whether downstream readers rely on this metadata key.
    chunk_metadata = {
        "arxiv_id": row["arxiv_id"],
        "title": row["title"],
        "chunk": row["chunk"],
        "prev_chunk_id": row["prev_chunk_id"],
        "next_chunk_id": row["next_chunk_id"],
    }
    return Document(page_content=row["chunk"], metadata=chunk_metadata)


# Same row order as vector_store_ids, so IDs and documents line up by position.
documents = [_row_to_document(row) for _, row in res.iterrows()]

In [47]:
# Embed and store every chunk; IDs are supplied explicitly so each vector is
# stored under its chunk id. The call's return value is shown as cell output.
vector_store.add_documents(
    documents=documents,
    ids=vector_store_ids,
)

['9308101v1_0',
 '9308101v1_1',
 '9308101v1_2',
 '9308101v1_3',
 '9308101v1_4',
 '9308101v1_5',
 '9308101v1_6',
 '9308101v1_7',
 '9308101v1_8',
 '9308101v1_9',
 '9308101v1_10',
 '9308101v1_11',
 '9308101v1_12',
 '9308101v1_13',
 '9308101v1_14',
 '9308101v1_15',
 '9308101v1_16',
 '9308101v1_17',
 '9308101v1_18',
 '9308101v1_19',
 '9308101v1_20',
 '9308101v1_21',
 '9308101v1_22',
 '9308101v1_23',
 '9308101v1_24',
 '9308101v1_25',
 '9308101v1_26',
 '9308101v1_27',
 '9308101v1_28',
 '9308101v1_29',
 '9308101v1_30',
 '9308101v1_31',
 '9308101v1_32',
 '9308101v1_33',
 '9308101v1_34',
 '9308101v1_35',
 '9308101v1_36',
 '9308101v1_37',
 '9308101v1_38',
 '9308101v1_39',
 '9308101v1_40',
 '9308101v1_41',
 '9308101v1_42',
 '9308101v1_43',
 '9308101v1_44',
 '9308101v1_45',
 '9308101v1_46',
 '9308101v1_47',
 '9308101v1_48',
 '9308101v1_49',
 '9308101v1_50',
 '9308101v1_51',
 '9308101v1_52',
 '9308101v1_53',
 '9308101v1_54',
 '9308101v1_55',
 '9308101v1_56',
 '9308101v1_57',
 '9308101v1_58',
 '93081

In [48]:
# Sanity check after ingestion: fetch the index statistics and display them.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 705}},
 'total_vector_count': 705}