In [39]:
import os
import time
from getpass import getpass

from tqdm.auto import tqdm

import pandas as pd

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec, Index

from dotenv import load_dotenv, find_dotenv

# Load the API keys from .env
load_dotenv(find_dotenv(), override=True)

True

In [None]:
OUTPUT_JSON_FILE = 'arxiv_papers.json'
DATA_FOLDER = '../data'
PDF_FOLDER = '../data/pdfs/'
OUTPUT_JSON_FILEPATH = os.path.join(DATA_FOLDER, OUTPUT_JSON_FILE)
DF_PDF_CSV_FILE = "arxiv_papers_with_pdfs.csv"
DF_PDF_CSV_FILEPATH = os.path.join(PDF_FOLDER, DF_PDF_CSV_FILE)


CHUNK_SIZE = 512
CHUNK_OVERLAP = 64

INDEX_NAME = 'langgraph-research-agent'
BATCH_SIZE = 64

EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
EMBEDDING_DIMS = 384


## Creating Data Embeddings

### Splitting and Chunking of Data

In [None]:

def load_and_chunk_pdf(pdf_file_name: str, 
                       saved_dir: str=PDF_FOLDER,
                       chunk_size: int=CHUNK_SIZE, 
                       chunk_overlap: int=CHUNK_OVERLAP) -> list[str]:
    """
    Loads a PDF file into chunks and returns a list of chunks.
    Args:
        pdf_file_name (str): The name of the PDF file.
        saved_dir (str): The directory where the PDF file is saved. Default is PDF_FOLDER.
        chunk_size (int): The size of each chunk in bytes. Default is CHUNK_SIZE.
        chunk_overlap (int): The overlap between chunks in bytes. Default is CHUNK_OVERLAP.
    Returns:
        List[str]: A list of chunks from the PDF file.
    """

    print(f'Loading and splitting into chunks: {pdf_file_name}')
    # name = remove_dot_from_filename(pdf_file_name)
    # print(name)
    
    pdf_file_path = os.path.join(saved_dir, pdf_file_name)

    # Load the PDF file into a DocumentLoader object
    loader = PyPDFLoader(pdf_file_path)
    data = loader.load()

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, 
                                                   chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)

    return chunks

In [37]:
def add_chunks_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds chunks to the DataFrame, including their IDs and metadata.
    Args:
        df (pd.DataFrame): The DataFrame containing the paper details.
    Returns:
        pd.DataFrame: The updated DataFrame with added chunk information.
    """

    expanded_rows = []  # List to store expanded rows with chunk information

    # Loop through each row in the DataFrame
    for idx, row in df.iterrows():
        try:
            chunks = load_and_chunk_pdf(row['pdf_file_name'])
        except Exception as e:
            print(f"Error processing file {row['pdf_file_name']}: {e}")
            continue

        for i, chunk in enumerate(chunks):
            pre_chunk_id = i-1 if i > 0 else ''  # Preceding chunk ID
            post_chunk_id = i+1 if i < len(chunks) - 1 else ''  # Following chunk ID

            expanded_rows.append({
                'id': f"{row['arxiv_id']}#{i}",  # Unique chunk identifier
                'title': row['title'],
                'summary': row['summary'],
                'authors': row['authors'],
                'arxiv_id': row['arxiv_id'],
                'url': row['url'],
                'chunk': chunk.page_content,  # Text content of the chunk
                'pre_chunk_id': '' if i == 0 else f"{row['arxiv_id']}#{pre_chunk_id}",  # Previous chunk ID
                'post_chunk_id': '' if i == len(chunks) - 1 else f"{row['arxiv_id']}#{post_chunk_id}"  # Next chunk ID
            })
    # Return a new expanded DataFrame
    return pd.DataFrame(expanded_rows)

In [34]:
df_with_pdfs = pd.read_csv(DF_PDF_CSV_FILEPATH)
df_with_pdfs.head()

Unnamed: 0,title,summary,authors,url,pdf_link,published,arxiv_id,pdf_file_name
0,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",http://arxiv.org/abs/2202.03371v1,http://arxiv.org/pdf/2202.03371v1,2022-02-07T17:40:43Z,2202.03371v1,2202.03371v1.pdf
1,A Precis of Language Models are not Models of ...,Natural Language Processing is one of the lead...,['Csaba Veres'],http://arxiv.org/abs/2205.07634v1,http://arxiv.org/pdf/2205.07634v1,2022-05-16T12:50:58Z,2205.07634v1,2205.07634v1.pdf
2,Integrating AI Planning with Natural Language ...,Natural language processing (NLP) aims at inve...,"['Kebing Jin', 'Hankz Hankui Zhuo']",http://arxiv.org/abs/2202.07138v2,http://arxiv.org/pdf/2202.07138v2,2022-02-15T02:19:09Z,2202.07138v2,2202.07138v2.pdf
3,Multilingual Text Classification for Dravidian...,As the fourth largest language family in the w...,"['Xiaotian Lin', 'Nankai Lin', 'Kanoksak Watta...",http://arxiv.org/abs/2112.01705v1,http://arxiv.org/pdf/2112.01705v1,2021-12-03T04:26:49Z,2112.01705v1,2112.01705v1.pdf
4,PersianLLaMA: Towards Building First Persian L...,Despite the widespread use of the Persian lang...,"['Mohammad Amin Abbasi', 'Arash Ghafouri', 'Ma...",http://arxiv.org/abs/2312.15713v1,http://arxiv.org/pdf/2312.15713v1,2023-12-25T12:48:55Z,2312.15713v1,2312.15713v1.pdf


In [40]:
df_with_chunks = add_chunks_to_df(df_with_pdfs)
df_with_chunks.head()

Loading and splitting into chunks: 2202.03371v1.pdf
Loading and splitting into chunks: 2205.07634v1.pdf
Loading and splitting into chunks: 2202.07138v2.pdf
Loading and splitting into chunks: 2112.01705v1.pdf
Loading and splitting into chunks: 2312.15713v1.pdf
Loading and splitting into chunks: 2305.06530v1.pdf
Loading and splitting into chunks: 2401.04155v1.pdf
Loading and splitting into chunks: 2112.07055v2.pdf
Loading and splitting into chunks: 2404.04748v1.pdf
Loading and splitting into chunks: 2210.14473v1.pdf
Loading and splitting into chunks: 2212.03419v1.pdf
Loading and splitting into chunks: 2406.07259v1.pdf
Loading and splitting into chunks: 1608.04434v1.pdf
Loading and splitting into chunks: 2408.02237v1.pdf
Loading and splitting into chunks: 2308.15118v1.pdf
Loading and splitting into chunks: 2311.04329v2.pdf
Loading and splitting into chunks: 2406.17873v1.pdf
Loading and splitting into chunks: 2101.06949v1.pdf
Loading and splitting into chunks: 2210.07041v1.pdf
Loading and 

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,pre_chunk_id,post_chunk_id
0,2202.03371v1#0,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",2202.03371v1,http://arxiv.org/abs/2202.03371v1,CEDILLE :\nA LARGE AUTOREGRESSIVE LANGUAGE MOD...,,2202.03371v1#1
1,2202.03371v1#1,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",2202.03371v1,http://arxiv.org/abs/2202.03371v1,"auto-regressive language model, speciﬁcally tr...",2202.03371v1#0,2202.03371v1#2
2,2202.03371v1#2,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",2202.03371v1,http://arxiv.org/abs/2202.03371v1,Large autoregressive language models have draw...,2202.03371v1#1,2202.03371v1#3
3,2202.03371v1#3,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",2202.03371v1,http://arxiv.org/abs/2202.03371v1,"Although large language models, such as GPT-3 ...",2202.03371v1#2,2202.03371v1#4
4,2202.03371v1#4,Cedille: A large autoregressive French languag...,Scaling up the size and training of autoregres...,"['Martin Müller', 'Florian Laurent']",2202.03371v1,http://arxiv.org/abs/2202.03371v1,models [5].\nMonolingual autoregressive langua...,2202.03371v1#3,2202.03371v1#5


## Create Pincone Vector Db

In [12]:
def get_embeddings(model_name, texts):
    # Define the Hugging Face Embeddings
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    embeddings_list = []
    for text in texts:
        embeddings_list.append(embeddings.embed_query(text))

    return embeddings_list

In [None]:
model_name = EMBEDDING_MODEL_NAME
input_text = ["Hello World!", "Goodbye World!"]

embeddings = get_embeddings(model_name, input_text)

print(len(embeddings[0]))
print(embeddings)

[[-0.02744012512266636, -0.011006052605807781, 0.042352523654699326, -0.042772505432367325, 0.016905877739191055, 0.02067454159259796, 0.019077520817518234, 0.05208282172679901, 0.01198798231780529, 0.010680303908884525, -0.004819100257009268, -0.045714251697063446, 0.018787158653140068, 0.04544045031070709, 0.021978164091706276, 0.012238230556249619, 0.028512557968497276, -0.014150147326290607, -0.0911283940076828, 0.006428318563848734, 0.08000028878450394, 0.023755745962262154, -0.028776006773114204, -0.059667229652404785, -0.006049215327948332, -0.0013380016898736358, -0.025655802339315414, 0.02398783527314663, 0.0003496247809380293, -0.17968158423900604, -0.024698453024029732, -0.028057819232344627, 0.058816201984882355, -0.0075118388049304485, 0.025728149339556694, -0.03086206130683422, -0.009181546047329903, 0.03781155124306679, -0.041793689131736755, 0.008646606467664242, 0.040567584335803986, -0.006288799457252026, 0.0003736849466804415, -0.023562859743833542, 0.009550483897328

In [None]:
class PinconeVectorDb:
    def __init__(self, 
                 cloud: str='aws', 
                 region: str='us-east-1'):
        """
        Initialize the PinconeVectorDb class.
        Args:
            cloud (str): The cloud provider for Pinecone. Default is 'aws'.
            region (str): The AWS region for Pinecone. Default is 'us-east-1'.
        """
        # Check if 'PINECONE_API_KEY' is set; prompt if not
        self.pc_api_key = os.getenv('PINECONE_API_KEY') or getpass('Pinecone API key: ')
        self.pc, self.spec = self.initialize_pinecone_client(cloud=cloud, 
                                                             region=region)
    
    def initialize_pinecone_client(self, cloud: str, region: str):
        """
        Initialize the Pinecone client and return it.
        Args:
            cloud (str): The cloud provider for Pinecone. 
            region (str): The AWS region for Pinecone.
        Returns:
            Pinecone client and serverless specification objects.
        """

        # Initialize the Pinecone client
        pc = Pinecone(api_key=self.pc_api_key)
        # Define the serverless specification for Pinecone (AWS region 'us-east-1')
        spec = ServerlessSpec(
            cloud=cloud, 
            region=region
        )

        return pc, spec
    
    def create_pinecone_index(self,
                                index_name: str=INDEX_NAME, 
                                EMBEDDING_DIMS: int=EMBEDDING_DIMS, 
                                metric: str='cosine') -> Index:
        """
        Creates a Pinecone index with the given name and dimensions.
        Args:
            index_name (str): The name of the index. Default is INDEX_NAME.
            EMBEDDING_DIMS (int): The dimensionality of the embeddings. Default is EMBEDDING_DIMS.
            metric (str): The metric used for similarity. Default is 'cosine'.
        Returns:
            Pinecone index object.
        """

        # Check if the index exists; create it if it doesn't
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                index_name,
                dimension=EMBEDDING_DIMS,  # Embedding dimension
                metric=metric,
                spec=self.spec  # Cloud provider and region specification
            )

            # Wait until the index is fully initialized
            while not self.pc.describe_index(index_name).status['ready']:
                time.sleep(1)

        # Connect to the index
        self.index = self.pc.Index(index_name)

        # Add a short delay before checking the stats
        time.sleep(1)

        # View the index statistics
        print(f"Index Stats:\n{self.index.describe_index_stats()}")
    
    def get_embeddings(self,
                       texts: list[str],
                       model_name: str=EMBEDDING_MODEL_NAME):
        # Define the Hugging Face Embeddings
        embeddings = HuggingFaceEmbeddings(model_name=model_name)

        embeddings_list = []
        for text in texts:
            embeddings_list.append(embeddings.embed_query(text))

        return embeddings_list
    
    def add_embeddings_to_index(self, 
                                data: pd.DataFrame, 
                                batch_size: int=BATCH_SIZE,):

        # data = expanded_df
        # batch_size = 64  # Set batch size

        # Loop through the data in batches, using tqdm for a progress bar
        for i in tqdm(range(0, len(data), batch_size)):
            i_end = min(len(data), i + batch_size)  # Define batch endpoint
            batch = data[i:i_end].to_dict(orient='records')  # Slice data into a batch

            # Extract metadata for each chunk in the batch
            metadata = [{
                'arxiv_id': r['arxiv_id'],
                'title': r['title'],
                'chunk': r['chunk'],
            } for r in batch]
            
            # Generate unique IDs for each chunk
            ids = [r['id'] for r in batch]
            
            # Extract the chunk content
            chunks = [r['chunk'] for r in batch]
            
            # Convert chunks into embeddings
            embeds = self.get_embeddings(chunks)
            
            # Upload embeddings, IDs, and metadata to Pinecone
            self.index.upsert(vectors=zip(ids, embeds, metadata))
            
            # View the index statistics
            print(f"Index Stats:\n{self.index.describe_index_stats()}")

In [43]:
pc = PinconeVectorDb()
pc.create_pinecone_index()
pc.add_embeddings_to_index(data=df_with_chunks)

Index Stats:
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


100%|██████████| 70/70 [15:10<00:00, 13.01s/it]
