In [24]:
import fitz
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import Settings
from llama_index.vector_stores.pinecone import PineconeVectorStore

from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
from tqdm import tqdm
import os
import re


In [25]:
def find_doi(doc_):
    """Return the first DOI-looking string found in ``doc_``, or ``None``.

    ``doc_`` is iterated page by page and every page must provide ``get_text()``.
    The page texts are concatenated without a separator before searching, and the
    full regex match is returned.
    """
    full_text = "".join(page.get_text() for page in doc_)

    # "10." + registrant code (4+ digits, optional dotted sub-parts) + "/" + suffix;
    # the suffix is a run of non-whitespace characters other than " & ' < >.
    doi_match = re.search(
        r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b', full_text
    )

    return doi_match.group() if doi_match else None


In [26]:
# --- Input data ---
# Directory the PDF file names are joined onto when the documents are opened.
DATA_PATH = "../../data"

# --- Vector index ---
# Name of the Pinecone index that is created (optionally) and then connected to.
INDEX_NAME = "eds-papers"
# When truthy, the setup cell calls pc.create_index(...) before connecting.
CREATE_NEW_INDEX = True

# --- Chunking and models ---
# chunk_size handed to SentenceSplitter.
CHUNK_SIZE = 1024
# Used as both model and deployment_name for the Azure OpenAI chat model.
LLM_TO_USE = "gpt-35-turbo"
# Used as both model and deployment_name for the Azure OpenAI embedding model.
EMBEDDING_MODEL = "text-embedding-ada-002"


In [27]:

# PDF file names to ingest; each one is joined onto DATA_PATH when opened.
files = [
    "eds_review_paper_Paepe_2012.pdf",
    "eds_perspective_paper_2001.pdf",
]


In [28]:
# Credentials are read from dotenv files in the user's home directory, so no secret
# is written into the notebook itself.
home_dir = os.path.expanduser('~')

# Azure OpenAI settings: loaded first and read immediately, before the second file
# is loaded.
load_dotenv(os.path.join(home_dir, '.gpt_config.env'))
api_key = os.getenv('API_KEY')
azure_endpoint = os.getenv('RESOURCE_ENDPOINT')
api_version = os.getenv('API_VERSION')

# Pinecone settings. Each value is None when the variable is not set.
load_dotenv(os.path.join(home_dir, '.pinecone_config.env'))
pinecone_api_key = os.getenv('PINECONE_API_KEY')



In [29]:
# Azure OpenAI chat model; LLM_TO_USE serves as both the model id and the deployment name.
llm = AzureOpenAI(
    model=LLM_TO_USE,
    deployment_name=LLM_TO_USE,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

# Azure OpenAI embedding model, configured with the same endpoint and credentials.
embed_model = AzureOpenAIEmbedding(
    model=EMBEDDING_MODEL,
    deployment_name=EMBEDDING_MODEL,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

# Register both models as the LlamaIndex-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model

pc = Pinecone(api_key=pinecone_api_key)
if CREATE_NEW_INDEX:
    # The index dimension has to equal the length of the vectors stored in it.
    # Measure it with a probe embedding rather than reading `node_embedding`, which
    # is only assigned inside the ingestion loop further down and is therefore
    # undefined when this cell runs on a fresh kernel.
    embedding_dimension = len(embed_model.get_text_embedding("dimension probe"))
    # NOTE(review): create_index is called unconditionally here; presumably it
    # fails if INDEX_NAME already exists, so set CREATE_NEW_INDEX = False on
    # re-runs — confirm against the Pinecone client in use.
    pc.create_index(
        name=INDEX_NAME,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
pinecone_index = pc.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)


In [32]:

for file_index, file in enumerate(files):
    print(f'**** Processing : File {file_index+1}/{len(files)}****')
    print('Reading the document ...')
    doc = fitz.open(os.path.join(DATA_PATH, file))
    
    print('Finding DOI from the document ...')
    doc_doi = find_doi(doc)
    
    print('Splitting the document into chunks ...')
    text_parser = SentenceSplitter(chunk_size=CHUNK_SIZE)
    text_chunks = []
    doc_idxs = []
    for doc_idx, page in enumerate(doc):
        page_text = page.get_text()
        cur_text_chunks = text_parser.split_text(page_text)
        text_chunks.extend(cur_text_chunks)
        doc_idxs.extend([doc_idx]*len(cur_text_chunks))
    
    print('Converting each chunk into nodes ...')
    nodes = []
    for idx, text_chunk in enumerate(text_chunks):
        node = TextNode(
            text=text_chunk,
        )
        nodes.append(node)
    
    print('Extracting metadata for each node ...')
    extractors = [
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        SummaryExtractor(summaries=["prev", "self"]),
        KeywordExtractor(keywords=10)
    ]
    
    pipeline = IngestionPipeline(
        transformations=extractors,
    )
    nodes = await pipeline.arun(nodes=nodes, in_place=False)
    
    print('Creating embeddings for each node and adding DOI of the paper as another metadata ...')
    for node in tqdm(nodes):
        node_embedding = embed_model.get_text_embedding(
            node.get_content(metadata_mode="all")
        )
        node.embedding = node_embedding
        node.metadata['doi'] = doc_doi
        
    
    print('Populating vectorDB ...')    
    vector_store.add(nodes)
    print(f'**** File {file_index+1}/{len(files)} is completed ****')


**** Processing : File 1/2****
Reading the document ...
Finding DOI from the document ...
Splitting the document into chunks ...
Converting each chunk into nodes ...
Extracting metadata for each node ...


100%|█████████████████████████████████████████████| 5/5 [00:01<00:00,  3.76it/s]
100%|███████████████████████████████████████████| 24/24 [00:10<00:00,  2.21it/s]
100%|███████████████████████████████████████████| 24/24 [00:10<00:00,  2.24it/s]
100%|███████████████████████████████████████████| 24/24 [00:07<00:00,  3.13it/s]


Creating embeddings for each node and adding DOI of the paper as another metadata ...


100%|███████████████████████████████████████████| 24/24 [00:09<00:00,  2.42it/s]


Populating vectorDB ...


Upserted vectors: 100%|█████████████████████████| 24/24 [00:02<00:00, 10.05it/s]


**** File 1/2 is completed ****
**** Processing : File 2/2****
Reading the document ...
Finding DOI from the document ...
Splitting the document into chunks ...
Converting each chunk into nodes ...
Extracting metadata for each node ...


100%|█████████████████████████████████████████████| 5/5 [00:01<00:00,  3.34it/s]
100%|███████████████████████████████████████████| 15/15 [00:06<00:00,  2.15it/s]
100%|███████████████████████████████████████████| 15/15 [00:09<00:00,  1.64it/s]
100%|███████████████████████████████████████████| 15/15 [00:04<00:00,  3.73it/s]


Creating embeddings for each node and adding DOI of the paper as another metadata ...


100%|███████████████████████████████████████████| 15/15 [00:06<00:00,  2.49it/s]


Populating vectorDB ...


Upserted vectors: 100%|█████████████████████████| 15/15 [00:00<00:00, 38.46it/s]

**** File 2/2 is completed ****



