In [2]:
from dotenv import load_dotenv
import os
import gradio as gr
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
import time
import pandas as pd

In [3]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

In [4]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# The key is stored under AZURE_OPENAI_KEY but is re-exported below as
# AZURE_OPENAI_API_KEY (the name langchain_openai's Azure classes read by
# default -- confirm against the installed version).
# Fail early with a readable message: assigning a missing (None) value into
# os.environ would otherwise raise "TypeError: str expected, not NoneType".
_missing_env = [
    name
    for name in ("AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT")
    if os.getenv(name) is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

os.environ["AZURE_OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_KEY"]
# AZURE_OPENAI_ENDPOINT is already in the environment under the right name,
# so it only needs the presence check above, not a copy.

# Deployment names; these stay None when the variables are not set.
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME")
AZURE_EMBEDDING = os.getenv("AZURE_EMBEDDING")

# Pinecone client built from the API key in the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [5]:
# Embedding client for the Azure OpenAI deployment named by AZURE_EMBEDDING,
# pinned to a fixed preview API version.
_embedding_settings = {
    "azure_deployment": AZURE_EMBEDDING,
    "openai_api_version": "2024-03-01-preview",
}
embeddings = AzureOpenAIEmbeddings(**_embedding_settings)

In [6]:
import os
import csv

folder_path = 'people_pdfs'
csv_filename = 'search_people.csv'

# Collect every entry in the folder except the '.DS_Store' file.
# os.listdir() returns names in arbitrary order, so they are sorted to make
# the generated CSV identical from run to run.
file_names = sorted(
    entry for entry in os.listdir(folder_path) if entry != '.DS_Store'
)

# Write one filename per row. newline='' leaves line endings to the csv
# module; the explicit UTF-8 encoding keeps non-ASCII names from depending on
# the platform's default encoding.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows([name] for name in file_names)

In [22]:
# Vector store bound to the 'grants' index; texts are embedded with `embeddings`.
vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name='grants',
)

# Grant records to index, read from the working directory.
data = pd.read_csv('grants.csv')

In [23]:
# Index each grant summary into the 'grants' vector store, one row per request.
for index, row in data.iterrows():
    time.sleep(0.2)  # short pause between requests (presumably rate limiting)

    # Rows without a summary are still indexed, with empty text.
    summary = row['summary']
    text = "" if pd.isna(summary) else summary

    uid = row['dpi_uid']
    metadata = {
        "source": row['source'],
        "date": row['post_date'],
        "uid": uid,
    }

    # Use the grant's own uid as the vector id so that re-running this cell
    # overwrites existing vectors instead of inserting duplicates under fresh
    # random ids. Assumes dpi_uid is unique per grant -- TODO confirm.
    # Rows with no uid fall back to an auto-generated id, as before.
    vector_ids = None if pd.isna(uid) else [str(uid)]
    vectorstore.add_texts([text], metadatas=[metadata], ids=vector_ids)

In [31]:
# Metadata filter: NIH records whose numeric `date` lies between 202305 and
# 202401, both bounds inclusive (values look like YYYYMM -- confirm).
nih_date_window = {
    "source": "NIH",
    "$and": [{"date": {"$gte": 202305}}, {"date": {"$lte": 202401}}],
}

# The query text keeps the surrounding newline/indent of the original
# triple-quoted literal so the embedded string is unchanged.
docs = vectorstore.similarity_search_with_score(
    "\n    natural language processing\n    ",
    k=20,
    filter=nih_date_window,
)

In [34]:
# Reduce every (document, score) hit to its grant uid and similarity score.
results = [(hit.metadata['uid'], similarity) for hit, similarity in docs]

# One line per hit, in the order the search returned them.
for uid, similarity in results:
    print("UID:", uid, "Score:", similarity)

UID: aBe7AZQlc2GBR/LKYyNublN8BrQxsbUmgfRJIWKw1jM= Score: 0.317472905
UID: V91W67xUvXJzTwgMABJvrUTHrbDDkWtjQ1X1K7IySc0= Score: 0.250626832
UID: L8B1lJNgfTg7k6DMoUb3O3oU/KJGCSs6QHF4OuHV3rg= Score: 0.239333555
UID: qEkF/X/VQAkalKVjnOay0pApk+CmwqW8aeF6Yw+1p0o= Score: 0.232206762
UID: Ck1BgpfvXM+6/3ccjuxp185kr9JaYfgqa3SJqXcrXkM= Score: 0.229872
UID: LjYRPr95k7rPswZrctcnQJV1ChjBYkuIRV6ZOWYnijQ= Score: 0.223694399
UID: etKb/2evKNYolHjaKwjy3f7f8gSI+BPnKmYfLZ/QYOY= Score: 0.222091526
UID: 1H3ABy2+7oJ6BE1ledtFF3owu31c//Na1MQkQ7m9odI= Score: 0.217852876
UID: aJXi7gLe7pLIhIf6WhPxFePRn1EeHLtJlbvbRtpkEPw= Score: 0.217852876
UID: cOwz7GMxN4f8euT+gJrZ6sgGjIUlR3U5KPOh06KtcYo= Score: 0.217852876
UID: uz9Nb+R5vrzXLOQcGMTlP77++u+LGUsZuHOfFMxPPQk= Score: 0.217852876
UID: 1y1Iol82aJuPmw1lQCx74a28wzPNJKApqtZcDv395XM= Score: 0.217852876
UID: oAr0WXLU0JD771YW64HVV2wlnXoDhMEYRavm+0cajcA= Score: 0.217852876
UID: KL0lejo6NgrGW6MRvDhQTqqePYm5n/djjzKT+MibWoc= Score: 0.217852876
UID: EPBE3+vq00QlnYI1zZafuPxlMhD2nlIK

## Researchers

In [4]:
class CustomFileLoader(UnstructuredPDFLoader):
    """PDF loader whose metadata holds only the loaded file's base name."""

    def _get_metadata(self) -> dict:
        """Return ``{"source": <file name>}`` with the directory part removed."""
        _, file_name = os.path.split(self.file_path)
        return {"source": file_name}

In [23]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
folder_path = '/Users/jaylee/Documents/work/dpi/pdf3'

# The splitter is configured identically for every file, so build it once
# instead of once per PDF.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()

# os.listdir() order is arbitrary; sorting makes the processing order repeatable.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # Skip the '.DS_Store' file and anything that is not a regular file
    # (e.g. a sub-folder); checking before the pause avoids sleeping for
    # entries that are never processed.
    if filename == '.DS_Store' or not os.path.isfile(file_path):
        continue

    time.sleep(0.1)  # short pause between files

    # Load the PDF and split it into chunks tagged with the file's base name.
    loader = CustomFileLoader(file_path)
    docs = loader.load_and_split(text_splitter=splitter)

    # Embed the chunks and write them to the "dpi" index.
    # NOTE: this rebinds `vectorstore`, which the grants cells bind to the
    # 'grants' index; re-run that cell before querying grants again.
    vectorstore = PineconeVectorStore.from_documents(docs, embeddings, index_name="dpi")

In [26]:
# Vector store bound to the 'dpi' index, embedding queries with `embeddings`.
vector_store = PineconeVectorStore(
    embedding=embeddings,
    index_name='dpi',
)

# Retriever view over the same store, created with default settings.
retriever = vector_store.as_retriever()

In [37]:
# Top-20 chunks for the query. The text keeps the newline/indent of the
# original triple-quoted literal so the embedded string is unchanged.
docs = vector_store.similarity_search_with_score(
    "\n    natural language processing\n    ",
    k=20,
)

# Print a 1-based ranking; the label is the source file name up to '.pdf'.
for rank, (hit, similarity) in enumerate(docs, start=1):
    label = hit.metadata['source'].partition('.pdf')[0]
    print(f"{rank}. {label}: {similarity}")

1. Chilin Shih: 0.450808406
2. Heng Ji: 0.430762917
3. Hao Peng: 0.42467308
4. Chengxiang Zhai: 0.412713766
5. Jonathan E Dunn: 0.409267366
6. Nickvash Kani: 0.408924401
7. Suma Pallathadka Bhat: 0.404810667
8. Dilek Hakkani Tur: 0.404101938
9. Heng Ji: 0.398736149
10. Heng Ji: 0.396744192
11. Julia Constanze Hockenmaier: 0.391444176
12. Suma Pallathadka Bhat: 0.390879512
13. Heng Ji: 0.388903677
14. Chengxiang Zhai: 0.388726532
15. Jon Anthony Willits: 0.387324393
16. Huan Zhang: 0.385635346
17. Jiawei Han: 0.385319144
18. Jana Diesner: 0.383365512
19. Heng Ji: 0.382180631
20. Catherine Lesley Blake: 0.379984319


In [40]:
# Top-20 chunks for the query. The text keeps the newline/indent of the
# original triple-quoted literal so the embedded string is unchanged.
docs = vector_store.similarity_search_with_score(
    "\n    natural language processing\n    ",
    k=20,
)

# Collapse the chunk-level hits to one entry per source file: only the first
# hit seen for each source is kept, along with that hit's score.
seen_sources = set()
unique_docs = []

for hit, similarity in docs:
    name = hit.metadata['source'].partition('.pdf')[0]
    if name in seen_sources:
        continue  # a chunk from this source was already recorded
    seen_sources.add(name)
    unique_docs.append((name, similarity))

# 1-based listing of the distinct sources with their scores.
for rank, (name, similarity) in enumerate(unique_docs, start=1):
    print(f"{rank}. {name}: {similarity}")

1. Chilin Shih: 0.450808406
2. Heng Ji: 0.430762917
3. Hao Peng: 0.42467308
4. Chengxiang Zhai: 0.412713766
5. Jonathan E Dunn: 0.409267366
6. Nickvash Kani: 0.408924401
7. Suma Pallathadka Bhat: 0.404810667
8. Dilek Hakkani Tur: 0.404101938
9. Julia Constanze Hockenmaier: 0.391444176
10. Jon Anthony Willits: 0.387324393
11. Huan Zhang: 0.385635346
12. Jiawei Han: 0.385319144
13. Jana Diesner: 0.383365512
14. Catherine Lesley Blake: 0.379984319
