In [1]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from tavily import TavilyClient

from dotenv import load_dotenv
import os
load_dotenv()



True

In [2]:
class Researcher:
    """Search the web with Tavily and index the returned pages in a Chroma vector store.

    Typical use: call ``search_query`` first, then ``index_results``; the resulting
    vector store is available on ``self.vdb``.
    """

    def __init__(self):
        """Build the API clients and helpers from environment variables.

        Raises:
            KeyError: if ``TAVILY_API_KEY`` or ``OPENAI_API_KEY`` is not set.
        """
        self.tavily = TavilyClient(api_key=os.environ['TAVILY_API_KEY'])
        # Response of the most recent search_query call (None until one is made).
        self.content = None
        # Alternative embedding kept for reference:
        #   SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        self.embedding_function = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        # Chunks produced by the most recent index_results call (None until one is made).
        self.split_docs = None
        # Chroma store built by index_results (None until indexing succeeds).
        self.vdb = None

    def search_query(self, query):
        """Run a Tavily search for ``query`` and cache the response on ``self.content``.

        The search is limited to two results from the NCBI domain, uses the
        "advanced" search depth and requests raw page content.

        Args:
            query: Search string passed to Tavily.

        Returns:
            The response returned by ``self.tavily.search`` (also stored on
            ``self.content``).
        """
        self.content = self.tavily.search(
            query=query,
            max_results=2,
            include_domains=["https://www.ncbi.nlm.nih.gov/"],
            search_depth="advanced",
            include_raw_content=True,
        )
        return self.content

    def index_results(self):
        """Chunk the raw content of the last search and load it into Chroma.

        Returns:
            ``None`` once ``self.split_docs`` and ``self.vdb`` have been populated;
            otherwise a message string explaining why nothing was indexed.
        """
        if self.content is None:
            return "Please search a query first to get results to index."

        # Keep only results that actually carry text: a missing/None/empty
        # raw_content cannot be split (presumably Tavily leaves it unset for pages
        # it could not extract — verify against the Tavily API docs).
        all_docs = [
            result['raw_content']
            for result in self.content.get('results') or []
            if result.get('raw_content')
        ]
        if not all_docs:
            # Nothing to index; bail out instead of indexing into an empty list.
            self.split_docs = []
            return "No raw content found in the search results to index."

        self.split_docs = self.text_splitter.create_documents(all_docs)
        self.vdb = Chroma.from_documents(self.split_docs, self.embedding_function)




        



In [3]:
# Instantiate the helper; __init__ reads TAVILY_API_KEY and OPENAI_API_KEY from os.environ.
researcher = Researcher()

In [4]:
# Run the search; the returned response is also cached on researcher.content.
researcher.search_query('What is the evidence in favor of air pollution causing cancer?')

{'query': 'What is the evidence in favor of air pollution causing cancer?',
 'follow_up_questions': ['What are the specific types of cancer linked to air pollution?',
  'How does air pollution increase the risk of cancer?',
  'What are the preventive measures to reduce the impact of air pollution on cancer?'],
 'answer': None,
 'images': None,
 'results': [{'content': 'the case for better policies surrounding air pollution and other environmental pollutants linked to cancer mortality,  90% of cancers in humans (1), including ambient air pollution, which has been declared carcinogenic to humans (2).  of understanding of air pollution’s effect on cancer from cancer etiology to include examinations of the effects of air  air pollution, cancer morbidity, and cancer mortality.  Environmental Exposures and CancerThe majority of research has focused on pollutants as a risk factor for new cancer development. However, cancer patients and survivors are exposed to environmental pollutants across 

In [5]:
# Inspect the response cached by the last search_query call (same object as returned above).
researcher.content

{'query': 'What is the evidence in favor of air pollution causing cancer?',
 'follow_up_questions': ['What are the specific types of cancer linked to air pollution?',
  'How does air pollution increase the risk of cancer?',
  'What are the preventive measures to reduce the impact of air pollution on cancer?'],
 'answer': None,
 'images': None,
 'results': [{'content': 'the case for better policies surrounding air pollution and other environmental pollutants linked to cancer mortality,  90% of cancers in humans (1), including ambient air pollution, which has been declared carcinogenic to humans (2).  of understanding of air pollution’s effect on cancer from cancer etiology to include examinations of the effects of air  air pollution, cancer morbidity, and cancer mortality.  Environmental Exposures and CancerThe majority of research has focused on pollutants as a risk factor for new cancer development. However, cancer patients and survivors are exposed to environmental pollutants across 

In [6]:
# Standalone embedding function and splitter, built outside the Researcher class.
embedding_function = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
)



In [7]:
# Collect the raw page text of every search result. Results whose raw_content is
# missing or empty are skipped, since the splitter needs actual strings.
all_docs = [
    result['raw_content']
    for result in researcher.content['results']
    if result.get('raw_content')
]

# NOTE(review): this uses the splitter configured on the Researcher instance
# (chunk_size=1000, chunk_overlap=0); the standalone `text_splitter` created in the
# previous cell (chunk_size=2000, chunk_overlap=200) is not used here — confirm
# which one is intended.
split_docs = researcher.text_splitter.create_documents(all_docs)

split_docs


[Document(page_content='An official website of the United States government\nThe .gov means it’s official.\nFederal government websites often end in .gov or .mil. Before\nsharing sensitive information, make sure you’re on a federal\ngovernment site.\nThe site is secure.\nThe https:// ensures that you are connecting to the\nofficial website and that any information you provide is encrypted\nand transmitted securely.\nAir Pollution across the Cancer Continuum: Extending Our Understanding of the Relationship between Environmental Exposures and Cancer\nJudy Y. Ou\n1Huntsman Cancer Institute, Cancer Control and Population Sciences, University of Utah School of Medicine, Salt Lake City, Utah\nAnne C. Kirchhoff\n1Huntsman Cancer Institute, Cancer Control and Population Sciences, University of Utah School of Medicine, Salt Lake City, Utah\n2Department of Pediatrics, University of Utah School of Medicine, Salt Lake City, Utah\nHeidi A. Hanson'),
 Document(page_content='1Huntsman Cancer Institut

In [8]:
# Re-display the chunks produced in the previous cell.
split_docs

[Document(page_content='An official website of the United States government\nThe .gov means it’s official.\nFederal government websites often end in .gov or .mil. Before\nsharing sensitive information, make sure you’re on a federal\ngovernment site.\nThe site is secure.\nThe https:// ensures that you are connecting to the\nofficial website and that any information you provide is encrypted\nand transmitted securely.\nAir Pollution across the Cancer Continuum: Extending Our Understanding of the Relationship between Environmental Exposures and Cancer\nJudy Y. Ou\n1Huntsman Cancer Institute, Cancer Control and Population Sciences, University of Utah School of Medicine, Salt Lake City, Utah\nAnne C. Kirchhoff\n1Huntsman Cancer Institute, Cancer Control and Population Sciences, University of Utah School of Medicine, Salt Lake City, Utah\n2Department of Pediatrics, University of Utah School of Medicine, Salt Lake City, Utah\nHeidi A. Hanson'),
 Document(page_content='1Huntsman Cancer Institut

In [9]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import chromadb

# Dedicated Chroma client for this experiment (per the chromadb docs, EphemeralClient
# is an in-memory, non-persistent client — confirm for the installed version).
new_client = chromadb.EphemeralClient()


In [10]:
# Embed the chunks and store them in the "openai_test" collection of new_client.
vdb = Chroma.from_documents(
    split_docs,
    embedding_function,
    client=new_client,
    collection_name="openai_test",
)

In [11]:
# Sanity check: retrieve the indexed chunks most similar to a natural-language question.
vdb.similarity_search("What types of cancer are most affected by air pollution?")

[Document(page_content='extrinsic factors that likely contributed to their initial cancer. We propose broadening the scope of understanding of air pollution’s effect on cancer from cancer etiology to include examinations of the effects of air pollution on the health of cancer patients during treatment and through survivorship. We suggest three key research priorities that are important for understanding the relationship between air pollution, cancer morbidity, and cancer mortality.'),
 Document(page_content='as bladder cancer or breast cancer is more limited. Outdoor air pollution may also be associated with poorer cancer survival though further research is needed. This report presents an overview of outdoor air pollutants, sources and global levels, as well as a description of epidemiological evidence linking outdoor air pollution with cancer incidence and mortality. Biological mechanisms of air pollution-derived carcinogenesis are also described. This report concludes by summarizing 