In [None]:
# Install the packages this notebook needs into the kernel's environment.
# NOTE(review): no versions are pinned and only the first two installs pass
# -qU (quiet + upgrade); consider pinning versions for reproducible runs.
%pip install -qU langchain
%pip install -qU huggingface_hub
%pip install nest-asyncio
%pip install unstructured
%pip install chromadb

In [None]:
import os

# Read the API keys from local files and export them as environment variables
# so the OpenAI / Hugging Face clients used further down can pick them up.
# .strip() removes the trailing newline most editors append to a file; left in
# place it becomes part of the key and authentication fails.
with open('assets/openai_api_key', 'r') as f:
    openai_api_key = f.read().strip()
with open('assets/huggingface_api_key', 'r') as f:
    huggingface_api_key = f.read().strip()
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['HUGGINGFACEHUB_API_KEY'] = huggingface_api_key
# LangChain's Hugging Face Hub integration reads HUGGINGFACEHUB_API_TOKEN; the
# original variable name above is kept for anything that already relies on it.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_api_key
from pathlib import Path
import nest_asyncio
nest_asyncio.apply()
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader
from langchain.output_parsers import RegexParser
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
import json
import pandas as pd
import re
from langchain.chains import RetrievalQA
import ipywidgets

# OpenAI QA

We use LangChain to build a Chroma vector database from our papers and then query it with our questions. The answers are returned in the requested structure, which makes it straightforward to extract the information.

Every paper is evaluated separately.

TODO: improve documentation

https://python.langchain.com/docs/modules/chains/additional/question_answering.html

https://python.langchain.com/docs/modules/chains/additional/openai_functions_retrieval_qa


Setting up the parameters:

In [None]:
# Locations and model name used by the notebook (paths are relative to the
# notebook's working directory).
# NOTE(review): in the cells visible here the run cell passes its own literal
# path, papers_to_vectors hard-codes '../data/chroma/', and get_data calls
# OpenAI() without a model name, so none of these four values is actually read.
paper_path = '../data/docs/'
pickle_path = '../data/pickle/'
chroma_path = '../data/chroma/'
MODEL = 'gpt-3.5-turbo'

Creating the vector database:

Defining the prompt template:

All together?

In [None]:
# Template for the extraction prompt. It contains two placeholders, {context}
# and {question}; a later cell declares them as the PromptTemplate's
# input_variables.
# NOTE(review): the text contains the typo "lenghts" and states the
# "same length" requirement twice. It is left untouched here because the
# string is sent to the model verbatim.
prompt_template = """We are trying to extract information from a journal article that focuses on bioassays performed on graphene oxide to investigate its effects on and interactions with biological systems and organisms. Our objective is to extract details about the assays conducted and how different measurement groups and conditions contribute to various effects.

Please use the provided journal article as context to fill out the keys in the format defined below. Ensure that the answer to each key is a comma-separated list. Each element in a given key's list should correspond with the respective elements in other keys, maintaining the same length for all lists. Leaving blanks or having lists of different lengths is not acceptable. Repeated elements are allowed, as the paper describes different conditions for each assay, different endpoints for each animal model, and so on. Be as exhaustive as you can, and do not forget to give an answer for every list element (you will respond 'not specified' if you can't find an answer).


Please return the answer to each key as a comma-separated list in such a way that e.g. element 1 of a given key (e.g., diameter measurement units) corresponds with element 1 from a different key (e.g. diameter measurement values) and element 1 from another key (Interaction). This means that it is crucial that all lists must be of the same length -an answer with lists of different lenghts is not acceptable. Each position in the lists represents an individual set of assay conditions, measurements and outcomes, which means that there can be several instances of the same outcome, materials, measurements and conditions.

Format:
---------
question i: [list i of length N]
question j: [list j of length N]
---------


Context:

{context}
---------
Questions: 
{question}

"""



In [None]:
# Keys the model is asked to fill in. Order matters: it fixes both the order
# of the capture groups in `regex` and the order of the parser's output keys.
questions = [
    "Observed interaction or effect",
    # The comma after this entry was missing, which silently fused it with
    # "Bioassays" into one string and left the list one key short.
    "Sample preparation",
    "Bioassays",
    "Doses",
    "Doses units",
    "Coatings",
    "Organisms/Biological systems",
    "Shapes",
    "Diameters measurement units",
    "Diameters measurement values",
    "Diameters measurement types",
    "Time point values",
    "Time point units",
    "Passages",
]

# Question text handed to the chain: one "key: [description]" line per entry
# of `questions`, in the same order.
query = """
    Observed interaction or effect: [List of observed outcomes]
    Sample preparation technique: [List of sample preparation techniques]
    Bioassays: [List of specific bioassays performed]
    Doses: [List of graphene oxide doses]
    Doses units: [List of units for graphene oxide doses]
    Coatings: [List of nanomaterial coating types reported]
    Organisms/Biological systems: [List of organisms or biological systems used in  the assay]
    Shapes: [List of graphene oxide shapes that produced the outcomes]
    Diameters measurement units: [List of units for diameter measurements]
    Diameters measurement values: [List of numeric values for diameter  measurements]
    Diameters measurement types: [List of types of diameter measurements]
    Time point values: [List of time points at which effects were observed]
    Time point units: [List of units for time points]
    Passages: [List of literal excerpts from the text asserting the outcomes]

"""

# One capture group per key, each taking the remainder of that key's own line
# (so a captured value starts with whatever follows the key, e.g. ": a, b").
# The previous construction ended in "Passages\n(.*)", which put the last
# group on the line *after* "Passages" and could not match "Passages: ...".
# `re` is imported in the notebook's import cell; re.escape keeps any regex
# metacharacters in a key literal.
regex = "\n".join(re.escape(question) + "(.*)" for question in questions)


# Parser configured with the key regex and with `questions` as its output keys.
output_parser = RegexParser(output_keys=questions, regex=regex)

# Prompt built from `prompt_template`; its two placeholders are declared as
# input variables and the parser above is attached to it.
doc_prompt = PromptTemplate(
    input_variables=['context', 'question'],
    output_parser=output_parser,
    template=prompt_template,
)

In [None]:
def papers_to_vectors(paper_path):
    """Embed the documents under ``paper_path`` into a persisted Chroma store.

    The documents are loaded with ``DirectoryLoader``, split into 100-character
    chunks without overlap, embedded with ``OpenAIEmbeddings`` and written to
    ``../data/chroma/<last component of paper_path>``.

    Args:
        paper_path: Directory containing the documents to index.

    Returns:
        The ``Chroma`` vector store holding the embedded chunks.
    """
    #TODO be able to restore persistent chroma
    loader = DirectoryLoader(paper_path)
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Chroma.from_documents derives the metadata list from each document's own
    # .metadata, so a separate `metadatas=` argument conflicts with it. Tag the
    # chunks directly instead: "source" becomes the chunk index (as before) and
    # the loader's original source, when present, is kept under "file".
    for i, chunk in enumerate(texts):
        original_source = chunk.metadata.get("source")
        chunk.metadata = {**chunk.metadata, "source": str(i)}
        if original_source is not None:
            chunk.metadata["file"] = str(original_source)

    embeddings = OpenAIEmbeddings()
    # Interpolating the whole relative path produced e.g.
    # '../data/chroma/../data/docs/test/', which resolves outside the chroma
    # folder. Use only the final directory name as the sub-folder.
    collection_dir = os.path.basename(os.path.normpath(paper_path))
    persist_directory = os.path.join('../data/chroma', collection_dir)
    docsearch = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
    docsearch.persist()
    return docsearch
#documents = os.listdir(paper_path)

In [None]:
def get_data(path, runs, query, doc_prompt):
    """Run the extraction query ``runs`` times against the papers in ``path``.

    Args:
        path: Directory with the papers; passed to ``papers_to_vectors``.
        runs: Number of times the query is repeated.
        query: Question text sent to the chain.
        doc_prompt: Prompt handed to the chain as its ``prompt``.

    Returns:
        A dict with one entry per run, keyed ``'0'`` .. ``str(runs - 1)``, plus
        a ``'query'`` entry holding the query text. Each run entry maps the
        row names found in the answer to the list of comma-separated values on
        that row, and ``'source_documents'`` to the documents returned by the
        chain.
    """
    docsearch = papers_to_vectors(path)
    # The chain does not depend on the run index, so build it once.
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="map_rerank",
        retriever=docsearch.as_retriever(),
        chain_type_kwargs={'prompt': doc_prompt},
        return_source_documents=True,
    )

    res = {str(i): {} for i in range(runs)}
    for i in range(runs):
        print(f'Run #{i} ...')
        result = qa({'query': query})
        run = res[str(i)]
        for row in result['result'].split("\n"):
            # Keep only "name: values" rows and skip any row mentioning 'Answer'.
            if 'Answer' in row or ':' not in row:
                continue
            row_name, values = row.split(":", 1)
            # Strip surrounding whitespace so keys can be looked up by their
            # exact name and values do not carry the space after each comma.
            run[row_name.strip()] = [value.strip() for value in values.split(",")]
        run['source_documents'] = result['source_documents']
    # Use the argument rather than result['query']: `result` is unbound when
    # runs == 0.
    res['query'] = query
    return res





In [None]:
# Run the extraction on the test folder. `query` and `doc_prompt` come from the
# prompt-definition cell above; every run sends the query through the
# OpenAI-backed chain built in get_data.
path = '../data/docs/test/'
runs = 10
test_am3 = get_data(path, runs, query, doc_prompt)


In [None]:
# Display the per-run results returned by get_data.
test_am3

In [None]:

def occurrence_interactions(results, runs):
    """Summarise how often each observed interaction appears across runs.

    Every dict-valued entry of ``results`` is treated as one run; other
    entries (such as the ``'query'`` string) are ignored. Within a run, the
    list under ``'Observed interaction or effect'`` is read element by element
    and the value at the same position of every other list-valued key is
    collected for that interaction.

    Args:
        results: Mapping of run id to the parsed rows of that run.
        runs: Total number of runs, used as the denominator of ``freq``.

    Returns:
        A dict keyed by interaction text. Each value holds ``'vals'`` (a dict
        mapping every other key to the list of values seen alongside that
        interaction) and ``'freq'`` (the fraction of runs in which the
        interaction appeared; ``0.0`` when ``runs`` is 0).
    """
    interaction_key = 'Observed interaction or effect'
    seen_interactions = {}
    for values in results.values():
        if not isinstance(values, dict):
            continue
        interactions = values.get(interaction_key)
        if interactions is None:
            # The model did not return this row for the run; nothing to count.
            continue
        if isinstance(interactions, str):
            interactions = [interactions]

        counted_in_run = set()  # an interaction counts at most once per run
        for position, raw_interaction in enumerate(interactions):
            interaction = raw_interaction.strip()
            if not interaction:
                continue
            entry = seen_interactions.setdefault(interaction, {'vals': {}, 'count': 0})
            if interaction not in counted_in_run:
                entry['count'] += 1
                counted_in_run.add(interaction)

            # Lists are positional: element k of each key belongs to element k
            # of the interaction list.
            for other_key, other_values in values.items():
                if other_key in (interaction_key, 'source_documents'):
                    continue
                if not isinstance(other_values, (list, tuple)):
                    continue
                if position >= len(other_values):
                    continue
                value = other_values[position]
                if isinstance(value, str):
                    entry['vals'].setdefault(other_key, []).append(value.strip())

    occurences = {
        interaction: {
            'vals': entry['vals'],
            'freq': entry['count'] / runs if runs else 0.0,
        }
        for interaction, entry in seen_interactions.items()
    }
    return occurences
# Aggregate the interactions over all runs of the test extraction and display
# the summary.
seen = occurrence_interactions(test_am3, runs)
seen

In [None]:
# Display the per-run results again (same object as shown after the run cell).
test_am3

In [None]:
# Tabulate one run: one row per extracted key, one column per list position.
# The rows are read from `test_am3` because `result` only exists inside
# get_data and is not defined at notebook scope, so the previous version
# failed on a fresh kernel.
run_id = '0'
parsed_rows = {
    name: values
    for name, values in test_am3[run_id].items()
    if name != 'source_documents'
}
row_names = list(parsed_rows)
new_table = [parsed_rows[name] for name in row_names]
# errors='ignore': an 'Answer' row is not guaranteed to be present.
df = pd.DataFrame(new_table, index=row_names).drop('Answer', errors='ignore')
df


In [None]:
# NOTE(review): in the cells shown here `result` is only assigned inside
# get_data, so unless it is defined elsewhere this raises NameError on a fresh
# kernel. The raw answer text is not kept in get_data's return value.
result['result']