In [1]:
!pip install -qU langchain openai tiktoken pypdf2 scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Read the pdf

In [54]:
from PyPDF2 import PdfReader

reader = PdfReader("liver-hp-patient.pdf")
pages = reader.pages

# Concatenate the text of every page, in page order, with no separator
# inserted between pages.
extracted_text = "".join(page.extract_text() for page in pages)

Count the number of tokens in a text of any length

In [55]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    ``disallowed_special=()`` is forwarded to ``tokenizer.encode`` so that text
    which happens to look like a special token is encoded instead of rejected
    (behaviour per the tiktoken documentation).
    """
    return len(tokenizer.encode(text, disallowed_special=()))

tiktoken_len(extracted_text)

28120

Split the text into chunks of at most 400 tokens

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Lengths are measured with tiktoken_len, so chunk_size and chunk_overlap are
# expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    # Candidate split points, from coarsest to finest: blank line, newline,
    # space, and finally between any two characters.
    separators=["\n\n", "\n", " ", ""]
)

In [27]:
chunks = text_splitter.split_text(extracted_text)
len(chunks)

81

### Prepare embedding

Convert chunks into vectors

In [19]:
from getpass import getpass

OPENAI_API_KEY = getpass("OpenAI API Key: ")

In [25]:
import openai
import pandas as pd

# calculate embeddings
EMBEDDING_MODEL = "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# Hand the key collected with getpass to the OpenAI client. Without this the
# prompt above has no effect and requests only succeed if the key happens to be
# configured some other way. An empty entry leaves any existing configuration
# untouched.
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY

embeddings = []
for batch_start in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[batch_start:batch_start + BATCH_SIZE]
    # Report the real last index; the final batch is usually shorter than BATCH_SIZE.
    batch_last = batch_start + len(batch) - 1
    print(f"Batch {batch_start} to {batch_last}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    data = response["data"]
    # One embedding per input, in input order; otherwise the text/embedding
    # pairing built below would be silently wrong.
    assert len(data) == len(batch), "embedding count does not match batch size"
    for i, be in enumerate(data):
        assert i == be["index"]  # double check embeddings are in same order as input
    embeddings.extend(e["embedding"] for e in data)

# One row per chunk: the chunk text next to its embedding vector.
df = pd.DataFrame({"text": chunks, "embedding": embeddings})

Batch 0 to 999


Save the embeddings as a CSV file

In [26]:
# File that holds the chunk texts and their embeddings between sessions.
SAVE_PATH = "cancer-knowledge.csv"
# index=False: write only the "text" and "embedding" columns, not the row index.
df.to_csv(SAVE_PATH, index=False)

# Use the embeddings as a knowledge base

In [28]:
import ast  # for converting embeddings saved as strings back to arrays

# Reload the saved knowledge base. Each embedding is stored in the CSV as the
# text of a Python list, so it is parsed back into a list while reading.
df = pd.read_csv(SAVE_PATH, converters={"embedding": ast.literal_eval})

In [29]:
df

Unnamed: 0,text,embedding
0,Liver Cancer\nHepatobiliary Cancers\nAvailable...,"[0.015058930031955242, -0.0007402518531307578,..."
1,affected by a cancer diagnosis by funding and ...,"[-0.0017696862341836095, -0.002883933018893003..."
2,and treatment\nFree online at \nNCCN.org/guid...,"[0.007323516067117453, -0.0016455147415399551,..."
3,74 Index6\nNCCN Guidelines for Patients® \nLiv...,"[0.015031230635941029, 0.008718114346265793, 0..."
4,Gallbladder\nCommon bile ductThe liver and \nn...,"[-0.0022459784522652626, 0.02593529038131237, ..."
...,...,...
76,Cancer Center of Northwestern \nUniversity\nCh...,"[-0.003479365259408951, -0.00542986998334527, ..."
77,health.ucdavis.edu/cancer\nUC San Diego Moores...,"[0.009501981548964977, 0.0006425384781323373, ..."
78,cancer grade 11\ncancer stage 10–13\nchemoth...,"[-0.006136395037174225, 0.00018382516282144934..."
79,"radiation therapy (RT) 34\nresectable 29, 41...","[-0.007198337931185961, -0.013739843852818012,..."


### Prepare Search functions

In [32]:
from scipy import spatial  # for calculating vector similarities for search

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Rank the texts in ``df`` by how related their embeddings are to ``query``.

    Args:
        query: Text to search for; it is embedded with ``EMBEDDING_MODEL``.
        df: Frame with a "text" column and an "embedding" column.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            1 minus the cosine distance.
        top_n: Maximum number of results to return.

    Returns:
        ``(strings, relatednesses)``: two lists of equal length, ordered from
        most to least related and truncated to ``top_n`` items. Both lists are
        empty when ``df`` has no rows.
    """
    # Nothing to rank: return before spending an API call, and avoid the
    # unpacking error that zip(*[]) would otherwise raise.
    if len(df) == 0:
        return [], []

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Score every row, then keep the best top_n. sorted() is stable, so rows
    # with equal scores keep their original relative order.
    ranked = sorted(
        (
            (text, relatedness_fn(query_embedding, embedding))
            for text, embedding in zip(df["text"], df["embedding"])
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    strings = [text for text, _ in ranked]
    relatednesses = [score for _, score in ranked]
    return strings, relatednesses

In [34]:
# Sanity-check the search: show the 5 chunks most related to the query, best match first.
strings, relatednesses = strings_ranked_by_relatedness("liver transplant", df, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    # The "=" specifier prints the variable name too, e.g. "relatedness=0.864".
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.864


'A partial hepatectomy is different than a liver \ntransplant. \nA partial hepatectomy is not for everyone. Liver \ndamage, such as cirrhosis or fibrosis, can make \nsurgery more difficult or not possible. The size \nand location of the tumor, as well as your liver \nfunction (Child-Pugh score), will play a role in if \ntumor resection is the best option for you. You \nmust also have a working liver and be healthy \nenough for surgery. \nGet to know your \ncare team and let \nthem get to  \nknow you.30\nNCCN Guidelines for Patients® \nLiver Cancer, 20213 Treatment overview  Liver transplant\nLiver transplant\nIn a liver transplant, the entire diseased liver \nis removed and replaced with a healthy donor \nliver. The new liver may be donated from a \nperson who recently died or a section of liver \nmay be donated from a living person. A liver \ntransplant is based on certain size limits and \ntumor locations. \nThe liver is divided into 8 sections or segments \nbased on portal vein and 

relatedness=0.863


'Other treatments may be given if you are \nwaiting for a transplant. These treatments are \ncalled bridging therapy and include ablation and \nembolization. \nThere is still a chance that cancer will return \nafter a liver transplant. It is also possible your \nbody will reject the donor liver. You will be given \nmedicine to prevent rejection. \nBridge therapy\nThere can be long wait times to receive a liver \ntransplant. Treatments may be given while you \nwait for a transplant. These treatments are \ncalled bridge or bridging therapy and include \nablation, embolization, or radiation therapy.\nLiver transplant\nThe liver is divided into \n8 sections or segments \nbased on the location of \nthe portal vein, hepatic \nvein, and bile ducts.31\nNCCN Guidelines for Patients® \nLiver Cancer, 20213 Treatment overview  Liver transplant\nDownstaging therapy\nDownstaging therapy is used to reduce \nthe tumor burden in selected patients \nwith more advanced HCC (without distant \nmetastasis) 

relatedness=0.856


'healthy enough for resection. A tumor \nthat cannot be removed with surgery is \ncalled unresectable. A liver transplant \nmight be an option for some. Together, \nyou and your doctor will choose a \ntreatment plan that is best for you.\nOverview\nSurgery to remove a tumor is called resection. \nSurgery that removes the tumor with part of the \nliver is called partial hepatectomy. Sometimes, \nsurgery is not possible because of where the \ntumor is located or the liver is too damaged. In \naddition, sometimes liver transplant is not an \noption. \nThere are treatments if a partial hepatectomy or \nliver transplant are not options. \nTransplant\nTransplant may be an option if the following \nUNOS criteria are met:\n \x86Alpha-fetoprotein (AFP) levels are 1000 \nng/mL or less and tumor is 2 to 5 cm in \ndiameter or 2 to 3 tumors are 1 to 3 cm \n \x86No large veins, arteries, or bile ducts have \ncancer\n \x86No disease outside the liver (extrahepatic)If transplant is an option, then you

relatedness=0.853


'After a liver transplant, you will start \nsurveillance. Surveillance consists of testing on \na regular basis to watch for signs that cancer \nhas returned. Imaging tests and blood tests to \nlook for alpha-fetoprotein (AFP) are needed. \nSurveillance includes:\n \x86Imaging tests every 3 to 6 months for 2 \nyears, then every 6 to 12 months\n \x86AFP every 3 to 6 months for 2 years, then \nevery 6 to 12 months46\nNCCN Guidelines for Patients® \nLiver Cancer, 20215 Unresectable  Treatment without surgery\nTreatment without surgery\nNot everyone is healthy enough for a liver \ntransplant. You may not want a liver transplant. \nThere are treatments if surgery or a liver \ntransplant are not options. For treatment \noptions, see Guide 5.\nLocoregional therapy\nLocoregional therapy focuses on the area \nor region where the cancer is. It includes \nablation, arterially directed therapy, and \nradiation therapy. These are the preferred \ntreatment options for those not receiving a liver \nt

relatedness=0.846


'Resectable treatment options\nSurgery to remove tumor (preferred)\nAblation\nArterially directed therapy\nExternal beam radiation therapy (EBRT)43\nNCCN Guidelines for Patients® \nLiver Cancer, 20214 Resectable  Transplant\n 4 Resectable  Transplant | Surveillance | Key points\nTransplant\nTransplant may be an option if the following \nUNOS criteria are met:\n \x86AFP levels are 1000 ng/mL or less and \ntumor is 2 to 5 cm in diameter or 2 to 3 \ntumors are 1 to 3 cm \n \x86No large veins, arteries, or bile ducts have \ncancer (no macrovascular involvement)\n \x86No disease outside the liver (extrahepatic)\nTransplant is an option\nIf transplant is an option, then you will:\n \x86Be referred to a liver transplant center that \nhas an experienced transplant team\n \x86Have bridging therapy \nBridging therapy is treatment given while \nwaiting for a transplant to prevent cancer \nfrom growing or spreading. If cancer grows or \nspreads, you might not be able to have a liver \ntransplant. 

### 3. Ask by appending related chunks as the context

In [38]:
GPT_MODEL = 'gpt-3.5-turbo'

def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt: instructions, the related sections that fit, then the question.

    Sections come from ``strings_ranked_by_relatedness(query, df)`` and are
    appended in ranked order. Appending stops at the first section that would
    push the whole prompt (question included) past ``token_budget`` tokens as
    counted by ``tiktoken_len``. ``model`` is accepted but not used here.
    """
    header = 'Use the below articles on the National Comprehensive Cancer Network Guideline Liver Cancer to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    footer = f"\n\nQuestion: {query}"
    ranked_texts, _ = strings_ranked_by_relatedness(query, df)

    prompt = header
    for text in ranked_texts:
        section = f'\n\nNational Comprehensive Cancer Network Guideline Liver Cancer section:\n"""\n{text}\n"""'
        candidate = prompt + section
        # Stop at the first section that does not fit; later ones are not tried.
        if tiktoken_len(candidate + footer) > token_budget:
            break
        prompt = candidate
    return prompt + footer


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer ``query`` with a chat model, using context drawn from ``df``.

    The prompt is assembled by ``query_message`` within ``token_budget`` and
    sent with a fixed system instruction at temperature 0. When
    ``print_message`` is true the assembled prompt is printed first. Returns
    the text content of the first choice in the response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions about the National Comprehensive Cancer Network Guideline Liver Cancer."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )
    return completion["choices"][0]["message"]["content"]

In [40]:
ask('What is the requirement for liver transplant?', print_message=True)

Use the below articles on the National Comprehensive Cancer Network Guideline Liver Cancer to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."

National Comprehensive Cancer Network Guideline Liver Cancer section:
"""
After a liver transplant, you will start 
surveillance. Surveillance consists of testing on 
a regular basis to watch for signs that cancer 
has returned. Imaging tests and blood tests to 
look for alpha-fetoprotein (AFP) are needed. 
Surveillance includes:
 Imaging tests every 3 to 6 months for 2 
years, then every 6 to 12 months
 AFP every 3 to 6 months for 2 years, then 
every 6 to 12 months46
NCCN Guidelines for Patients® 
Liver Cancer, 20215 Unresectable  Treatment without surgery
Treatment without surgery
Not everyone is healthy enough for a liver 
transplant. You may not want a liver transplant. 
There are treatments if surgery or a liver 
transplant are not options. For treatment 
options, see Gui

'The UNOS criteria for liver transplant include: AFP levels are 1000 ng/mL or less and tumor is 2 to 5 cm in diameter or 2 to 3 tumors are 1 to 3 cm, no large veins, arteries, or bile ducts have cancer, and no disease outside the liver (extrahepatic).'

In [41]:
ask('What is large language model?')

'I could not find an answer. The provided articles are about the National Comprehensive Cancer Network Guideline Liver Cancer and do not discuss the topic of large language model.'

In [42]:
ask('What are the stages of liver cancer?')

'The stages of liver cancer are Stage 1A, Stage 1B, Stage 2, Stage 3A, Stage 3B, Stage 4A, and Stage 4B. Each stage is defined by the size and number of tumors, involvement of blood vessels or nearby organs, lymph node involvement, and distant metastasis. The TNM system is also used to describe the extent of cancer growth.'

In [43]:
ask('How to diagnose liver cancer?')

'Liver cancer is usually confirmed with imaging tests such as CT or MRI scans. Blood tests, physical exams, and biopsies may also be used to diagnose liver cancer. A multidisciplinary team of doctors evaluates liver health and stages the cancer based on the number of tumors, tumor size(s) and location(s), and if the tumor(s) involves any blood vessels, bile ducts, or nearby organs. Family history and medical history are also taken into consideration to determine the best treatment plan. Regular screening tests such as ultrasounds and alpha-fetoprotein (AFP) tests may be recommended for those at risk of developing liver cancer. (Sources: National Comprehensive Cancer Network Guideline Liver Cancer sections on "Diagnosing liver cancer" and "Tests for treatment planning")'

In [44]:
ask('What kind of blood test can diagnose liver cancer?')

'An alpha-fetoprotein (AFP) test can diagnose liver cancer.'

In [46]:
ask('What drugs work for metastatic?')

'The preferred first-line systemic therapy for metastatic liver cancer is atezolizumab with bevacizumab. Other recommended options for first-line systemic therapy for metastatic liver cancer include sorafenib and lenvatinib. Regorafenib is an option for next-line systemic therapy for disease progression. Nivolumab and FOLFOX are also useful in some cases. (Sources: National Comprehensive Cancer Network Guideline Liver Cancer, Guide 10, Guide 11)'

In [47]:
ask('what are reactions of a tranplant')

'After a liver transplant, the body may reject the donor liver, but medicine will be given to prevent rejection. Additionally, surveillance consisting of regular testing is done to watch for signs of cancer returning.'

In [49]:
ask('what is the name of the tumor that can be removed')

'The name of the tumor that can be removed is resectable.'

In [50]:
ask('can you give me a summary of this paper')

'The National Comprehensive Cancer Network Guideline Liver Cancer provides information on diagnosing and treating liver cancer. The guideline emphasizes the importance of organizing medical records and seeking a second opinion. It also provides information on general health tests, physical exams, and blood tests. The guideline discusses treatment options, including surgery, liver transplant, ablation, embolization, and radiation therapy. It also covers clinical trials and supportive care. The guideline encourages patients to discuss their treatment goals with their doctor and seek treatment at a hospital or cancer center that specializes in liver cancer.'

In [51]:
ask('i have a quiz tomorrow on this paper, what are the possible quiz questions')

'As an AI language model, I do not have access to your quiz questions. However, based on the information provided in the National Comprehensive Cancer Network Guideline Liver Cancer, possible quiz questions could include:\n\n1. What are some questions to ask your doctors about testing and staging for liver cancer?\n2. What are some questions to ask your doctors about their experience treating liver cancer?\n3. What are some questions to ask about treatment options for liver cancer?\n4. What are some questions to ask about surgery for liver cancer?\n5. What are some questions to ask about radiation therapy for liver cancer?\n6. What are some questions to ask about clinical trials for liver cancer?\n7. What are some questions to ask about side effects of liver cancer treatment?\n8. What is a pain diary and how can it be helpful for managing liver cancer pain?\n9. What is a resectable tumor and what are some treatment options for it?\n10. What are some resources available for liver cancer

In [53]:
ask('How does atezolizumab work?')

'Atezolizumab is an immunotherapy drug that increases the activity of the immune system to find and destroy cancer cells. It is a targeted therapy that blocks the signals that cause liver cancer to grow and spread.'