# T5 Models


In [1]:
import os
import glob
import tensorflow as tf
import pandas as pd
import numpy as np
import zipfile as zf
from glob import glob

# from keras.saving.hdf5_format import save_attributes_to_hdf5_group
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelWithLMHead
    # T5Tokenizer,
    # TFT5Model,
    # TFT5ForConditionalGeneration,
    # TFBertModel,
    # TFBertForQuestionAnswering,
)
import sentencepiece
from metapub import PubMedFetcher
from semanticscholar import SemanticScholar
from metapub import FindIt
import requests
import urllib
import json

2023-07-20 05:31:27.780607: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-20 05:31:27.829557: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-20 05:31:27.830767: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# import functions from a file stored in a different directory
import sys

sys.path.append("/home/ubuntu/work/therapeutic_accelerator/scripts/utils")
sys.path.append("/home/ubuntu/work/therapeutic_accelerator/scripts/database")

from utils import import_config
from db_tools import db_connection

config, keys = import_config()

# engine = db_connection(password = keys["postgres"], host = config["database"]["host"])
import chromaDB_connection as cDBc

# Chroma setup
chroma_server_host = "34.238.51.66"
chroma_client = cDBc.create_chroma_client(chroma_server_host)

# Working Collection
collection = chroma_client.get_or_create_collection("specter_abstracts")

In [4]:
# Hard coded variables
max_sequence_length = 512
embedding_size = 200

In [5]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
# T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
# T5tokens = T5Tokenizer.from_pretrained("t5-base")

In [20]:
### T5 Abstractive Text Summarization Model
class summary_model:
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def summarize_text(self, text):

        encoding = self.tokenizer.encode(
            "summarize: " + text,
            return_tensors="pt",
            max_length = 3000,
            truncation=True
            )

        output = self.model.generate(
            encoding,
            num_beams=3,
            no_repeat_ngram_size=2,
            top_k=10,
            top_p=80,
            max_length=150,
            min_length=30,
        )
        
        summarization = [
            self.tokenizer.decode(
                w, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for w in output
        ][0]
        
        return summarization

# Get Results from ChromaDB


In [8]:
question = "What is pancreatic cancer?"
results = cDBc.query_chroma(collection, question)
results

{'ids': ['38374595-0',
  '11181159-0',
  '203622768-0',
  '211474643-0',
  '232429176-0',
  '38325820-0',
  '10984456-0',
  '234597674-0',
  '29710885-0',
  '26735832-0'],
 'distances': [209.34613037109375,
  227.97434997558594,
  228.57211303710938,
  291.1222839355469,
  294.4068603515625,
  296.2919006347656,
  302.37457275390625,
  306.3112487792969,
  310.4975891113281,
  317.519287109375],
 'embeddings': None,
 'metadatas': [{'corpusid': 38374595, 'chunk': 0},
  {'corpusid': 11181159, 'chunk': 0},
  {'corpusid': 203622768, 'chunk': 0},
  {'corpusid': 211474643, 'chunk': 0},
  {'corpusid': 232429176, 'chunk': 0},
  {'corpusid': 38325820, 'chunk': 0},
  {'corpusid': 10984456, 'chunk': 0},
  {'corpusid': 234597674, 'chunk': 0},
  {'corpusid': 29710885, 'chunk': 0},
  {'corpusid': 26735832, 'chunk': 0}],
 'documents': ['Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have dem

In [None]:
# for tokenization and splitting text
text_splitter = cDBc.text_splitter(tokenizer = tokenizer, chunk_size=512, chunk_overlap=20).create_text_splitter()

In [None]:
docs = text_splitter.create_documents(results['documents'])

In [None]:
docs

[Document(page_content='Microvascular density (MVD), a marker for tumor angiogenesis, has been demonstrated to have prognostic significance in various malignancies. Previous studies have demonstrated that MVD is an independent prognostic factor in pancreatic adenocarcinoma and that longer survival is associated with hypovascular tumors. The prognostic importance of MVD in pancreatic neuroendocrine tumor (NET) has not been documented. We evaluated MVD in pancreatic NET and correlated it with clinicopathologic features and patient outcome to determine whether MVD is a useful prognostic indicator for these patients. Twenty-five pancreatic NETs from our archival files resected between 1981 and 2000 were identified. The mean MVD was determined for each tumor from the 3 most vascularized 200 × fields. Clinical follow-up ranged from 1 to 19 years, with a mean of 4.9 years. At last follow-up, 6 patients were dead of disease, 10 patients were alive without disease,', metadata={}),
 Document(pag

In [None]:
# # Questions
# prompts = pd.read_csv("/home/ubuntu/work/therapeutic_accelerator/data/prompts.csv")
# # testing prompt one
# question = prompts.loc[0, "Prompt"]

# Testing


In [21]:
t5_model = summary_model(tokenizer, model)

In [23]:
text = ''.join(results['documents'])

In [29]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# def split_text_into_chunks(text, max_chunk_len=512):
#     chunks = []
#     current_chunk = ""
#     for sentence in text.split('. '):  # Split by sentences, adjust as needed
#         if len(current_chunk) + len(sentence) < max_chunk_len:
#             current_chunk += sentence + '. '
#         else:
#             chunks.append(current_chunk.strip())
#             current_chunk = sentence + '. '
#     if current_chunk:
#         chunks.append(current_chunk.strip())
#     return chunks

def summarize_text_with_t5(text):
    model_name = "t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # chunks = split_text_into_chunks(text)
    summaries = []
    chunks = text_splitter.split_text(text)
    
    for chunk in chunks:
        inputs = tokenizer.encode("summarize: " + chunk, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    final_summary = " ".join(summaries)
    return final_summary


In [30]:
# Generate the summary
summary = summarize_text_with_t5(text)
print(summary)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


microvascular density (MVD) is a marker for tumor angiogenesis. MVD is an independent prognostic factor in pancreatic adenocarcinoma. longer survival is associated with hypovascular tumors. 10 patients alive without disease, 4 alive with disease, 5 alive with disease status unknown. MVD in pancreatic NET does not correlate with tumor size, histologic parameters or outcome. most tumors were highly vascular, and additional studies may be helpful. intrahepatic cholangiocarcinoma typically occurs in males between 50 and 70 years of age. symptoms include jaundice, pruritus, dark urine, abdominal pain, weight loss, fever. a 9 cm liver lesion was discovered, initially suspicious for hepatocellular carcinoma. report shows unusual presentation of intrahepatic cholangiocarcinoma. cholangio-carcinoma is diagnosed most frequently as an incidental finding. recurrence rate was 27.7% (64/231) in the study group. rectal cancer, preoperative serum CEA level >5.0 ng/ml 95% risk factors. prostate cancer 

In [38]:
for i in summary.split('.'): 
    print(i)

microvascular density (MVD) is a marker for tumor angiogenesis
 MVD is an independent prognostic factor in pancreatic adenocarcinoma
 longer survival is associated with hypovascular tumors
 10 patients alive without disease, 4 alive with disease, 5 alive with disease status unknown
 MVD in pancreatic NET does not correlate with tumor size, histologic parameters or outcome
 most tumors were highly vascular, and additional studies may be helpful
 intrahepatic cholangiocarcinoma typically occurs in males between 50 and 70 years of age
 symptoms include jaundice, pruritus, dark urine, abdominal pain, weight loss, fever
 a 9 cm liver lesion was discovered, initially suspicious for hepatocellular carcinoma
 report shows unusual presentation of intrahepatic cholangiocarcinoma
 cholangio-carcinoma is diagnosed most frequently as an incidental finding
 recurrence rate was 27
7% (64/231) in the study group
 rectal cancer, preoperative serum CEA level >5
0 ng/ml 95% risk factors
 prostate cancer 

In [25]:
summarization = t5_model.summarize_text(text)
summarization

'recurrence rate is 27.7% in patients with stage III colorectal cancer. a high plasma osteopontin level predicts poor response to radiotherapy, study finds. authors: MVD is an independent prognostic factor in pancreatic neuroendocrine tumor (NET)'

In [None]:
from langchain.chains import LLMChain
chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain only specifying the input variable.
print(chain.run("colorful socks"))