In [1]:
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [2]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
set_seed(42)

In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [4]:
# optimizer
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in t5_model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in t5_model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)

In [25]:
hypernym_finder_tuples = [("The hypernym for red is","The hypernym for red is color"),
                          ("The hypernym for green is","The hypernym for green is color"),
                          ("The hypernym for yellow is","The hypernym for yellow is color"),
                          ("The hypernym for hydrogen peroxide is","The hypernym for hydrogen peroxide is molecule"),
                          ("The hypernym for hydrogen peroxide is","The hypernym for hydrogen peroxide is chemical"),
                          ("The hypernym for heart is","The hypernym for heart is body part"),
                          ("The hypernym for heart is","The hypernym for heart is anatomical structure"),
                          ("The hypernym for bipolar is","The hypernym for bipolar is mood disorder"),
                          ("The hypernym for bipolar is","The hypernym for bipolar is mental illness"),
                          ("The hypernym for eukaryote is","The hypernym for x is y"),
                          ("The hypernym for Asia is","The hypernym for Asia is Continent"),
                          ("The hypernym for Europe is","The hypernym for Europe is Continent"),
                          ("The hypernym for pitbull is","The hypernym for pitbull is dog"),
                          ("The hypernym for german shepherd is","The hypernym for german shepherd is dog"),
                          ("The hypernym for greyhound is","The hypernym for greyhound is dog"),
                          ("The hypernym of atrial fibrillation is","The hypernym of atrial fibrillation is arrhythmia"),
                          ("The hypernym of ventricular tachycardia is","The hypernym of ventricular tachycardia is arrhythmia"),
                          ("The hypernym for antidepressant is","drug"),
                          ("The hypernym for amlodipine is","drug"),
                          ("The hypernym for doxorubicin is","chemotherapy"),
                          ("The hypernym for diabetes is","The hypernym for diabetes is disease"),
                          ("The hypernym for polymyalgia rheumatica is","The hypernym for polymyalgia rheumatica is disease"),
                          ("The hypernym for myocardial infarction is","The hypernym for myocardial infarction is heart attack"),
                          ("The hypernym for serotonin is","The hypernym for serotonin is molecule"),
                          ("The hypernym for nucleotide is","The hypernym for nucleotide is molecule"),
                          ("The hypernym for antifibrinolytic is","The hypernym for antifibrinolytic is blood thinner"),
                          ("The hypernym for plavix is","The hypernym for plavix is blood thinner"),
                          ("The hypernym for rivaroxaban is","The hypernym for rivaroxaban is blood thinner"),
                          ("The hypernym for forceps is","The hypernym for forceps is tool"),
                          ("The hypernym for wrench is","The hypernym for wrench is tool"),
                          ("The hypernym for lysine is","The hypernym for lysine is amino acid"),
                          ("The hypernym for lysine is","The hypernym for lysine is molecule"),
                          ("The hypernym for atria is","The hypernym for atria is heart chamber"),
                          ("The hypernym for atria is","The hypernym for atria is body part"),
                          ("The hypernym for duodenum is","The hypernym for duodenum is body part"),
                          ("The hypernym for keppra is","The hypernym for keppra is antiepileptic"),
                          ("The hypernym for keppra is","The hypernym for keppra is antiepileptic"),
                          ("The hypernym for predecessor is","The hypernym for predecessor is precursor"),
                          ("The hypernym for president is","The hypernym for president is leader"),
                          ("The hypernym for letter is","The hypernym for letter is character"),
                          ("The hypernym for apple is","The hypernym for apple is fruit"),
                          ("The hypernym for orange is","The hypernym for orange is fruit"),
                          ("The hypernym for orange is","The hypernym for orange is color"),
                          ]



In [37]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

t5_model.train()

epochs = 15

for epoch in range(epochs):
  print ("epoch ",epoch)
  for input,output in hypernym_finder_tuples:
    input_sent = "find hypernym: "+input+ " </s>"
    ouput_sent = output+" </s>"

    tokenized_inp = tokenizer.encode_plus(input_sent,  max_length=96, pad_to_max_length=True,return_tensors="pt")
    tokenized_output = tokenizer.encode_plus(ouput_sent, max_length=96, pad_to_max_length=True,return_tensors="pt")


    input_ids  = tokenized_inp["input_ids"]
    attention_mask = tokenized_inp["attention_mask"]

    labels= tokenized_output["input_ids"]
    decoder_attention_mask=  tokenized_output["attention_mask"]


    # the forward function automatically creates the correct decoder_input_ids
    output = t5_model(input_ids=input_ids, labels=labels,decoder_attention_mask=decoder_attention_mask,attention_mask=attention_mask)
    loss = output[0]

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()


epoch  0
epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9
epoch  10
epoch  11
epoch  12
epoch  13
epoch  14


In [38]:
# Import CWI modules and point machine path to temporary path that the CWI module uses to do work
from complex_labeller import Complexity_labeller
model_path = './cwi_seq.model'
temp_path = './temp_file.txt'
model = Complexity_labeller(model_path, temp_path)
#2044


ValueError: Variable word_embeddings already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/home/karl/PycharmProjects/DLAI/translate/labeler.py", line 118, in construct_network
    self.word_embeddings = tf.get_variable("word_embeddings",
  File "/home/karl/PycharmProjects/DLAI/translate/labeler.py", line 458, in load
    labeler.construct_network()
  File "/home/karl/PycharmProjects/DLAI/translate/complex_labeller.py", line 21, in __init__
    self.model = labeler.SequenceLabeler.load(self.model_path)
  File "/tmp/ipykernel_54745/2995099527.py", line 5, in <module>
    model = Complexity_labeller(model_path, temp_path)
  File "/home/karl/anaconda3/envs/rapids-21.10/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


In [72]:
def autotranslate(document):
    return_list = []
    #print('\n#####################NEW ENTRY#######################')
    #print(cui)
    old_document = str(document)
    new_document = str(document)
    cw_list = []

    #try:
    Complexity_labeller.convert_format_string(model, document)
    dataframe = Complexity_labeller.get_dataframe(model)
    cw_list = list(zip(dataframe['sentences'].values[0], dataframe['labels'].values[0], dataframe['probs'].values[0]))
    #except:
    #pass
    print(cw_list)
    for i in cw_list:
        if i[1] == 1:

            test_sent = 'find hypernym: The hypernym for {} is</s>'.format(str(i[0]))
            test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

            test_input_ids  = test_tokenized["input_ids"]
            test_attention_mask = test_tokenized["attention_mask"]

            t5_model.eval()
            beam_outputs = t5_model.generate(
                input_ids=test_input_ids,attention_mask=test_attention_mask,
                max_length=60,
                early_stopping=True,
                num_beams=10,
                num_return_sequences=1,
                no_repeat_ngram_size=2
            )

            for beam_output in beam_outputs:
                sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
                print(sent)
                new_document = old_document.replace(str(i[0]), str(sent.split(' ')[-1]))
                old_document = new_document
    return document, new_document


In [73]:
documents = ['precursor of serotonin used as antiepileptic and antidepressant.',
             'Salts and esters of ALGINIC ACID that are used as HYDROGELS; DENTAL IMPRESSION MATERIALS, and as absorbent materials for surgical dressings (BANDAGES, HYDROCOLLOID).',
             'A state due to excess loss of carbon dioxide from the body.']

In [74]:
for document in documents:
    old_doc, new_doc = autotranslate(document)
    print(old_doc,'\n', new_doc)

[('precursor', 1, array([0.10260871, 0.89739126], dtype=float32)), ('of', 0, array([9.999379e-01, 6.214276e-05], dtype=float32)), ('serotonin', 1, array([0.21891482, 0.7810852 ], dtype=float32)), ('used', 0, array([9.992920e-01, 7.079054e-04], dtype=float32)), ('as', 0, array([9.9993837e-01, 6.1578088e-05], dtype=float32)), ('antiepileptic', 1, array([0.02242535, 0.97757465], dtype=float32)), ('and', 0, array([9.999101e-01, 8.985217e-05], dtype=float32)), ('antidepressant', 1, array([0.03892209, 0.9610779 ], dtype=float32)), ('.', 0, array([9.9995625e-01, 4.3727112e-05], dtype=float32))]




The hypernym for precursor is precursor
The hypernym for serotonin is molecule
drug
drug
precursor of serotonin used as antiepileptic and antidepressant. 
 precursor of molecule used as drug and drug.
[('Salts', 1, array([0.25910294, 0.7408971 ], dtype=float32)), ('and', 0, array([9.9987948e-01, 1.2050271e-04], dtype=float32)), ('esters', 1, array([0.2572822 , 0.74271786], dtype=float32)), ('of', 0, array([9.9994743e-01, 5.2535663e-05], dtype=float32)), ('ALGINIC', 0, array([0.9330063, 0.0669937], dtype=float32)), ('ACID', 0, array([0.97299373, 0.02700627], dtype=float32)), ('that', 0, array([9.9992144e-01, 7.8585384e-05], dtype=float32)), ('are', 0, array([9.9993289e-01, 6.7087574e-05], dtype=float32)), ('used', 0, array([0.9986003 , 0.00139964], dtype=float32)), ('as', 0, array([9.9993026e-01, 6.9689435e-05], dtype=float32)), ('HYDROGELS', 0, array([0.9472308 , 0.05276922], dtype=float32)), (';', 0, array([9.9995840e-01, 4.1616262e-05], dtype=float32)), ('DENTAL', 0, array([0.9404594

In [None]:
                                                                                                                                                                                                                                                                                # You stopped here. You were thinking about running this on a few examples and tidying up the function to replace the
# actual words. May need a few more "shots" and a few more epochs for accuracy.

In [None]:
# From here on in the code were tests - can ignore, or play
#pip uninstall transformers
#pip install git+https://github.com/StellaAthena/transformers
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [4]:
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

In [7]:
from transformers import pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [10]:
generator("The hypernym for atrial fibrillation is", do_sample=True, max_length=10)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 11, but ``max_length`` is set to 10. This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


[{'generated_text': 'The hypernym for atrial fibrillation is at'}]

In [12]:
import nlpcloud

In [13]:
client = nlpcloud.Client("gpt-j", "your_token", gpu=True)

In [14]:
generation = client.generation("""Context: The heart's upper chambers (atria) beat out of coordination with the lower chambers (ventricles). This condition may have no symptoms, but when symptoms do appear they include palpitations, shortness of breath, and fatigue. Treatments include drugs, electrical shock (cardioversion), and minimally invasive surgery (ablation).
            Question: What is a general hypernym for atrial fibrillation that an 8th grader can understand?
            Answer: arrhythmia
            ###
            Context: Doxorubicin is a type of chemotherapy drug called an anthracycline. It slows or stops the growth of cancer cells by blocking an enzyme called topo isomerase 2. Cancer cells need this enzyme to divide and grow. You might have doxorubicin in combination with other chemotherapy drugs.
            Question: What is a general hypernym for doxorubicin that an 8th grader can understand?
            Answer: Chemotherapy
            ###
            Context: Diabetes is a chronic (long-lasting) health condition that affects how your body turns food into energy. Most of the food you eat is broken down into sugar (also called glucose) and released into your bloodstream. When your blood sugar goes up, it signals your pancreas to release insulin
            Question: What is a general hypernym for diabetes that an 8th grader can understand?
            Answer:""",
    length_no_input=True,
    end_sequence="\n###",
    remove_input=True)
print(generation["generated_text"])

#  Use mostly wn for hypernym
# Need to build out cosine similarity for hypernym defs
# If no sub possible with WN - then use UMLS
# May be best to reconsitute sentence after wn sub - rerun CWI - then run biomedical translator? probs not

HTTPError: 401 Client Error: Unauthorized for url: https://api.nlpcloud.io/v1/gpu/gpt-j/generation: 

In [None]:
from translate_functions import find_complex_words, wn_hypernym_sub, sub_umls_hypernym, recon_doc.py,

#from sub_umls_hypernym import sub_umls_hypernym
#from recon_doc import recon_doc
#from runGinger import runGinger
#from grade_the_document import grade_the_document

In [None]:
!nvidia-smi

In [None]:
"""GPT Is a powerful tool, but it is known to give falsehoods. This falsehood return is worse with larger models
as the model learns the way humans give falsehood answers. https://twitter.com/owainevans_uk/status/1438472786188636162
For this reason, we believe that few-shot training at the very least and specific parameters of what the model should do
is necessary for use in biomedical text dejargonization"""


In [None]:
for line in open('/home/karl/PycharmProjects/DLAI/datasets/UMLS/MRDEF_test.txt'):
    splitline = line.split('|')
    cui = splitline[0]
    document = splitline[1]
    cw_list = find_complex_words(document)
    cw_list = wn_hypernym_sub(cw_list)