In [1]:
# from transformers import pipeline
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch
import os
import os.path as p
import json

In [2]:
from bastifonctions import *

import sys
sys.path.append("metrics")
from helper_metrics import *

In [3]:
data_path = "/home/etien/Documents/EPFLcourses/MA3/Meditron/generated_patients"
train_data, labels = download_data(data_path)

In [4]:
from transformers import EvalPrediction

# Custom metric

def compute_metrics(p: EvalPrediction, model, tokenizer):
    true_labels = p.label_ids
    input_ids = p.input_ids

    # Get the model's output
    with torch.no_grad():
        outputs = model(input_ids)
        # Assuming the model returns the loss as the first output and logits as the second output
        all_hidden_states = outputs.hidden_states
        second_last_layer = all_hidden_states[-2]

    with torch.no_grad():
        # freeze all the parameters
        for param in model.parameters():
            param.requires_grad = False
        
        outputs_true = model(true_labels)
        all_hidden_states_true = outputs_true.hidden_states
        second_last_layer_true = all_hidden_states_true[-2]

    # Compute the distance between the true label embeddings and the second last layer's output.
    distance = tensor_distance(second_last_layer, second_last_layer_true, distance_type="L2")

    return {"distance": distance.item()}


In [5]:
from transformers import LongformerForMaskedLM
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096", output_hidden_states=True)
#model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
retriever = 'TF-IDF'

In [6]:
fine_tune_model(model, train_data, labels, tokenizer, retriever, compute_metrics)

[4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 409

  return torch._C._cuda_getDeviceCount() > 0
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33metiennalphat[0m ([33mbastiteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2580 [00:00<?, ?it/s]

In [1]:
from tf_idf import *
# Load medical guidelines from a data folder
data_folder = "./data/structure/Guidelines/split_guidelines"
tf_idf_path = "./data/TF-IDF matrix"
tf_idf_matrix, vectorizer = create_matrix(tf_idf_path)


In [2]:
# Example usage:
query = "cholera"
relevant_guidelines = retrieve_top_k_guidelines(query,tf_idf_matrix, vectorizer, data_folder, k=3)

In [6]:
next(relevant_guidelines)

'{\n    "source": "wikidoc",\n    "title": "List of epidemics",\n    "url": "https://www.wikidoc.org/index.php/List_of_epidemics",\n    "text": "List of epidemics\\nThis article is a list of major epidemics.\\n\\n# Worldwide Pandemics\\n- 165-180: Antonine Plague, perhaps smallpox\\n- 541: the Plague of Justinian\\n- 1300s: the Black Death\\n- 1501-1587: typhus\\n- 1732-1733: influenza\\n- 1775-1776: influenza\\n- 1816-1826: cholera\\n- 1829-1851: cholera\\n- 1847-1848: influenza\\n- 1852-1860: cholera\\n- 1855-1950s: bubonic plague: Third Pandemic\\n- 1857-1859: influenza\\n- 1863-1875: cholera\\n- 1889-1892: influenza\\n- 1899-1923: cholera\\n- 1918-1920: avian flu: Spanish flu: more people were hospitalized in World War I from this epidemic than wounds. Estimates of the dead range from 20 to 40 million worldwide (WHO)\\n- 1960s: cholera called El Tor\\n- 1980s to present:  HIV\\n\\n# Regional\\n\\n## Asia\\n- 1957-1958: avian flu: Asian flu\\n- 1968-1969: avian flu: Hong Kong flu\\n

In [23]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator= data_collator
)

trainer.train()


Step,Training Loss


TrainOutput(global_step=25, training_loss=0.8925203704833984, metrics={'train_runtime': 232.5414, 'train_samples_per_second': 0.194, 'train_steps_per_second': 0.108, 'total_flos': 23516282880000.0, 'train_loss': 0.8925203704833984, 'epoch': 5.0})

In [26]:
text_gen_pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=-1)  # device=0 for GPU

In [29]:
texts[0]

'\n{\n  "symptoms": [\n    {\n      "name of the symptom": "Throbbing headaches",\n      "intensity of symptom": "8 out of 10",\n      "specific attributes of the symptom": {\n        "location": "Temples",\n        "size": "",\n        "color": "",\n        "frequency": "Intermittent"\n      },\n      "When did the symptom appear ": "",\n      "previous treatments": "Antihypertensive medications",\n      "reaction to previous treaments": "The patient reports that the headaches have not been completely resolved by the medication.",\n      "behaviour affecting the symptom": ""\n    },\n    {\n      "name of the symptom": "Palpitations",\n      "intensity of symptom": "7 out of 10",\n      "specific attributes of the symptom": {\n        "location": "",\n        "size": "",\n        "color": "",\n        "frequency": "Intermittent"\n      },\n      "When did the symptom appear ": "",\n      "previous treatments": "Anxiolytics",\n      "reaction to previous treaments": "The patient report

In [30]:
text_gen_pipeline(texts[5])

Token indices sequence length is longer than the specified maximum sequence length for this model (1749 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self

In [63]:
train_encodings

{'input_ids': tensor([[  198,    90,   198,  ..., 50256, 50256, 50256],
        [  198,    90,   198,  ..., 50256, 50256, 50256],
        [43094, 35533,    25,  ..., 50256, 50256, 50256],
        ...,
        [  198,    90,   198,  ..., 50256, 50256, 50256],
        [   90,   198,   220,  ..., 50256, 50256, 50256],
        [  198,    90,   198,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [37]:
from torch.utils.data import Dataset
from Dataset import load_dataset



train_dataset = load_dataset(train_encodings)


In [38]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [46]:
print(data_collator)

DataCollatorForLanguageModeling(tokenizer=GPT2Tokenizer(name_or_path='gpt2-medium', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')


In [55]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],
    save_steps=50,
    save_total_limit=2,
    fp16=False,
    fp16_full_eval=False,
    use_cpu = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()


TypeError: GPT2Model.forward() got an unexpected keyword argument 'labels'

In [53]:
model(**train_encodings['input_ids'])

TypeError: GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
) argument after ** must be a mapping, not Tensor

In [52]:
train_encodings

{'input_ids': tensor([[  198,    90,   198,  ..., 50256, 50256, 50256],
        [  198,    90,   198,  ..., 50256, 50256, 50256],
        [43094, 35533,    25,  ..., 50256, 50256, 50256],
        ...,
        [  198,    90,   198,  ..., 50256, 50256, 50256],
        [   90,   198,   220,  ..., 50256, 50256, 50256],
        [  198,    90,   198,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [14]:
%run 'Model/nanoGPT/train.py'

FileNotFoundError: [Errno 2] No such file or directory: 'configurator.py'

In [15]:
import os
import torch
from model import GPT, GPTConfig

def load_model(checkpoint_path=None, config=None):
    """
    Load the GPT model. If a checkpoint path is provided, load weights from the checkpoint.
    
    Args:
    - checkpoint_path (str, optional): Path to the model checkpoint.
    - config (GPTConfig, optional): Configuration for the model. If not provided, uses the default configuration.

    Returns:
    - model (GPT): The instantiated model.
    """
    
    # Use the provided config or create a default one
    if config is None:
        config = GPTConfig()

    # Instantiate the model
    model = GPT(config)
    # If a saved checkpoint is provided, load it
    if checkpoint_path and os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint)
    
    return model

# Usage
model = load_model("Model/checkpoint/checkpoint.ckpt")


number of parameters: 123.69M


In [16]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

TypeError: GPT.forward() got an unexpected keyword argument 'input_ids'

In [17]:
torch.save(model.state_dict(), "Model/checkpoint/checkpoint.ckpt")

In [18]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [19]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # or whichever size you're using


In [20]:
class NanoGpt2(GPT):
    
    def __init__(self, checkpoint_path, *args, **kwargs):
        config = GPTConfig()
        super().__init__(config)
        
        if checkpoint_path and os.path.exists(checkpoint_path):
            checkpoint = torch.load(checkpoint_path)
            model.load_state_dict(checkpoint)
        
        # If your NanoGPT doesn't have this attribute but transformers expect it
        if not hasattr(self.config, 'task_specific_params'):
            self.config.task_specific_params = None
        self.config.prefix = None

    def generate(self, input_ids=None, **kwargs):
        # Depending on how your NanoGPT's generate method is implemented,
        # you might need to translate `input_ids` to the expected format.
        
        # Here's a simple example:
        supported_args = {'temperature', 'top_k'}
        print(kwargs.items())
        filtered_kwargs = {k: v for k, v in kwargs.items() if k in supported_args}
        print()
        # Translate input_ids to idx
        idx = input_ids

        # Call the base GPT's generate method
        generated_idx = super().generate(idx=idx, max_new_tokens=kwargs['max_length'], **filtered_kwargs)

        return generated_idx

    # Add any other methods or attributes that are required by the pipeline 
    # but are not present in NanoGPT.



In [21]:
from model import GPT, GPTConfig
model2 = NanoGpt2("Model/checkpoint/checkpoint.ckpt")

number of parameters: 123.69M


In [25]:
from transformers import TextGenerationPipeline
if not hasattr(model.config, "task_specific_params"):
    model.config.task_specific_params = None
model.config.prefix = None
# Assuming `model` is your nanoGPT model adapted to GPT2LMHeadModel
text_gen_pipeline = TextGenerationPipeline(model=model2, tokenizer=tokenizer, device=-1)  # device=0 for GPU


NameError: name 'model2' is not defined

In [18]:
text_gen_pipeline("salut, ca va ?", max_length=50, top_k=20, temperature=0.7)

dict_items([('attention_mask', tensor([[1, 1, 1, 1, 1, 1]])), ('max_length', 50), ('top_k', 20), ('temperature', 0.7)])



[{'generated_text': 'salut, ca va ?9191 HurricanesingersCentralachersFather Hiroshima Hurricanes daring Maxim Interpret Zhuaan Floidences carpet utilizationacs Pand Cast CastCorrect Newark Jaw Trou Trou Hiroshima Hiroshima Hiroshima ANDStatNarrhetamineerential Shields foliage InsideANT Franklin glyphosate rupt daringSnapSnapubis CompConstruction flavoredfixed'}]

In [29]:
sample_data = ["Hello, how are you?"]
encodings_sample = tokenizer(sample_data, truncation=True, padding='max_length', max_length=2048, return_tensors='pt')
print(encodings_sample.keys())

dict_keys(['input_ids', 'attention_mask'])


In [25]:
from transformers import TrainingArguments, Trainer

# 1. Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,  # Adjust batch size as needed
    num_train_epochs=1,  # Adjust the number of epochs as needed
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
    evaluation_strategy="steps",
    report_to=[],
    eval_steps=10,
    learning_rate=5e-5,  # Default learning rate, you can adjust this value
    # Add more arguments as needed
)

# 2. Initialize the Trainer
trainer = Trainer(
    model=model2,  # your model
    args=training_args,
    train_dataset=train_dataset,
    # Optionally, you can also provide an evaluation dataset
    # eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# 3. Train the model
trainer.train()


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'