In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

In [2]:
!python --version

Python 3.10.13


In [3]:
!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import json
from datasets import Dataset

# Initialize empty lists to store data
ids = []
dialogues = []
summaries = []

# Open the JSONL file and read its contents line by line
with open("/kaggle/input/test-train-jsonl/train.jsonl", "r") as file:
    for line in file:
        # Parse each JSON object in the JSONL file
        data = json.loads(line)
        # Extract values for 'idx', 'inputs', and 'target'
        idx = data["idx"]
        dialogue = data["inputs"]
        summary = data["target"]
        # Append the values to respective lists
        ids.append(idx)
        dialogues.append(dialogue)
        summaries.append(summary)

# Create a Hugging Face dataset using the lists of data
dataset = Dataset.from_dict({
    "id": ids,
    "dialogue": dialogues,
    "summary": summaries
})
train_dataset = dataset

In [5]:
dataset[0]

{'id': 0,
 'dialogue': 'The lungs are clear, and without focal air space opacity. The cardiomediastinal silhouette is normal in size and contour, and stable. There is no pneumothorax or large pleural effusion.',
 'summary': 'No acute cardiopulmonary abnormality.'}

In [6]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

model_id = "meta-llama/Llama-2-7b-chat-hf"

# 
# load model in NF4 quantization with double quantization,
# set compute dtype to bfloat16
# 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # use_cache=False,
    # device_map="auto",
)
# model = prepare_model_for_kbit_training(model)
model.config.use_cache = False

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [7]:
def prompt_formatter(sample):
    return f"""<s>### Instruction:
You are a helpful, respectful and honest assistant. \
Your task is to summarize the following dialogue. \
Your answer should be based on the provided dialogue only.

### Dialogue:
{sample['dialogue']}

### Summary:
{sample['summary']} </s>"""

n = 0
print(prompt_formatter(train_dataset[n]))

<s>### Instruction:
You are a helpful, respectful and honest assistant. Your task is to summarize the following dialogue. Your answer should be based on the provided dialogue only.

### Dialogue:
The lungs are clear, and without focal air space opacity. The cardiomediastinal silhouette is normal in size and contour, and stable. There is no pneumothorax or large pleural effusion.

### Summary:
No acute cardiopulmonary abnormality. </s>


In [8]:
from transformers import TrainingArguments, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# 
# construct a Peft model.
# the QLoRA paper recommends LoRA dropout = 0.05 for small models (7B, 13B)
# 
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM", 
)
model = get_peft_model(model, peft_config)

# 
# set up the trainer
# 
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

args = TrainingArguments(
    output_dir="llama2-7b-chat-opr",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    optim="paged_adamw_32bit",
    bf16=False,
    fp16=True,
#     tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False,
)
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=256,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=prompt_formatter, 
    args=args,
)

2024-05-03 15:13:27.099181: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 15:13:27.099304: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 15:13:27.235779: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,2.302
20,1.7556
30,1.3239
40,1.1078
50,1.0889
60,1.037
70,1.0459
80,0.9406
90,0.9312
100,0.9882


In [10]:
output_folder = "llama2-7b-chat-opr-save"
model.save_pretrained(output_folder, safe_serialization=True)

trainer.save_model()

In [11]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

model_folder = "llama2-7b-chat-opr/"

# load both the adapter and the base model
model = AutoPeftModelForCausalLM.from_pretrained(
    model_folder,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained(model_folder)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
import json
from datasets import Dataset

# Initialize empty lists to store data
ids = []
dialogues = []
summaries = []

# Open the JSONL file and read its contents line by line
with open("/kaggle/input/test-train-jsonl/test.jsonl", "r") as file:
    for line in file:
        # Parse each JSON object in the JSONL file
        data = json.loads(line)
        # Extract values for 'idx', 'inputs', and 'target'
        idx = data["idx"]
        dialogue = data["inputs"]
        summary = data["target"]
        # Append the values to respective lists
        ids.append(idx)
        dialogues.append(dialogue)
        summaries.append(summary)

# Create a Hugging Face dataset using the lists of data
test_dataset = Dataset.from_dict({
    "id": ids,
    "dialogue": dialogues,
    "summary": summaries
})

In [24]:
sample = test_dataset[201]

prompt = f"""### Instruction:
You are a helpful, respectful and honest assistant. \
Your task is to summarize the following dialogue. \
Your answer should be based on the provided dialogue only.

### Dialogue:
{sample['dialogue']}

### Summary:
"""
print(prompt)

### Instruction:
You are a helpful, respectful and honest assistant. Your task is to summarize the following dialogue. Your answer should be based on the provided dialogue only.

### Dialogue:
Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.

### Summary:



In [25]:
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids=input_ids, max_new_tokens=50, temperature=0.7)

print('Output:\n',
      tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):])
print('\nGround truth:\n', sample['summary'])

Output:
 No acute cardiopulmonary abnormalities 

Ground truth:
 Normal chest.


In [None]:
import torch
from peft import AutoPeftModelForCausalLM

model_folder = "llama2-7b-chat-opr"

model = AutoPeftModelForCausalLM.from_pretrained(
    model_folder,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16
)

# merge the lora adapter and the base model
merged_model = model.merge_and_unload()

In [None]:
from transformers import AutoTokenizer

output_folder = 'merged-llama2-7b-chat-opr-blank'

# save the merged model and the tokenizer
merged_model.save_pretrained(output_folder, safe_serialization=True)

# tokenizer = AutoTokenizer.from_pretrained(model_folder)
# tokenizer.save_pretrained(output_folder)


In [None]:
# Clear output folder
import os

def remove_folder_contents(folder):
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except Exception as e:
            print(e)

folder_path = '/kaggle/working/merged-llama2-7b-chat-opr-blank'
remove_folder_contents(folder_path)
os.rmdir(folder_path)

In [36]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_folder = 'merged-llama2-7b-chat-opr'

tokenizer = AutoTokenizer.from_pretrained(model_folder)

model = AutoModelForCausalLM.from_pretrained(
    model_folder,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map="auto",
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
from transformers import pipeline, GenerationConfig

gen_config = GenerationConfig.from_pretrained(model_folder)
gen_config.max_new_tokens = 50
gen_config.temperature = 0.7
gen_config.repetition_penalty = 1.1
gen_config.pad_token_id = tokenizer.eos_token_id

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map='auto',
    generation_config=gen_config,
)

In [None]:
sample = test_dataset[44]

prompt = f"""### Instruction:
You are a helpful, respectful and honest assistant. \
Your task is to summarize the following dialogue. \
Your answer should be based on the provided dialogue only.

### Dialogue:
{sample['dialogue']}

### Summary:
"""

output = pipe(prompt)

print('Output:\n', output[0]['generated_text'][len(prompt):])
print('\nGround truth:\n', sample['summary'])

In [None]:
output_dir = "test/saved"
model.save_pretrained(output_dir, safe_serialization=False)

In [None]:
from IPython.display import FileLink
FileLink(r'/kaggle/working/llama2-7b-chat-opr')

In [None]:
import os
from zipfile import ZipFile

# Define the directory containing the files
directory = '/kaggle/working/merged-llama2-7b-chat-opr'

# Create a list of files in the directory
files = os.listdir(directory)

# Create a ZIP archive containing all the files
with ZipFile('/kaggle/working/merged_files.zip', 'w') as zipf:
    for file in files:
        # Add each file to the ZIP archive
        zipf.write(os.path.join(directory, file), file)

# Provide a download link for the ZIP archive
from IPython.display import FileLink
FileLink(r'/kaggle/working/merged_files.zip')


In [None]:
!rm -rf /kaggle/working/merged_files.zip

In [32]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = "/kaggle/working/llama2-7b-chat-opr-blank", file_name = 'outputs.zip'):
    """
    zip all the files in a directory
    
    Parameters
    _____
    directory: str
        directory needs to be zipped, defualt is current working directory
        
    file_name: str
        the name of the zipped file (including .zip), default is 'directory.zip'
        
    Returns
    _____
    Creates a hyperlink, which can be used to download the zip file)
    """
    os.chdir(directory)
    zip_ref = zipfile.ZipFile(file_name, mode='w')
    for folder, _, files in os.walk(directory):
        for file in files:
            if file_name in file:
                pass
            else:
                zip_ref.write(os.path.join(folder, file))

    return FileLink(file_name)

In [33]:
zip_dir()

In [34]:
!pip install bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [39]:
# from bert_score import score

# input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
# outputs = model.generate(input_ids=input_ids, max_new_tokens=50, temperature=0.7, )

# print('Output:\n',
#       tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):])
# print('\nGround truth:\n', sample['summary'])

                       
# print(score(tokenizer.batch_decode(outputs.detach().cpu().numpy(), sample["summary"], lang="en", verbose=True)))

sample = test_dataset[100]

prompt = f"""### Instruction:
You are a helpful, respectful and honest assistant. \
Your task is to summarize the following dialogue. \
Your answer should be based on the provided dialogue only.

### Dialogue:
{sample['dialogue']}

### Summary:
"""
print(prompt)

from bert_score import score

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

outputs = model.generate(input_ids=input_ids, max_new_tokens=50, temperature=0.7)

predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print('Output:\n', predictions[0])
print('\nGround truth:\n', sample['summary'])

P, R, F1 = score([predictions[0]], [sample['summary']], lang="en", verbose=True)

print(f"Precision: {P.mean():.2f}")
print(f"Recall: {R.mean():.2f}")
print(f"F1-Score: {F1.mean():.2f}")

### Instruction:
You are a helpful, respectful and honest assistant. Your task is to summarize the following dialogue. Your answer should be based on the provided dialogue only.

### Dialogue:
The heart size is normal. The mediastinal contour is within normal limits. The lungs are free of any focal infiltrates. There are no nodules or masses. No visible pneumothorax. No visible pleural fluid. The XXXX are grossly normal. There is no visible free intraperitoneal air under the diaphragm.

### Summary:

Output:
 ### Instruction:
You are a helpful, respectful and honest assistant. Your task is to summarize the following dialogue. Your answer should be based on the provided dialogue only.

### Dialogue:
The heart size is normal. The mediastinal contour is within normal limits. The lungs are free of any focal infiltrates. There are no nodules or masses. No visible pneumothorax. No visible pleural fluid. The XXXX are grossly normal. There is no visible free intraperitoneal air under the diaph

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.06 seconds, 15.59 sentences/sec
Precision: 0.80
Recall: 0.86
F1-Score: 0.83
