<a href="https://colab.research.google.com/github/jihedouni/CLCT/blob/main/Models/LED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Oct 29 11:27:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report the runtime's total RAM and whether it counts as a high-RAM runtime.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9  # bytes -> (decimal) gigabytes
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses for "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Google Colab only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentencepiece
!pip install nltk
!pip install datasets
!pip install transformers

Import

In [None]:
from datasets import load_dataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
)
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt", quiet=True)

import gdown

#CSV 
import csv 
import re
import torch

In [None]:
#dataset
# Download the shared Google Drive folder into ./german_dataset.
url = "https://drive.google.com/drive/folders/1nkNg5LZ_KNYM9kxlwAc1_Ek9H0zTEQVc"
gdown.download_folder(url, output='german_dataset')
# Keep the trailing slash: later cells build CSV paths by plain string concatenation.
dataset_path = "german_dataset/"

In [None]:
#model
# Notebook-wide configuration read by the preprocessing, training and generation cells.
model_or_path = "allenai/led-large-16384"  # checkpoint id passed to from_pretrained
path_output_directory = "res_model/"  # per-fold outputs are written to res_model/Step<id>
encoder_max_length = 8192  # source texts are padded/truncated to this many tokens
decoder_max_length = 512  # target summaries are padded/truncated to this many tokens
batch_size = 1  # used for both dataset.map batches and the per-device train batch

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Dataset

Example

In [None]:
 #example texts - fold 1
 # Step 1 CSVs used only for the preview cells below.
 # Note: `test_path` points at the *validation* file of the fold.
 train_path = dataset_path + "German_Train_hDe_to_En_Step_1.csv"
 test_path = dataset_path + "German_Val_hDe_to_En_Step_1.csv"

In [None]:
# Preview one training example: the row(s) of the fold-1 train CSV with id 15.
df = pd.read_csv(train_path, sep=';')
print(df.loc[df["id"].eq(15)])

In [None]:
# validation split (not train): preview the row(s) with id 7.
# Note: this rebinds `df`, replacing the training frame loaded in the previous cell.
df = pd.read_csv(test_path, sep=';')
print(df[df["id"]==7])

Preprocessing

In [None]:
#https://huggingface.co/allenai/led-large-16384
#https://huggingface.co/transformers/v3.0.2/notebooks.html
def process_data_to_model_inputs(batch):
    """Tokenize a batch of document/summary pairs into LED training features.

    Reads the notebook globals ``tokenizer``, ``encoder_max_length`` and
    ``decoder_max_length``.

    Args:
        batch: mapping with a ``"document"`` list (source texts) and a
            ``"summary"`` list (target texts) of equal length.

    Returns:
        A new dict containing every key produced by the tokenizer for the
        sources (``input_ids``, ``attention_mask``, ...) plus:

        * ``global_attention_mask`` - one row per sample, 1 at position 0
          and 0 everywhere else;
        * ``labels`` - the target token ids with every pad token replaced
          by -100 so that padding is ignored by the loss.

    The input ``batch`` is not modified; all features are returned
    explicitly instead of relying on side effects on the argument.
    """
    inputs = tokenizer(
        batch["document"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    outputs = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    features = dict(inputs.items())

    # Global attention on the first token of every sample. Each row is its
    # own list (no shared references), and an empty batch simply yields [].
    features["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in features["input_ids"]
    ]

    # Ignore padding in the loss.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token == pad_id else token for token in label_ids]
        for label_ids in outputs["input_ids"]
    ]
    return features

def generate_summary(input_text, model, tokenizer):
    """Generate summary text for ``input_text`` with an LED-style model.

    Args:
        input_text: a string (or list of strings) to summarise.
        model: a model exposing ``.device`` and
            ``.generate(input_ids, attention_mask=..., global_attention_mask=...)``.
        tokenizer: the tokenizer matching ``model``.

    Returns:
        A list of decoded strings (special tokens stripped), one per
        generated sequence.

    The source is padded/truncated to the notebook global
    ``encoder_max_length``. No generation arguments are passed here, so beam
    size, length limits, etc. are whatever the model itself is configured with.
    """
    inputs = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )

    # Put the inputs on whatever device the model lives on instead of a
    # hard-coded "cuda", so this also works on CPU-only runtimes.
    target_device = model.device
    input_ids = inputs.input_ids.to(target_device)
    attention_mask = inputs.attention_mask.to(target_device)

    # Global attention on the first token only, matching the training features.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Transformer

In [None]:
def finetune_model(id, model, tokenizer, dataset):
  """Fine-tune `model` on one fold and save the result.

  Args:
    id: fold/step identifier; only used to name the output directory
      (`path_output_directory` + "Step" + str(id)).
    model: the seq2seq model to train (trained in place).
    tokenizer: tokenizer handed to the data collator and the trainer.
    dataset: raw dataset for the fold; it is tokenized with
      `process_data_to_model_inputs` and its original columns are dropped.

  Returns:
    None. Checkpoints and the final model are written to the step directory.
  """
  step_dir = path_output_directory + "Step" + str(id)

  # Tokenize the fold, keeping only the features produced by the preprocessing.
  tokenized = dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset.column_names,
  )
  tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
  )

  # Forms the batches fed to the trainer from individual examples.
  collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  # Train-only run: no evaluation is performed during training.
  args = Seq2SeqTrainingArguments(
    output_dir=step_dir,
    do_train=True,
    evaluation_strategy="no",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,  # accumulate 4 steps per optimizer update
    fp16=True,  # 16-bit mixed precision instead of 32-bit training
    optim="adamw_torch",
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    logging_dir="logs",
    logging_steps=100,  # log every 100 update steps
    save_steps=1000,  # checkpoint every 1000 update steps
    save_total_limit=2,  # cap on the number of checkpoints kept
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=tokenized,
    tokenizer=tokenizer,
  )
  trainer.train()
  trainer.save_model(step_dir)

In [None]:
# Load the tokenizer once: it is the same for every fold, and
# process_data_to_model_inputs reads it as a notebook global.
tokenizer = AutoTokenizer.from_pretrained(model_or_path)

# Fine-tune a fresh copy of the pretrained model on each fold (steps 1..5).
# The loop variable is `fold_id` (not `id`) so the builtin `id` is not
# shadowed for the rest of the notebook.
for fold_id in range(1, 6):
  # fold i training dataset
  train_path = f"{dataset_path}German_Train_hDe_to_En_Step_{fold_id}.csv"
  dataset = load_dataset('csv', data_files=train_path, delimiter=";", split="train")

  # Start every fold from the pretrained checkpoint.
  model = AutoModelForSeq2SeqLM.from_pretrained(model_or_path, use_cache=False)

  # Generation hyperparameters stored on the model config.
  model.config.num_beams = 4
  model.config.max_length = 512
  model.config.min_length = 100
  model.config.length_penalty = 2.0
  model.config.early_stopping = True
  model.config.no_repeat_ngram_size = 3

  # No explicit model.to(device) here; device placement is left to the trainer.
  finetune_model(fold_id, model, tokenizer, dataset)
  print(fold_id, "is finetuned.")

# After the loop, `model` is the model trained on the last fold (fold 5).

Generation

In [None]:
# Placeholder input: replace the empty string with the document to summarise.
input_text = "" #add test text 
#example: generate Summary using fold 5
# `model` and `tokenizer` here are the globals left behind by the training loop,
# so this cell only works after that loop has run to completion.
print("Generate Summary (Example):")
print(generate_summary(input_text, model, tokenizer))