<a href="https://colab.research.google.com/github/jihedouni/CLCT/blob/main/Models/LED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Oct 29 11:27:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report the runtime's total memory (in GB, base 10) and whether it reaches the 20 GB high-RAM mark.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentencepiece
!pip install nltk
!pip install datasets
!pip install transformers

Import

In [None]:
from datasets import load_dataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
)
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt", quiet=True)

import gdown

#CSV 
import csv 
import re
import torch



In [None]:
# Fetch the shared Google Drive folder with gdown and store it under 'german_dataset'.
url = "https://drive.google.com/drive/folders/1nkNg5LZ_KNYM9kxlwAc1_Ek9H0zTEQVc"
gdown.download_folder(url, output='german_dataset')

In [None]:
# Checkpoint identifier handed to from_pretrained for both the model and the tokenizer.
model_or_path = "pszemraj/led-large-book-summary"
# Directory prefix the per-step training CSV file names are appended to.
dataset_path = "german_dataset/"
# Prefix for the per-step output directories; "Step<id>" is appended to it.
path_output_directory = "res_model/"

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Dataset

In [None]:
# Maximum number of tokens per source document fed to the encoder
# (the trailing 16384 is the alternative value left by the author).
encoder_max_length = 8192#16384
# Maximum number of tokens per target summary.
decoder_max_length = 512
# Used both as the dataset.map batch size and as the per-device training batch size.
batch_size = 1 # only for tests

Preprocessing

In [None]:
#https://huggingface.co/allenai/led-large-16384
#https://huggingface.co/transformers/v3.0.2/notebooks.html
def process_data_to_model_inputs(batch):
    """Tokenize a batch of documents/summaries into LED training features.

    Reads the module-level ``tokenizer``, ``encoder_max_length`` and
    ``decoder_max_length``.

    Args:
        batch: mapping with a "document" and a "summary" entry, each a list of
            strings of the same length (the batched form used by ``Dataset.map``).

    Returns:
        A new dict holding every field produced by the tokenizer for the documents
        (e.g. "input_ids", "attention_mask") plus:

        * "global_attention_mask": one list per example, 1 at position 0 and 0
          everywhere else, so only the first token gets global attention.
        * "labels": the tokenized summaries with every pad token replaced by -100
          so that padding is ignored by the loss.
    """
    source, target = batch["document"], batch["summary"]

    inputs = tokenizer(
        source, padding="max_length", truncation=True, max_length=encoder_max_length
    )
    outputs = tokenizer(
        target, padding="max_length", truncation=True, max_length=decoder_max_length
    )

    # Start from the tokenizer output and add the extra columns to that same dict.
    # The earlier version attached global_attention_mask to `batch` and then rebound
    # `batch` to a fresh dict, which silently dropped the mask.
    features = {key: value for key, value in inputs.items()}

    # Build one independent row per example (no aliased inner lists).
    features["global_attention_mask"] = [
        [1 if position == 0 else 0 for position in range(len(ids))]
        for ids in features["input_ids"]
    ]

    # Ignore padding in the loss.
    pad_id = tokenizer.pad_token_id
    features["labels"] = [
        [-100 if token == pad_id else token for token in label_ids]
        for label_ids in outputs["input_ids"]
    ]
    return features

def generate_summary(input_text, model, tokenizer):
    """Generate summaries for ``input_text`` with ``model``.

    Reads the module-level ``encoder_max_length`` for padding/truncation.

    Args:
        input_text: text (or list of texts) passed to ``tokenizer``.
        model: model exposing ``.device`` and ``.generate(...)``; generation
            settings are whatever the model itself is configured with.
        tokenizer: tokenizer used to encode the input and decode the output.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    # Follow the model's own device instead of assuming "cuda", so the function
    # also works on CPU or when the model sits on a non-default GPU.
    target_device = model.device

    encoded = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    # Global attention on the first token only; zeros_like keeps device and dtype.
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        global_attention_mask=global_attention_mask,
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


In [None]:
def train_and_evaluate(id, model, tokenizer, dataset):
    """Fine-tune ``model`` on ``dataset`` and save it; no evaluation is performed.

    The dataset is tokenized with ``process_data_to_model_inputs`` (all original
    columns are removed), formatted as torch tensors and handed to a
    ``Seq2SeqTrainer``. The trained model is written to
    ``path_output_directory + "Step" + str(id)``.

    Args:
        id: step identifier; only used to build the output directory name.
        model: model to fine-tune.
        tokenizer: tokenizer passed to the data collator and the trainer.
        dataset: dataset to train on; it must provide the columns that
            ``process_data_to_model_inputs`` reads.

    Returns:
        None.
    """
    step_output_dir = path_output_directory + "Step" + str(id)

    tokenized_dataset = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=dataset.column_names,
    )
    # Every column listed here must be produced by the mapping function above.
    tokenized_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
    )

    # Training only: evaluation is switched off.
    trainer_args = Seq2SeqTrainingArguments(
        output_dir=step_output_dir,
        do_train=True,
        evaluation_strategy="no",
        per_device_train_batch_size=batch_size,
        fp16=True,  # 16-bit mixed precision instead of 32-bit training
        logging_steps=100,  # log every 100 update steps
        save_steps=1000,  # checkpoint every 1000 update steps
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        logging_dir="logs",
        save_total_limit=2,  # keep at most two checkpoints on disk
        optim="adamw_torch",
        gradient_accumulation_steps=4,  # accumulate 4 batches per optimizer update
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=trainer_args,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_model(step_output_dir)

    # Validation data is currently not loaded; the author left this as a placeholder:
    # val_dataset = load_dataset('csv', data_files=val_path, delimiter=";", split="validation")

In [None]:
# The tokenizer is identical for every step, so load it once instead of once per iteration.
# It must remain a notebook-level name: process_data_to_model_inputs reads `tokenizer`
# as a global.
tokenizer = AutoTokenizer.from_pretrained(model_or_path)

# `step` instead of `id`, so the builtin id() is not shadowed for the rest of the notebook.
for step in range(1, 6):
    train_path = f"{dataset_path}German_Train_hDe_to_En_Step_{step}.csv"
    dataset = load_dataset('csv', data_files=train_path, delimiter=";", split="train")

    # Every step starts again from the same pretrained checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_or_path, use_cache=False)

    # Generation hyperparameters (stored on the model config).
    model.config.num_beams = 4
    model.config.max_length = 512
    model.config.min_length = 256
    model.config.length_penalty = 2.0
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3

    train_and_evaluate(step, model, tokenizer, dataset)
    print(step, "is trained.")



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4ac4fe9288726c90/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4ac4fe9288726c90/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

Some weights of LEDForConditionalGeneration were not initialized from the model checkpoint at pszemraj/led-large-book-summary and are newly initialized: ['led.encoder.embed_tokens.weight', 'led.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

  0%|          | 0/263 [00:00<?, ?ba/s]

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 263
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 195
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.3755




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/models/Ksum/LED8K_large/Step1
Configuration saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step1/config.json
Model weights saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step1/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step1/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step1/special_tokens_map.json


1 is trained.




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-7a618b72ba25b17e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-7a618b72ba25b17e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--led-large-book-summary/snapshots/38be53ce6e01a1536f12b318c8308cca23d78b05/config.json
Model config LEDConfig {
  "_name_or_path": "pszemraj/led-large-book-summary",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  

  0%|          | 0/263 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 263
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 195
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.3815




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/models/Ksum/LED8K_large/Step2
Configuration saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step2/config.json
Model weights saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step2/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step2/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step2/special_tokens_map.json


2 is trained.




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4287aeb36e618a4b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4287aeb36e618a4b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--led-large-book-summary/snapshots/38be53ce6e01a1536f12b318c8308cca23d78b05/config.json
Model config LEDConfig {
  "_name_or_path": "pszemraj/led-large-book-summary",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  

  0%|          | 0/263 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 263
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 195
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.396




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/models/Ksum/LED8K_large/Step3
Configuration saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step3/config.json
Model weights saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step3/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step3/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step3/special_tokens_map.json


3 is trained.




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4d8e4eb3ec0a787c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4d8e4eb3ec0a787c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--led-large-book-summary/snapshots/38be53ce6e01a1536f12b318c8308cca23d78b05/config.json
Model config LEDConfig {
  "_name_or_path": "pszemraj/led-large-book-summary",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  

  0%|          | 0/263 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 263
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 195
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.3686




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/models/Ksum/LED8K_large/Step4
Configuration saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step4/config.json
Model weights saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step4/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step4/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step4/special_tokens_map.json


4 is trained.




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-98389588d6421483/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-98389588d6421483/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--led-large-book-summary/snapshots/38be53ce6e01a1536f12b318c8308cca23d78b05/config.json
Model config LEDConfig {
  "_name_or_path": "pszemraj/led-large-book-summary",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  

  0%|          | 0/264 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 264
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 198
You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,4.3862




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/models/Ksum/LED8K_large/Step5
Configuration saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step5/config.json
Model weights saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step5/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step5/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/Ksum/LED8K_large/Step5/special_tokens_map.json


5 is trained.


# Transformer

In [None]:
# NOTE(review): train_and_evaluate is defined as (id, model, tokenizer, dataset), but this
# call passes only three arguments, so it raises a TypeError as written. The training loop
# earlier in the notebook already calls it for steps 1-5 - confirm whether this cell is
# still needed and, if so, which step id it should use.
train_and_evaluate(model, tokenizer, dataset)

In [None]:
# Show which device the most recently trained model is on.
# `res_model` is never defined in this notebook (NameError); the training loop binds `model`.
print(model.device)

In [None]:
# Example: summarize a single text with the model left over from the last training step.
input_text = ""  # add test text
print("Generate Summary (Example):")
print(generate_summary(input_text=input_text, model=model, tokenizer=tokenizer))