<a href="https://colab.research.google.com/github/jihedouni/CLCT/blob/main/Models/LED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Oct 29 11:27:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    42W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report the runtime's total RAM and whether it qualifies as a high-RAM runtime
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9  # total memory converted to decimal gigabytes
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used here between a standard and a high-RAM runtime
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Google Colab only: mount the user's Google Drive under /content/drive
# (the google.colab module is not available outside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentencepiece
!pip install nltk
!pip install datasets
!pip install transformers

Import

In [None]:
from datasets import load_dataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
    BigBirdPegasusForConditionalGeneration
)
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt", quiet=True)

import gdown

#CSV 
import csv 
import re
import torch

In [None]:
# Fetch the shared Google Drive folder with the dataset into ./german_dataset
url = "https://drive.google.com/drive/folders/1nkNg5LZ_KNYM9kxlwAc1_Ek9H0zTEQVc"
gdown.download_folder(
    url,
    output="german_dataset",
)
# Directory prefix (with trailing slash) used to build the per-fold CSV paths
dataset_path = "german_dataset/"

In [None]:
#model
# Checkpoint and hyperparameters shared by the cells below.
model_or_path = "pszemraj/bigbird-pegasus-large-K-booksum"  # Hugging Face model id (or a local path)
path_output_directory = "res_model/"  # root directory for checkpoints and the saved per-fold models
encoder_max_length = 4096  # maximum number of input (document) tokens
decoder_max_length = 512  # intended maximum number of target (summary) tokens
batch_size = 1 # only for tests
train_epochs = 1  # number of passes over the training data per fold
learning_rate = 1e-4  # learning rate intended for fine-tuning
seed = 42  # random seed intended for reproducibility

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# A task prefix is prepended to every input only when the model name contains "t5"
prefix = "summarize: " if "t5" in model_or_path else ""

print(prefix)

# Dataset

Example

In [None]:
 # Example files for fold 1 (Step_1): the training CSV and the validation ("Val") CSV.
 # Note that test_path points at the validation file of the fold.
 train_path = dataset_path + "German_Train_hDe_to_En_Step_1.csv"
 test_path = dataset_path + "German_Val_hDe_to_En_Step_1.csv"

In [None]:
# Training split of fold 1: load the ';'-separated CSV and show the row(s) with id 15
df = pd.read_csv(train_path, sep=";")
print(df.loc[df["id"] == 15])

In [None]:
# Validation split of fold 1 (test_path): load the ';'-separated CSV and show the row(s) with id 7
df = pd.read_csv(test_path, sep=";")
print(df.loc[df["id"] == 7])

Preprocessing

In [None]:
#https://huggingface.co/transformers/v3.0.2/notebooks.html
#https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb#scrollTo=545PP3o8IrJV
#https://huggingface.co/pszemraj/bigbird-pegasus-large-K-booksum
def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Relies on the notebook-level names `tokenizer`, `prefix`,
    `encoder_max_length` and `decoder_max_length`.

    Args:
        examples: batch mapping with a "document" and a "summary" entry, each a
            list of strings (the form a batched `Dataset.map` call passes in).

    Returns:
        The tokenizer output for the (prefixed) documents, with an extra
        "labels" entry holding the token ids of the summaries.
    """
    inputs = [prefix + doc for doc in examples["document"]]
    # Truncate to the limits from the config cell; the previously referenced
    # max_input_length / max_target_length were never defined in this notebook.
    model_inputs = tokenizer(inputs, max_length=encoder_max_length, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=decoder_max_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def generate_summary(input_text, model, tokenizer):
    """Generate a summary for `input_text` with `model`.

    Args:
        input_text: a string (or list of strings) to summarize.
        model: a seq2seq model exposing `generate` and `device`.
        tokenizer: the tokenizer matching `model`.

    Returns:
        A list of decoded summaries, one per input sequence, with special
        tokens removed.
    """
    inputs = tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Put the tensors wherever the model lives instead of assuming "cuda",
    # so the function also works on a CPU-only runtime.
    target_device = model.device
    input_ids = inputs.input_ids.to(target_device)
    attention_mask = inputs.attention_mask.to(target_device)
    # The length cap used to be assigned to a local and never passed on;
    # hand the configured decoder limit to generate() explicitly.
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=decoder_max_length,
    )
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str


# Transformer

In [None]:
def finetune_model(id, model, tokenizer, dataset): 
  """Fine-tune `model` on one fold and save the result to disk.

  Checkpoints go to `<path_output_directory>checkpoints/Step<id>` and the final
  model to `<path_output_directory>Step<id>`.

  Args:
    id: fold/step number, used only to name the output directories.
    model: the seq2seq model to train.
    tokenizer: the tokenizer matching `model`.
    dataset: raw dataset with "document" and "summary" text columns
      (NOTE(review): column names follow preprocess_function; the CSV schema
      is not visible here - confirm against the data files).
  """

  def tokenize_batch(examples):
    # Tokenize with the tokenizer passed to finetune_model (not a notebook
    # global) and truncate to the limits from the config cell.
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=encoder_max_length, truncation=True)
    labels = tokenizer(
      text_target=examples["summary"],
      max_length=decoder_max_length,
      truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

  # The previously referenced process_data_to_model_inputs does not exist in
  # this notebook; tokenize with the local helper instead.
  train_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset.column_names,
  )

  #training arguments
  # No evaluation strategy is passed: the library default is "no", which is
  # what was requested before, and omitting it avoids the renamed keyword.
  args = Seq2SeqTrainingArguments(
    output_dir=path_output_directory + "checkpoints/Step" + str(id),
    learning_rate=learning_rate,  # value from the config cell (was hard-coded to 2e-5)
    per_device_train_batch_size=batch_size, # The batch size per GPU/TPU core/CPU for training.
    logging_steps=100, # Number of update steps between two logs if logging_strategy="steps".
    save_steps=1000, # Number of updates steps before two checkpoint saves if save_strategy="steps".
    weight_decay=0.01,
    save_total_limit=3, # If a value is passed, will limit the total amount of checkpoints. 
    num_train_epochs=train_epochs,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    seed=seed,  # seed from the config cell
  )

  #training
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,  # the tokenized data, not the raw text dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
  )

  trainer.train()
  trainer.save_model(path_output_directory + "Step" + str(id))

In [None]:
# Fine-tune one model per fold (Step_1 .. Step_5).
# The tokenizer is identical for every fold, so it is loaded once up front.
tokenizer = AutoTokenizer.from_pretrained(model_or_path)

# `fold` instead of `id`: a module-level loop variable named `id` would shadow
# the builtin for every later cell in the kernel.
for fold in range(1, 6):
  #fold i training dataset
  train_path = dataset_path + "German_Train_hDe_to_En_Step_" + str(fold) + ".csv"
  dataset = load_dataset('csv', data_files=train_path, delimiter=";", split="train")
  #load model: every fold starts again from the pretrained checkpoint
  model = BigBirdPegasusForConditionalGeneration.from_pretrained(model_or_path, use_cache=False)
  #finetuning
  finetune_model(fold, model, tokenizer, dataset)
  print(fold, "is finetuned.")

Generation

In [None]:
# Text to summarize; empty by default, so fill it in before running this cell
input_text = "" #add test text 
#example: generate Summary using fold 5
# (`model` and `tokenizer` are whatever the fine-tuning loop assigned last)
print("Generate Summary (Example):")
print(generate_summary(input_text, model, tokenizer))