# Installation

In [1]:
!pip install -U adapter-transformers
!pip install datasets
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Device

In [2]:
!nvidia-smi

Thu Mar  9 21:49:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0    30W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch

# Stop immediately, with a readable message, when the runtime has no CUDA device.
assert torch.cuda.is_available(), "No CUDA device found: switch the runtime to a GPU before running this notebook."

# Model

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# A tokenizer has no weights to place on a device, so `device_map` is only
# passed to the model; on the tokenizer it was a stray, unused keyword.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2", device_map="auto")
# Reuse the end-of-sequence token as the padding token, consistently on the
# model config and on the tokenizer.
model.config.pad_token_id = model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [5]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset folder and the adapter
# output directories used later in this notebook live under that mount point.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
import os

# Folder on the mounted Drive that is scanned for the dataset CSV files.
data_path = "/content/gdrive/MyDrive/Thesis/datasets/monolingual Leichte Sprache"
# Fail early, naming the path, if Drive is not mounted or the folder was moved.
assert os.path.exists(data_path), f"Dataset directory not found: {data_path}"

In [7]:
def concat_datasets(
    path=data_path,
    columns=None
):
    '''
        Read every CSV file directly inside `path` and stack the selected columns.

        path: directory searched (non-recursively) for `*.csv` files.
        columns: list of column names kept from each file; defaults to ["phrase"].

        Returns a pandas DataFrame holding the selected columns of all files,
        concatenated in sorted file-name order, with rows that contain missing
        values removed.

        Raises ValueError when `path` contains no CSV file.
    '''
    columns = ["phrase"] if columns is None else columns
    # glob yields files in arbitrary, filesystem-dependent order. Sorting keeps
    # the row order stable, which the seeded train/test split relies on to be
    # reproducible between runs and machines.
    files = sorted(glob.glob(f"{path}/*.csv"))
    if not files:
        # Same exception type pd.concat would raise on an empty input, but with
        # a message that names the directory instead of "No objects to concatenate".
        raise ValueError(f"No CSV files found in {path!r}")
    return pd.concat((pd.read_csv(file)[columns] for file in files)).dropna()

In [8]:
def group_texts(examples, block_size=50):
    '''
        Pack a batch of tokenized examples into fixed-length blocks.

        examples: mapping from column name to a list of token lists; it must
            contain the key "input_ids".
        block_size: number of tokens per output block.

        Every column is flattened into one long token stream and cut into
        consecutive chunks of `block_size` tokens. The trailing remainder that
        does not fill a whole block is dropped (the cut-off is computed from
        the length of the first column). A "labels" column is added as a copy
        of the "input_ids" blocks.

        Returns a dict with the same keys as `examples` plus "labels".
    '''
    from itertools import chain

    # Flatten in linear time. `sum(list_of_lists, [])` re-copies the growing
    # accumulator on every addition and is quadratic in the number of tokens.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
from datasets import Dataset
import glob
import pandas as pd

# Load the selected text column from the CSV files into a single DataFrame.
dataset_df = concat_datasets()

dataset = Dataset.from_pandas(dataset_df, preserve_index=False)
# Remember the raw column names so they can be dropped after tokenization.
column_names = dataset.column_names

# Hold out 10% of the rows as the "test" split; the fixed seed keeps the
# shuffle repeatable for a given row order.
dataset = dataset.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=40
)

# Tokenize the "phrase" column in batches; the original columns are removed
# from the result.
dataset = dataset.map(
    lambda batch: tokenizer(batch["phrase"]),
    remove_columns=column_names,
    batched=True
)

# Pack each batch of token lists into fixed-length blocks and add "labels"
# (see group_texts).
dataset = dataset.map(group_texts, batched=True)

# Return these three columns as torch tensors when the dataset is indexed.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/392999 [00:00<?, ? examples/s]

Map:   0%|          | 0/43667 [00:00<?, ? examples/s]

Map:   0%|          | 0/392999 [00:00<?, ? examples/s]

Map:   0%|          | 0/43667 [00:00<?, ? examples/s]

# Adapter

In [10]:
from transformers import AdapterConfig

# Name under which the adapter is registered on the model and saved to disk.
adapter_name = "Adapter_Bottleneck"
# Configuration of the adapter that is added to the model below.
adapter_config = AdapterConfig(
    mh_adapter=True,
    output_adapter=True,
    reduction_factor=16,
    non_linearity="gelu"
)

# Look the name up in the model's adapter registry. The previous check was a
# substring test against the text returned by `adapter_summary()`, which also
# matches any other text in that summary that happens to contain the name.
# The guard keeps the cell safe to re-run: the adapter is only added once.
if adapter_name not in model.config.adapters:
    model.add_adapter(adapter_name=adapter_name, config=adapter_config)

assert adapter_name in model.config.adapters, f"Adapter {adapter_name!r} was not added to the model"

# Training

In [11]:
from transformers import AdapterTrainer, TrainingArguments
from torch import nn

# Put the model into adapter-training mode for the adapter registered under
# `adapter_name`. The name was previously repeated as a string literal here,
# which would silently drift if `adapter_name` were changed.
model.train_adapter(adapter_setup=adapter_name)

training_args = TrainingArguments(
    # Checkpoints go to Drive, next to the exported adapter.
    output_dir=f"/content/gdrive/MyDrive/Thesis/adapters/{adapter_name}/checkpoints",
    do_train=True,
    # Keep every dataset column: the custom loss reads "input_ids" and "labels".
    remove_unused_columns=False,
    label_smoothing_factor=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    # Effective train batch size = 8 * 4 = 32 per device.
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    # Spelled out (same 500-step cadence as in the recorded training log):
    # with load_best_model_at_end=True, save_steps must stay a multiple of
    # eval_steps.
    eval_steps=500,
    save_steps=3000,
    save_total_limit=1,
    save_strategy="steps",
    evaluation_strategy="steps",
    overwrite_output_dir=True,
    load_best_model_at_end=True
)

In [12]:
'''
URL: https://github.com/yxuansu/SimCTG
'''
def compute_valid_token_num(valid_len_list):
    '''
        Return the sum of n * (n - 1) over every length n in valid_len_list,
        i.e. the number of ordered pairs of distinct positions per sequence,
        added up over all sequences. An empty list gives 0.
    '''
    return sum(length * (length - 1) for length in valid_len_list)

def build_mask_matrix(seqlen, valid_len_list, prefix_len = 0):
    '''
        Build one seqlen x seqlen loss mask per sequence in the batch.

        seqlen: padded sequence length.
        valid_len_list: number of non-padding tokens of each sequence.
        prefix_len: length of the prefix for which no CL loss is computed;
            the top-left prefix_len x prefix_len corner is zeroed.

        A mask entry is 1 for a pair of two different valid positions and 0 on
        the diagonal and wherever a row or column lies at or beyond the valid
        length. For seqlen 4:
          valid length 4            valid length 3
             [0., 1., 1., 1.]          [0., 1., 1., 0.]
             [1., 0., 1., 1.]          [1., 0., 1., 0.]
             [1., 1., 0., 1.]          [1., 1., 0., 0.]
             [1., 1., 1., 0.]          [0., 0., 0., 0.]

        Returns a float tensor of size bsz x seqlen x seqlen.
    '''
    # All ones except the diagonal: a token is never contrasted with itself.
    off_diagonal = torch.ones(seqlen, seqlen) - torch.eye(seqlen, seqlen)
    off_diagonal = off_diagonal.type(torch.FloatTensor)
    batch_size = len(valid_len_list)
    per_sequence_masks = []
    for valid_len in valid_len_list:
        mask = off_diagonal.clone()
        # Blank out the columns, then the rows, that belong to padding.
        mask[:, valid_len:] = 0.
        mask[valid_len:, :] = 0.
        if prefix_len > 0:
            mask[:prefix_len, :prefix_len] = 0.
        per_sequence_masks.append(mask)
    stacked = torch.stack(per_sequence_masks, dim = 0)
    assert stacked.size() == torch.Size([batch_size, seqlen, seqlen])
    return stacked
        
def contrastive_loss(margin, score_matrix, input_ids, pad_token_id, prefix_len=0):
    '''
       Token-level margin-based contrastive loss.

       margin: predefined margin to push similarity score away
       score_matrix: bsz x seqlen x seqlen similarity scores
       input_ids: bsz x seqlen
       pad_token_id: indicating which tokens are padding token
       prefix_len: forwarded to build_mask_matrix (prefix excluded from the loss)

       For every pair (i, j) kept by the loss mask the penalty is
       relu(margin - (score[i, i] - score[i, j])); the result is the sum of
       the penalties divided by the number of kept pairs.

       Returns a scalar tensor on the device of score_matrix. It is 0 when the
       mask keeps no pair (e.g. every sequence has fewer than two non-padding
       tokens) instead of the NaN a 0 / 0 division would produce.
    '''
    bsz, seqlen, _ = score_matrix.size()
    gold_score = torch.diagonal(score_matrix, offset=0, dim1=1, dim2=2) # bsz x seqlen
    gold_score = torch.unsqueeze(gold_score, -1)
    assert gold_score.size() == torch.Size([bsz, seqlen, 1])
    difference_matrix = gold_score - score_matrix
    assert difference_matrix.size() == torch.Size([bsz, seqlen, seqlen])
    loss_matrix = torch.nn.functional.relu(margin - difference_matrix) # bsz x seqlen x seqlen

    # 1.0 for real tokens, 0.0 for padding. Built directly on the device of the
    # scores; the previous `.type(torch.FloatTensor)` copied the tensor to the
    # CPU first and then moved it back (twice).
    device = score_matrix.device
    input_mask = input_ids.ne(pad_token_id).to(device=device, dtype=torch.float32)

    valid_len_list = [int(item) for item in torch.sum(input_mask, dim = -1).tolist()]
    loss_mask = build_mask_matrix(seqlen, valid_len_list, prefix_len).to(device)
    masked_loss_matrix = loss_matrix * loss_mask

    loss_matrix = torch.sum(masked_loss_matrix, dim = -1)
    assert loss_matrix.size() == input_ids.size()
    loss_matrix = loss_matrix * input_mask
    # The mask sum is a count of kept pairs: either 0 or >= 1. Clamping only
    # changes the 0 case, where the numerator is 0 as well.
    cl_loss = torch.sum(loss_matrix) / torch.sum(loss_mask).clamp(min=1.0)
    return cl_loss

In [13]:
class CustomTrainer(AdapterTrainer):
    '''
        AdapterTrainer whose loss is the language-modelling loss plus a
        token-level contrastive loss on the last hidden states.

        margin: margin handed to contrastive_loss.
        **kwargs: any further keyword argument is forwarded unchanged to
            AdapterTrainer (e.g. data_collator, callbacks).
    '''
    def __init__(
        self,
        model,
        args,
        tokenizer,
        train_dataset,
        eval_dataset,
        margin=0.5,
        **kwargs
    ):
        super().__init__(
            model=model,
            args=args,
            tokenizer=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            **kwargs
        )
        self.margin = margin

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
    ):
        '''
            Return mle_loss + cl_loss for one batch.

            inputs: batch dict; "input_ids" (bsz x seqlen) and "labels" are read
                here and the whole dict is passed to the model.
            return_outputs: when True, return (loss, outputs) instead of loss.
        '''
        input_ids = inputs.get("input_ids")
        bsz, seqlen = input_ids.size()

        labels = inputs.get("labels")
        outputs = model(**inputs, output_hidden_states=True)
        logits = outputs.get("logits")
        assert logits.size() == torch.Size([bsz, seqlen, model.config.vocab_size])

        if self.label_smoother is not None:
            mle_loss = self.label_smoother(outputs, labels, shift_labels=True)
        else:
            # With label_smoothing_factor == 0 the Trainer leaves
            # `label_smoother` as None and calling it would raise. The labels
            # were passed to the model above, so use the loss it computed.
            mle_loss = outputs["loss"]

        # Contrastive Loss
        last_hidden_states = outputs.hidden_states[-1]
        assert last_hidden_states.size() == torch.Size([bsz, seqlen, model.config.hidden_size])

        # Unit-normalise each token vector so the pairwise dot products below
        # are cosine similarities.
        norm_rep = last_hidden_states / last_hidden_states.norm(dim=2, keepdim=True)
        cosine_scores = torch.matmul(norm_rep, norm_rep.transpose(1,2))
        assert cosine_scores.size() == torch.Size([bsz, seqlen, seqlen])

        cl_loss = contrastive_loss(
            self.margin,
            cosine_scores,
            input_ids,
            model.config.pad_token_id,
            prefix_len=0
        )

        loss = mle_loss + cl_loss

        return (loss, outputs) if return_outputs else loss

In [14]:
# Trainer with the combined loss (see CustomTrainer.compute_loss); trains on
# the "train" split and evaluates on the "test" split.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    margin=0.5
)

In [31]:
# Stock AdapterTrainer on the same model, arguments and data, i.e. without the
# loss override of CustomTrainer.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours and `orig_trainer` is not used by the training cell that follows;
# confirm whether it is still needed or is a leftover experiment.
orig_trainer = AdapterTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [15]:
# Run training with the custom trainer.
trainer.train()

# Export the trained adapter to Drive, in a "model" folder separate from the
# training checkpoints.
model.save_adapter(
    save_directory=f"/content/gdrive/MyDrive/Thesis/adapters/{adapter_name}/model",
    adapter_name=adapter_name
)

***** Running training *****
  Num examples = 96287
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 15045
  Number of trainable parameters = 1789056
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,5.2358,4.91959
1000,4.8724,4.827098
1500,4.8068,4.780792
2000,4.7689,4.752088
2500,4.7333,4.729804
3000,4.7289,4.711168
3500,4.6955,4.698153
4000,4.683,4.686267
4500,4.6684,4.677396
5000,4.6651,4.668129


***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10689
  Batch size = 8
Saving model checkpoint to /content/gdrive/MyDrive/Thesis/adapters/Adapter_Bottleneck/checkpoints/checkpoint-3000
Configuration saved in /content/gdrive/MyDrive/Thesis/adapters/Adapter_Bottleneck/checkpoints/checkpoint-3000/Adapter_Bottleneck/adapter_config.json
Module weights saved in /content/gdrive/MyDrive/Thesis/adapters/Adapter_Bottleneck/checkpoints/checkpoint-3000/Adapter_Bottleneck/pytorch_adapter.bin
Configuration saved in /content/gdrive/MyDrive/Thesis/adapters/Adapter_Bottleneck/checkpoints/checkpoint-3000/Adapter_Bottleneck/head_config.json
Module weights 