In [None]:
# pip install -U transformers datasets
#import os
#os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import random, math
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.optim import AdamW
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModel, BertConfig, BertLMHeadModel, EncoderDecoderModel
)
from transformers import BertTokenizer

# ---- config
# Every tunable value of the notebook lives here; later cells only read these names.
SEED = 0                                    # seeds both `random` and `torch` below
SRC_CKPT = "bert-base-uncased"              # encoder (EN)
TGT_CKPT = "bert-base-multilingual-cased"   # decoder (FR-capable)
MAX_SRC_LEN = 128                           # max_length given to the source tokenizer
MAX_TGT_LEN = 128                           # max_length for target-side token sequences
BATCH_SIZE = 8                              # batch size of the training DataLoader
EPOCHS = 12                                 # raise to 20–30 if not overfitting
LR = 5e-5                                   # learning rate passed to AdamW
traindl_opt=1                               # `option` forwarded to compute_data_dl (0: pre-tokenized TensorDataset, 1: CustomDataset + collate)

# Seed the RNGs before any shuffling or weight initialisation happens.
random.seed(SEED)
torch.manual_seed(SEED)
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- tokenizers
# One tokenizer per side: tok_src encodes the source text, tok_tgt encodes and decodes the target text.
tok_src = BertTokenizer.from_pretrained(SRC_CKPT)
tok_tgt = BertTokenizer.from_pretrained(TGT_CKPT)
# Special ids are all taken from the *target* tokenizer: [PAD] pads, [SEP] acts as EOS, [CLS] acts as BOS.
PAD_ID = tok_tgt.pad_token_id
EOS_ID = tok_tgt.sep_token_id
BOS_ID = tok_tgt.cls_token_id

# ---- model: BERT encoder + BERT LM-head decoder with cross-attn

# Reference: alternative construction through an explicit decoder config (kept for comparison, not executed).
# dec_cfg = BertConfig.from_pretrained(TGT_CKPT, is_decoder=True, add_cross_attention=True)
# encoder=AutoModel.from_pretrained(SRC_CKPT),
# decoder=BertLMHeadModel.from_pretrained(TGT_CKPT, config=dec_cfg),

# Encoder: pretrained source-language BERT.
# NOTE(review): BOS_ID / EOS_ID come from the target tokenizer but are written into the
# encoder's config here — confirm this is intended.
encoder = AutoModel.from_pretrained(SRC_CKPT,
                                    bos_token_id=BOS_ID,
                                    eos_token_id=EOS_ID
                                    )

# Decoder: pretrained target-language BERT with an LM head, switched to decoder mode with
# cross-attention. The cross-attention weights do not exist in the checkpoint and are
# reported as "newly initialized" in the load warning shown in the cell output.
decoder=BertLMHeadModel.from_pretrained(TGT_CKPT,
                                        add_cross_attention=True,
                                        is_decoder=True,
                                        bos_token_id=BOS_ID,
                                        eos_token_id=EOS_ID
                                        )

# Combine both halves into one seq2seq model and move it to the selected device.
model = EncoderDecoderModel(encoder=encoder,
                           decoder=decoder
                           ).to(device)




# required special ids for training (right-shift) and decode
model.config.decoder_start_token_id = BOS_ID
model.config.eos_token_id = EOS_ID
model.config.pad_token_id = PAD_ID
# Encoder and decoder come from different checkpoints, so their weights are not tied.
model.config.tie_encoder_decoder = False
# Top-level vocab size mirrors the decoder's vocabulary.
model.config.vocab_size = model.config.decoder.vocab_size

# ---- tiny EN–FR set: 100 sentence pairs sampled from OPUS Books
# notes: you can replace this with your own parallel lists
ds = load_dataset("Helsinki-NLP/opus_books", "en-fr", split="train")

# Only the first 2000 rows are candidates; shuffle them (seeded above) and keep 100.
first_rows = ds.select(range(2000))
pairs = [(row["translation"]["en"], row["translation"]["fr"]) for row in first_rows]
random.shuffle(pairs)
del pairs[100:]  # exactly 100 pairs remain

# Split the pairs into two aligned tuples: English sources and French targets.
src_list, tgt_list = zip(*pairs)






####
def tokenize_src(Batchoftext):
    """Tokenize a batch of source sentences with the source BERT tokenizer.

    Sentences longer than ``MAX_SRC_LEN`` tokens are truncated, so over-long
    inputs do not raise; shorter ones are padded to the longest sentence of
    the batch.

    Parameters
    ----------
    Batchoftext : sequence of str
        Source-language sentences.

    Returns
    -------
    The tokenizer output holding PyTorch tensors; callers read its
    ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    # Tokenizer parameter reference:
    # https://medium.com/axinc-ai/how-tokenizer-parameters-impact-transformers-behavior-8be8030637c6
    encode_options = {
        'max_length': MAX_SRC_LEN,
        'padding': True,          # pad to the longest sentence of the batch
        'truncation': True,
        'return_tensors': 'pt',   # pytorch tensors
    }
    return tok_src(Batchoftext, **encode_options)

def tokenize_tgt(Batchoftext):
    """Build decoder label ids for a batch of target sentences.

    Each sentence is tokenized without special tokens (no BOS is needed: the
    decoder start token is configured on the model), truncated to
    ``MAX_TGT_LEN`` tokens and followed by ``EOS_ID`` when there is still
    room for it. A sentence that fills the whole length therefore carries
    no EOS. All remaining positions hold ``-100`` so that padding does not
    contribute to the training loss.

    Parameters
    ----------
    Batchoftext : str or sequence of str
        Target-language sentence(s). A single string is treated as a batch
        of one sentence.

    Returns
    -------
    torch.Tensor
        ``int64`` tensor of shape ``(batch, MAX_TGT_LEN)``.
    """
    texts = [Batchoftext] if isinstance(Batchoftext, str) else list(Batchoftext)
    if not texts:
        # Nothing to tokenize: return an empty batch with the usual width.
        return torch.full((0, MAX_TGT_LEN), -100, dtype=torch.long)

    # No padding is requested here: rows are written straight into a tensor
    # pre-filled with -100, so there is no PAD to strip afterwards.
    token_lists = tok_tgt(texts,
                          max_length=MAX_TGT_LEN,   # target-side limit (was MAX_SRC_LEN)
                          truncation=True,
                          add_special_tokens=False
                          )['input_ids']

    labels = torch.full((len(token_lists), MAX_TGT_LEN), -100, dtype=torch.long)
    for row, ids in enumerate(token_lists):
        # Drop literal PAD ids coming from the text itself, as the labels never contain PAD.
        ids = [tok for tok in ids if tok != PAD_ID]
        if len(ids) < MAX_TGT_LEN:  # add EOS only if the max length is not reached
            ids.append(EOS_ID)
        labels[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)

    return labels



class CustomDataset(Dataset):
    """Map-style dataset of raw (source, target) sentence pairs.

    Items are returned as untokenized strings; tokenization is expected to
    happen later, e.g. in the ``collate_fn`` of the DataLoader.

    Parameters
    ----------
    src_list : sequence of str
        Source sentences.
    tgt_list : sequence of str
        Target sentences, aligned index-by-index with ``src_list``.

    Raises
    ------
    ValueError
        If the two sequences do not have the same length.
    """

    def __init__(self,src_list,tgt_list):
        # Misaligned lists would silently pair sentences with the wrong translation.
        if len(src_list) != len(tgt_list):
            raise ValueError(
                f"src_list and tgt_list must have the same length, "
                f"got {len(src_list)} and {len(tgt_list)}"
            )
        self.src=src_list
        self.tgt=tgt_list

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src)

    def __getitem__(self, index):
        """Return the ``(source, target)`` pair stored at ``index``."""
        return self.src[index], self.tgt[index]
        

def collate(batch):
    """Turn a list of (source, target) sentence pairs into model inputs.

    Parameters
    ----------
    batch : iterable of (str, str)
        Raw sentence pairs, as yielded by ``CustomDataset``.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask, labels)``: the two source tensors from
        ``tokenize_src`` and the label tensor from ``tokenize_tgt``.
    """
    src_texts, tgt_texts = zip(*batch)
    encoded_src = tokenize_src(src_texts)
    labels = tokenize_tgt(tgt_texts)
    return encoded_src['input_ids'], encoded_src['attention_mask'], labels




#### The two approaches below should produce equivalent batches, but the first one (option 0) currently raises errors -- TODO: investigate

def compute_data_dl(src_list,tgt_list,option=None):
    """Build the training DataLoader with one of two strategies.

    Parameters
    ----------
    src_list, tgt_list : sequence of str
        Aligned source and target sentences.
    option : {None, 0, 1}
        * ``0`` -- tokenize the whole corpus up front and wrap the resulting
          tensors in a ``TensorDataset``.
        * ``1`` (also used when ``None``) -- keep the raw sentences in a
          ``CustomDataset`` and tokenize batch by batch through ``collate``.

    Returns
    -------
    torch.utils.data.DataLoader
        Shuffled loader yielding ``(input_ids, attention_mask, labels)``.

    Raises
    ------
    ValueError
        If ``option`` is not one of the supported values.
    """
    if option is None:
        option = 1

    # Reject unknown options up front instead of failing later on an unbound local.
    if option not in (0, 1):
        raise ValueError(f"option must be None, 0 or 1, got {option!r}")

    if option == 0:
        encoded_src = tokenize_src(src_list)
        train_dataset = TensorDataset(encoded_src['input_ids'],      # int64 token ids
                                      encoded_src['attention_mask'],
                                      tokenize_tgt(tgt_list)
                                      )
        return DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

    # option == 1: tokenization is deferred to collate, one batch at a time
    return DataLoader(CustomDataset(src_list,tgt_list),
                      batch_size=BATCH_SIZE,
                      shuffle=True,
                      collate_fn=collate)

#####

# Build the training DataLoader with the strategy selected by traindl_opt.
train_dl=compute_data_dl(src_list,tgt_list,option=traindl_opt)



#################################
def translate_samples(texts, n=5):
    """Translate the first ``n`` source sentences with beam search.

    Parameters
    ----------
    texts : sequence of str
        Source sentences.
    n : int, default 5
        How many sentences, taken from the start of ``texts``, to translate.

    Returns
    -------
    list of str
        Decoded translations with special tokens removed, one per
        translated sentence. Empty when there is nothing to translate.
    """
    batch = list(texts[:n])
    if not batch:
        return []

    # The model lives on `device`, so the tokenized inputs must be moved there as well.
    X=tokenize_src(batch).to(device)

    out = model.generate( X["input_ids"],
                         attention_mask=X["attention_mask"],
                         num_beams=4,
                         max_new_tokens=64,
                         early_stopping=True,
                         decoder_start_token_id=BOS_ID,
                         eos_token_id=EOS_ID,
                         pad_token_id=PAD_ID,
                         #bad_words_ids=[[PAD_ID]],          # block PAD
                         repetition_penalty=1.1,            # mild
                         no_repeat_ngram_size=3             # optional hygiene
                        )

    return [tok_tgt.decode(o, skip_special_tokens=True) for o in out]




# train then test again
model.train()
opt = AdamW(model.parameters(), lr=LR)
steps = 0  # total number of optimizer steps taken

for epoch in range(EPOCHS):
    loss = 0.0  # running sum of the batch losses of this epoch (plain float)
    for X, mask, labels in train_dl:
        # The dataloader yields CPU tensors; move them next to the model.
        X, mask, labels = X.to(device), mask.to(device), labels.to(device)

        opt.zero_grad()
        out = model(input_ids=X, attention_mask=mask, labels=labels)
        out.loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        steps += 1
        # .item() detaches the value; summing the tensors themselves would keep
        # every batch's autograd graph referenced until the end of the epoch.
        loss += out.loss.item()
    Loss = loss / len(train_dl)  # mean batch loss over the epoch
    print(f"epoch {epoch+1}/{EPOCHS} done ; loss is {Loss}")


# Switch to inference mode (disables dropout) before generating translations.
model.eval()


k=5  # number of sentences to show
print("\n--- AFTER ---")
preds_after = translate_samples(src_list, n=k)
# Iterate over what is actually available: zip stops at the shortest sequence,
# so fewer than k sentences or predictions no longer raises an IndexError.
for en_text, fr_gold, fr_pred in zip(src_list, tgt_list, preds_after):
    print(f"EN: {en_text}")
    print(f"FR_gold: {fr_gold}")
    print(f"FR_pred: {fr_pred}")
    print("-")



Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bia

epoch 1/12 done ; loss is 7.759047508239746
epoch 2/12 done ; loss is 6.004733562469482
epoch 3/12 done ; loss is 4.981064319610596
epoch 4/12 done ; loss is 4.067440509796143
epoch 5/12 done ; loss is 3.366668224334717
epoch 6/12 done ; loss is 2.7335917949676514
epoch 7/12 done ; loss is 2.2331979274749756
epoch 8/12 done ; loss is 1.7983659505844116
epoch 9/12 done ; loss is 1.3819419145584106
epoch 10/12 done ; loss is 1.1016132831573486
epoch 11/12 done ; loss is 0.9002933502197266
epoch 12/12 done ; loss is 0.7752261757850647

--- AFTER ---
EN: As for me, I found myself obliged, the first time for months, to face alone a long Thursday evening - with the clear feeling that the old carriage had borne away my youth forever.
FR_gold: Quant à moi, je me trouvai, pour la première fois depuis de longs mois, seul en face d’une longue soirée de jeudi – avec l’impression que, dans cette vieille voiture, mon adolescence venait de s’en aller pour toujours.
FR_pred: Quant à moi, je me trouvai

In [13]:

# train then test again
# (continues training the current model with a fresh optimizer)
model.train()
opt = AdamW(model.parameters(), lr=LR)
steps = 0  # total number of optimizer steps taken

for epoch in range(EPOCHS):
    loss = 0.0  # running sum of the batch losses of this epoch (plain float)
    for X, mask, labels in train_dl:
        # The dataloader yields CPU tensors; move them next to the model.
        X, mask, labels = X.to(device), mask.to(device), labels.to(device)

        opt.zero_grad()
        out = model(input_ids=X, attention_mask=mask, labels=labels)
        out.loss.backward()
        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        steps += 1
        # .item() detaches the value; summing the tensors themselves would keep
        # every batch's autograd graph referenced until the end of the epoch.
        loss += out.loss.item()
    Loss = loss / len(train_dl)  # mean batch loss over the epoch
    print(f"epoch {epoch+1}/{EPOCHS} done ; loss is {Loss}")


# Switch to inference mode (disables dropout) before generating translations.
model.eval()


k=5  # number of sentences to show
print("\n--- AFTER ---")
preds_after = translate_samples(src_list, n=k)
# Iterate over what is actually available: zip stops at the shortest sequence,
# so fewer than k sentences or predictions no longer raises an IndexError.
for en_text, fr_gold, fr_pred in zip(src_list, tgt_list, preds_after):
    print(f"EN: {en_text}")
    print(f"FR_gold: {fr_gold}")
    print(f"FR_pred: {fr_pred}")
    print("-")

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


epoch 1/10 done ; loss is 1.0865552425384521
epoch 2/10 done ; loss is 0.967190682888031
epoch 3/10 done ; loss is 0.7017868757247925
epoch 4/10 done ; loss is 0.5001715421676636
epoch 5/10 done ; loss is 0.44252726435661316
epoch 6/10 done ; loss is 0.3681173026561737
epoch 7/10 done ; loss is 0.3237943649291992
epoch 8/10 done ; loss is 0.27196842432022095
epoch 9/10 done ; loss is 0.23038920760154724
epoch 10/10 done ; loss is 0.22900860011577606

--- AFTER ---
EN: As for me, I found myself obliged, the first time for months, to face alone a long Thursday evening - with the clear feeling that the old carriage had borne away my youth forever.
FR_gold: Quant à moi, je me trouvai, pour la première fois depuis de longs mois, seul en face d’une longue soirée de jeudi – avec l’impression que, dans cette vieille voiture, mon adolescence venait de s’en aller pour toujours.
FR_pred: Quant à moi, je me trouvai, pour la première fois depuis de longs mois, seul en face d une longue soirée de je

In [19]:
def translate_samples(texts, n=5):
    """Translate the first ``n`` source sentences with beam search.

    This redefinition tokenizes the sources directly and moves them to
    ``device`` before generation. The ``n=5`` default matches the earlier
    definition, so calls that omit ``n`` keep working after this cell runs.

    Parameters
    ----------
    texts : sequence of str
        Source sentences.
    n : int, default 5
        How many sentences, taken from the start of ``texts``, to translate.

    Returns
    -------
    list of str
        Decoded translations with special tokens removed, one per
        translated sentence. Empty when there is nothing to translate.
    """
    batch = list(texts[:n])
    if not batch:
        return []

    X = tok_src(batch,
            return_tensors="pt",
            #padding='max_length',
            padding=True,
            truncation=True,
            max_length=MAX_SRC_LEN
            ).to(device)  # inputs must sit on the same device as the model

    out = model.generate( X["input_ids"],
                         attention_mask=X["attention_mask"],
                         num_beams=4,
                         max_new_tokens=64,
                         early_stopping=True,
                         decoder_start_token_id=BOS_ID,
                         eos_token_id=EOS_ID,
                         pad_token_id=PAD_ID,
                         #bad_words_ids=[[PAD_ID]],          # block PAD
                         repetition_penalty=1.1,            # mild
                         no_repeat_ngram_size=3             # optional hygiene
                        )

    return [tok_tgt.decode(o, skip_special_tokens=True) for o in out]


k=15  # number of sentences to show
print("\n--- AFTER ---")
preds_after = translate_samples(src_list, n=k)
# Iterate over what is actually available: zip stops at the shortest sequence,
# so fewer than k sentences or predictions no longer raises an IndexError.
for en_text, fr_gold, fr_pred in zip(src_list, tgt_list, preds_after):
    print(f"EN: {en_text}")
    print(f"FR_gold: {fr_gold}")
    print(f"FR_pred: {fr_pred}")
    print("-")



--- AFTER ---
EN: As for me, I found myself obliged, the first time for months, to face alone a long Thursday evening - with the clear feeling that the old carriage had borne away my youth forever.
FR_gold: Quant à moi, je me trouvai, pour la première fois depuis de longs mois, seul en face d’une longue soirée de jeudi – avec l’impression que, dans cette vieille voiture, mon adolescence venait de s’en aller pour toujours.
FR_pred: Quant à moi, je me trouvai, pour la première fois depuis depuis de longs mois, seul en face d une longue soirée de jeudi avec l impression que, dans cette voiture, mon adolescence venait de s en aller pour toujours.
-
EN: No one asked him who Booby was.
FR_gold: Personne ne lui demanda qui était Ganache.
FR_pred: Personne ne lui demanda qui était Ganache.
-
EN: M. Seurel's here .. .'
FR_gold: M. Seurel est là…
FR_pred: M. Seurel est M..
-
EN: After the ball where everything was charming but feverish and mad, where he had himself so madly chased the tall Pier

In [1]:
# Sanity check: do the two dataloader constructions yield the same first batch?
# Requires src_list / tgt_list from the data-loading cell (run that cell first).
n=4
s=src_list[:n]
#print(s)
t=tgt_list[:n]

# collate expects a batch of (source, target) pairs, not a (sources, targets) tuple.
A,a,b=collate(list(zip(s, t)))

# Approach 1: raw sentences + collate.
# NOTE: this rebinds the global train_dl to a 4-sentence loader; rebuild it before training again.
train_dl=DataLoader(CustomDataset(s, t), batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)
# Transpose the list of batches into per-field tuples: A = input ids, a = masks, b = labels.
A,a,b=zip(*train_dl)


# Approach 2: tokenize everything up front and wrap the tensors in a TensorDataset.
z=tokenize_src(s)
train_input_ids=z['input_ids']
train_attention_masks=z['attention_mask']
train_labels_ids=tokenize_tgt(t)

train_dataset = TensorDataset(train_input_ids,  # int64 token ids
                              train_attention_masks,
                              train_labels_ids
                              )
# shuffle=False so that both loaders yield the sentences in the same order
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)


mytrain_dl=train_dataloader
myA,mya,myb=zip(*mytrain_dl)


# First batch of input ids from each approach, for visual comparison.
print(A[0])
print('..................')
print(myA[0])


#print(a[0])
print('..................')
#print(mya[0][:n])

#print(b[0][:n])
print('..................')
#print(myb[0][:n])

NameError: name 'src_list' is not defined

In [None]:
# Compare the first batch of the two dataloaders (A/a/b vs myA/mya/myb).
# Absolute differences are used: a signed sum can cancel out to 0 for unequal tensors.
labels_abs_diff = (b[0]-myb[0]).abs().sum()
print(f"labels abs diff: {labels_abs_diff.item()}")

# Dropout makes train-mode forward passes non-deterministic, so run both
# passes in eval mode without gradients, then restore the previous mode.
was_training = model.training
model.eval()
with torch.no_grad():
    q0=model(input_ids=A[0].to(device),attention_mask=a[0].to(device),labels=b[0].to(device))
    q1=model(input_ids=myA[0].to(device),attention_mask=mya[0].to(device),labels=myb[0].to(device))
model.train(was_training)

logits_abs_diff = (q0.logits-q1.logits).abs().sum()
print(f"logits abs diff: {logits_abs_diff.item()}")


mytrain_dl.dataset.__getitem__

In [None]:
# Reload the sentence pairs (same recipe as the setup cell: first 2000 rows, shuffle, keep 100).
ds = load_dataset("Helsinki-NLP/opus_books", "en-fr", split="train")
pairs = [(ex["translation"]["en"], ex["translation"]["fr"]) for ex in ds.select(range(2000))]
random.shuffle(pairs)
pairs = pairs[:100]  # exactly 100
src_list, tgt_list = zip(*pairs)

# Replace the targets with two short sentences to exercise the EOS handling below.
# WARNING: tgt_list (2 items) is no longer aligned with src_list (100 items);
# re-run the data-loading cell before training or evaluating again.
tgt_list=('Once upon a time','I want to be free')

# Target ids: NO BOS; tokenized once, padded to MAX_TGT_LEN. Both implementations
# below append EOS to this same tensor (no -100 masking is applied in this cell).
Y = tok_tgt(list(tgt_list),
            padding='max_length',
            #padding=True,
            truncation=True,
            max_length=MAX_TGT_LEN,
            add_special_tokens=False,
            return_tensors="pt"
            )["input_ids"]


# Implementation 1 (python lists): append EOS before padding if room
Y_fixed = torch.full_like(Y, PAD_ID)
for i in range(Y.size(0)):
    toks = [tok for tok in Y[i].tolist() if tok != PAD_ID]
    if len(toks) < MAX_TGT_LEN:
        toks = toks + [EOS_ID]
    toks = toks[:MAX_TGT_LEN]
    Y_fixed[i, :len(toks)] = torch.tensor(toks, dtype=Y_fixed.dtype)


# Implementation 2 (tensor ops): same result expected
Y_fixed1 = torch.full_like(Y, PAD_ID)
for i in range(Y.shape[0]):
    y=Y[i,:]
    y=y[y!=PAD_ID]
    if len(y)<MAX_TGT_LEN:
        y=torch.cat((y,torch.tensor([EOS_ID])),dim=0)
    Y_fixed1[i,:len(y)]=y


#print(Y_fixed[0,:])
#print(Y_fixed1[0,:])

# Report every row where the two implementations disagree.
# torch.diff takes differences between *adjacent elements*, so summing it
# telescopes to last-minus-first and misses most mismatches; compare the
# rows directly instead.
for i in range(Y.shape[0]):
    if not torch.equal(Y_fixed[i,:], Y_fixed1[i,:]):
        print(i)
        print(Y_fixed[i,:]-Y_fixed1[i,:])


In [None]:
# Rebuild the training loader over the current sentence lists.
# `Pairs` is not defined in this notebook; the dataset class is CustomDataset,
# which stores the target sentences in `.tgt`.
train_dl = DataLoader(CustomDataset(src_list, tgt_list), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)

train_dl.dataset.tgt
#train_dl=train_dataloader