In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras 
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel, TFAutoModel, AutoModelForSeq2SeqLM, Trainer, Seq2SeqTrainingArguments,TrainingArguments,DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict, Dataset, Features, Value, Translation, load_from_disk, concatenate_datasets
from huggingface_hub import list_datasets, notebook_login,login
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch.nn.functional import cross_entropy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

2025-06-19 09:22:07.950547: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Model Training #

### Load pre-trained model and tokenizer ###

In [None]:
# Local directories that hold the pre-trained tokenizer and model.
# NOTE(review): the one-time bootstrap below must write to exactly these directories,
# otherwise the offline load further down cannot find the files.
PRETRAINED_TOKENIZER_DIR = "/Users/haochunniu/Desktop/Python/Transformer/pretrained_tokenize"
PRETRAINED_MODEL_DIR = "/Users/haochunniu/Desktop/Python/Transformer/pretrained_model"

# One-time bootstrap (needs network access): download t5-small and store it locally.
#AutoTokenizer.from_pretrained("google-t5/t5-small").save_pretrained(PRETRAINED_TOKENIZER_DIR)
#AutoModelForSeq2SeqLM.from_pretrained("t5-small").save_pretrained(PRETRAINED_MODEL_DIR)

# Load both artifacts from disk only (local_files_only=True restricts loading to local files)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TOKENIZER_DIR, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_MODEL_DIR, local_files_only=True)


### Add new tokens to the existing pre-trained tokenizer ###
In practice, a pre-trained tokenizer may already cover the language we are using. Yet, in some special cases, we might want to ***add some special words to our tokenizer's vocabulary***. After adding these new words, ***the tokenizer will recognize them and split the sequence correctly***.

In [None]:
### Create a list of new tokens that need to be added (np.unique de-duplicates and sorts them)
new_words = np.unique(['Eric','Amy','SS'])

### Call the vocabulary from the pretrained tokenizer and check if the word is already in there
### If the word is already in the tokenizer's vocabulary, we don't need to add.
### SentencePiece-based vocabularies (such as T5's) store a word that starts after a space
### as "\u2581" + word, so both spellings are checked. Re-adding a word the tokenizer already
### knows would replace its pre-trained embedding with a freshly initialised one.
SPIECE_UNDERLINE = "\u2581"
existing_vocab = tokenizer.get_vocab()
tokens_to_add = [
    str(tok)  # plain Python str instead of numpy.str_
    for tok in new_words
    if str(tok) not in existing_vocab and SPIECE_UNDERLINE + str(tok) not in existing_vocab
]

### Add the tokens to tokenizer
tokenizer.add_tokens(tokens_to_add)

### Save the updated tokenizer
tokenizer.save_pretrained("/Users/haochunniu/Desktop/Python/Transformer/updated_tokenize")

### Update the embedding size so the model has one embedding row per tokenizer entry
model.resize_token_embeddings(len(tokenizer))

### Save the updated model
model.save_pretrained("/Users/haochunniu/Desktop/Python/Transformer/updated_model")

### Load the updated tokenizer and model
updated_tokenizer = AutoTokenizer.from_pretrained("/Users/haochunniu/Desktop/Python/Transformer/updated_tokenize",local_files_only=True)
updated_model = AutoModelForSeq2SeqLM.from_pretrained("/Users/haochunniu/Desktop/Python/Transformer/updated_model",local_files_only=True)

### Read the train, validation and test CSV files ###

In [None]:
# Read the three splits in one pass; the order of the tuple matches the order of the targets
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/haochunniu/Desktop/Python/Transformer/train.csv",
        "/Users/haochunniu/Desktop/Python/Transformer/val.csv",
        "/Users/haochunniu/Desktop/Python/Transformer/test.csv",
    )
)

### Tokenize train, test, and validation data ###

In [None]:
### Create tokenization function
def batch_tokenize(batch, max_length=80):
    """Tokenize source and target text and attach the target ids as training labels.

    Works for a single row (string fields) as well as for a batch (lists of strings),
    so it can be used with ``Dataset.map`` with or without ``batched=True``.

    Args:
        batch: Mapping with the keys ``"original_lang_text"`` and ``"target_lang_text"``.
        max_length: Length every sequence is padded/truncated to. It should cover the
            longest text in the data.

    Returns:
        The source encoding produced by ``updated_tokenizer`` with an extra ``"labels"``
        entry holding the target ids, where every padding position is set to -100.
    """
    input_enc = updated_tokenizer(batch["original_lang_text"],padding="max_length",truncation=True,max_length=max_length)
    target_enc = updated_tokenizer(batch["target_lang_text"],padding="max_length",truncation=True,max_length=max_length)

    # Padding positions must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores, whereas the raw pad id would be trained on.
    pad_id = updated_tokenizer.pad_token_id
    label_ids = target_enc["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of ids per example
        labels = [[-100 if tok == pad_id else tok for tok in seq] for seq in label_ids]
    else:
        # Single-row call: one flat list of ids
        labels = [-100 if tok == pad_id else tok for tok in label_ids]

    input_enc["labels"] = labels
    return input_enc

### Tokenize each dataframe
def _tokenize_frame(frame):
    """Convert a dataframe to a Dataset, tokenize every row and drop the original columns."""
    as_dataset = Dataset.from_pandas(frame, preserve_index=False)
    return as_dataset.map(batch_tokenize, remove_columns=list(frame.columns))

train_dataset = _tokenize_frame(train_df)
val_dataset = _tokenize_frame(val_df)
test_dataset = _tokenize_frame(test_df)

### Where each tokenized split is stored on disk
split_dirs = {
    "train": "/Users/haochunniu/Desktop/Python/Transformer/dataset/train",
    "val": "/Users/haochunniu/Desktop/Python/Transformer/dataset/val",
    "test": "/Users/haochunniu/Desktop/Python/Transformer/dataset/test",
}

### Save the dataset locally
train_dataset.save_to_disk(split_dirs["train"])
val_dataset.save_to_disk(split_dirs["val"])
test_dataset.save_to_disk(split_dirs["test"])

### Load the dataset from local
train_dataset = load_from_disk(split_dirs["train"])
val_dataset = load_from_disk(split_dirs["val"])
test_dataset = load_from_disk(split_dirs["test"])

### Create the data collator based on tokenizer ###

In [None]:
# Collator that assembles individual tokenized examples into seq2seq training batches
data_collator = DataCollatorForSeq2Seq(tokenizer=updated_tokenizer)

### Train the model ###

In [None]:
CHECKPOINTS_DIR = "/Users/haochunniu/Desktop/Python/Transformer/final_model/save_checkpoints"
LOGGING_DIR = "/Users/haochunniu/Desktop/Python/Transformer/logs"

# Setup GPU if available.
# The device type must be spelled "cuda"; any other spelling makes torch.device raise
# as soon as a GPU is actually present. `device` is reused by the inference cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training Arguments
# Logging, evaluation and checkpointing all run on a step schedule. The save interval
# matches the logging interval of 1000 steps; no explicit eval_steps is given here.
training_args = Seq2SeqTrainingArguments(
    output_dir=CHECKPOINTS_DIR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    learning_rate=3e-4,
    num_train_epochs=10,
    logging_dir=LOGGING_DIR,
    logging_steps=1000,
    logging_strategy="steps",
    save_steps=1000,
    save_strategy="steps",
    eval_strategy="steps",
    load_best_model_at_end=True  # reload the best checkpoint once training finishes
)

trainer = Trainer(
    model=updated_model,
    tokenizer=updated_tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

# Start training
trainer.train()
#trainer.train(resume_from_checkpoint=True) #Sometimes we are not able to finish the entire training at once, by using this parameter we can force model to pickup the training from last saved checkpoint.

# Save the final best model
FINAL_MODEL_DIR = "/Users/haochunniu/Desktop/Python/Transformer/final_model/best_model"
trainer.save_model(FINAL_MODEL_DIR)

# Inferencing #

### Load the best model from the training result ###

In [None]:
# Load the fine-tuned weights saved at the end of training and place them on the chosen device
best_model_dir = "/Users/haochunniu/Desktop/Python/Transformer/final_model/best_model"
final_model = AutoModelForSeq2SeqLM.from_pretrained(best_model_dir).to(device)

### Create a function that would output predictions and token logit scores ###

In [None]:
def generate_translation_with_logit_scores(batch):
    """Translate one example and report how confident the model was in each generated token.

    Args:
        batch: Mapping with the key ``"original_lang_text"``. Only the first generated
            sequence is evaluated, so this is meant to be called one row at a time
            (e.g. ``Dataset.map`` without ``batched=True``).

    Returns:
        dict with
            ``"translation"``: decoded text without special tokens,
            ``"avg_logit_score"``: mean softmax probability of the generated non-special
                tokens (0.0 when there are none),
            ``"decoded_tokens"``: numpy array of the generated non-special tokens with the
                SentencePiece word-boundary marker removed.
    """

    # Tokenize input
    inputs = updated_tokenizer(batch["original_lang_text"],return_tensors="pt",padding=True,truncation=True,max_length=80).to(device)

    # Predict
    output = final_model.generate(
        **inputs,
        return_dict_in_generate=True,
        output_scores=True,
        max_length=80,
        output_hidden_states=False
    )

    predicted_ids = output.sequences[0]
    id_list = predicted_ids.tolist()  # plain ints for indexing and token lookup
    decoded_tokens = updated_tokenizer.convert_ids_to_tokens(id_list,skip_special_tokens=False)

    # Token level scores: softmax probability of the token that was actually emitted.
    # NOTE(review): row 0 of every score tensor is read, which presumably matches the
    # returned sequence only for greedy decoding (num_beams=1) - confirm the generation config.
    token_scores = []
    for step,logits in enumerate(output.scores):
        token_id = id_list[step+1] # Offset 1, because in t5 model the first token will always be the start token
        prob = F.softmax(logits,dim=-1)[0,token_id]
        token_scores.append(prob.item())

    # Build the lookup once instead of on every loop iteration
    special_tokens = set(updated_tokenizer.all_special_tokens)

    per_token_logit_score = []
    decoded_tokens2 = []
    for token,score in zip(decoded_tokens[1:],token_scores): # Skip first token
        if token not in special_tokens: # Skip the special tokens
            per_token_logit_score.append((token,score))
            # SentencePiece marks a leading blank with U+2581, not with an ASCII underscore,
            # so only that marker is stripped and genuine "_" characters are kept.
            decoded_tokens2.append(token.replace("\u2581",""))

    # Final string and average score
    decoded_text = updated_tokenizer.decode(predicted_ids,skip_special_tokens=True)
    avg_logit_score = sum(score for _, score in per_token_logit_score)/len(per_token_logit_score) if per_token_logit_score else 0.0

    return {"translation":decoded_text,"avg_logit_score":avg_logit_score,"decoded_tokens":np.asarray(decoded_tokens2)}
    

### Inference on test dataset ###

In [None]:
# The tokenized `test_dataset` had its raw text columns removed during tokenization, so it
# no longer has the "original_lang_text" field that the generation function reads.
# Run inference on a Dataset built from the raw test dataframe instead.
test_text_dataset = Dataset.from_pandas(test_df,preserve_index=False)
test_dataset_with_pred = test_text_dataset.map(generate_translation_with_logit_scores)
test_dataset_with_pred.save_to_disk("/Users/haochunniu/Desktop/Python/Transformer/dataset/test_with_pred")