# **1. Getting Started**
___

## **1.1 Installing dependencies**

In [None]:
pip install evaluate pandas sentence_transformers torch tqdm transformers bert_score rouge_score peft datasets numpy

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux20

# **2. Data Preprocessing**

## **The datasets**

1. *genZ_slangs(1.7k_rows)_edited* (created by creating a context column from the genz_slang_dataset using ChatGPT)

2. *genz_slang_dataset* (from https://huggingface.co/datasets/MLBtrio/genz-slang-dataset)

## **CSV curation/conversion into fine-tuning format**

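Combines the two CSVs row by row (rows are matched by position): the example, description and context columns are read from the *genz_slang_dataset* and the *Proper English* target is read from the *genZ_slangs(1.7k_rows)_edited*, to create the *genz_finetune_combined_dataset* (which is formatted for the finetuning)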

genz_finetune_combined_dataset is saved in *"dataset/"* folder


In [None]:
import pandas as pd

# === Load Datasets ===
# df1 supplies the formal-English target; df2 supplies example/description/context.
df1 = pd.read_csv("dataset/genZ_slangs(1.7k_rows)_edited.csv")
df2 = pd.read_csv("dataset/genz_slang_dataset.csv")

# === Clean column names ===
df1.columns = df1.columns.str.strip().str.lower()
df2.columns = df2.columns.str.strip().str.lower()

# === Sanity check: align by index and length ===
# Rows are paired by position, so both frames are cut to the shorter length.
min_len = min(len(df1), len(df2))
df1 = df1.head(min_len).reset_index(drop=True)
df2 = df2.head(min_len).reset_index(drop=True)


def _clean_cell(value):
    """Return the cell as a stripped string, or "" when the cell is missing.

    A plain str() would turn a missing value (NaN) into the text "nan", which is
    truthy and would slip through the emptiness check below.
    """
    return "" if pd.isna(value) else str(value).strip()


# === Combine example + description + context from df2, and target from df1 ===
combined_data = []
for i in range(min_len):
    example = _clean_cell(df2.loc[i, "example"])
    description = _clean_cell(df2.loc[i, "description"])
    context = _clean_cell(df2.loc[i, "context"])
    target = _clean_cell(df1.loc[i, "proper english"])

    # A usable training pair needs at least the slang sentence and its reference.
    if example and target:
        input_text = (
            f"Paraphrase this Gen-Z sentence into formal English:\n"
            f"Example: {example}\n"
            f"Description: {description}\n"
            f"Context: {context}"
        )
        combined_data.append({"input": input_text, "target": target})

# === Save to CSV ===
combined_df = pd.DataFrame(combined_data)
output_path = "dataset/genz_finetune_combined_dataset.csv"
combined_df.to_csv(output_path, index=False)
print(f"Combined dataset saved to: {output_path}")

# **3. Creating the Evaluation Methods**
___


Mean Cosine Similarity : The average cosine similarity of all preds<=>refs\
Mean METEOR : The average METEOR score of all preds<=>refs\
Mean Bert Precision : The average BERT precision of all preds<=>refs\
Mean Bert Recall : The average BERT recall of all preds<=>refs\
Mean Bert f1 : The average BERT f1 of all preds<=>refs\
Mean Rouge-1 : The average Rouge-1 (1-gram) of all preds<=>refs\
Mean Rouge-2 : The average Rouge-2 (2-gram) of all preds<=>refs\
Mean Rouge-L : The average Rouge-L (longest common subsequence) of all preds<=>refs\
Geometric Mean BLEU : The BLEU score over all preds<=>refs (brevity penalty × geometric mean of the 1-gram to bleu_n-gram precisions)\
Mean BLEU-n : The modified n-gram precision over all preds<=>refs (one value for each n from 1 to bleu_n)\

In [None]:
from sentence_transformers import SentenceTransformer, util
from evaluate import load

# Initialise evaluation objects
# Sentence-embedding model used by cosine_similarity() below.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Metric objects obtained through evaluate.load; each one is called by the
# matching *_score helper defined below.
bleu = load("bleu")
meteor = load("meteor")
bertscore = load("bertscore")
rouge = load("rouge")

def cosine_similarity(preds, refs):
    """Return the cosine similarity of each prediction with its own reference.

    Args:
        preds: sequence of predicted sentences (list, tuple or pandas Series).
        refs: sequence of reference sentences, same length as `preds`.

    Returns:
        A list of floats, one per (pred, ref) pair, in input order.

    Raises:
        ValueError: if `preds` and `refs` differ in length.
    """
    # Plain lists give positional access regardless of any pandas index.
    preds = list(preds)
    refs = list(refs)
    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length, got {len(preds)} and {len(refs)}"
        )
    if not preds:
        return []

    pred_emb = embedder.encode(preds, convert_to_tensor=True)
    ref_emb = embedder.encode(refs, convert_to_tensor=True)
    # Only the matching pairs are needed, so skip the full N x N matrix.
    return util.pairwise_cos_sim(pred_emb, ref_emb).tolist()

def bleu_score(preds, refs, n=2):
    """Run the module-level `bleu` metric with smoothing enabled and n-grams up to `n`.

    The dict returned by `bleu.compute` is handed back unchanged.
    """
    return bleu.compute(
        predictions=preds,
        references=refs,
        max_order=n,
        smooth=True,
    )

def meteor_score(preds, refs):
    """Run the module-level `meteor` metric on the given predictions and references.

    The dict returned by `meteor.compute` is handed back unchanged.
    """
    return meteor.compute(references=refs, predictions=preds)

def bert_score(preds, refs):
    """Run the module-level `bertscore` metric (English, distilbert-base-uncased).

    The dict returned by `bertscore.compute` is handed back unchanged.
    """
    return bertscore.compute(
        predictions=preds,
        references=refs,
        model_type="distilbert-base-uncased",
        lang='en',
    )

def rouge_score(preds, refs):
    """Run the module-level `rouge` metric on the given predictions and references.

    The dict returned by `rouge.compute` is handed back unchanged.
    """
    return rouge.compute(references=refs, predictions=preds)

def eval_model(model_name, data, bleu_n=2):
    """Score one model's predictions with every metric helper in this notebook.

    Args:
        model_name: str, label stored in the 'Model name' column of the result.
        data: DataFrame (or mapping) with a 'preds' and a 'refs' column.
        bleu_n: int, highest n-gram order passed to bleu_score (default 2).

    Returns:
        A one-row pandas DataFrame holding the model name and the metric values.

    Raises:
        ValueError: if `data` contains no prediction/reference pairs.
    """
    # Hand plain lists to the metric libraries so nothing depends on the
    # DataFrame's index labels (e.g. after filtering rows).
    preds = list(data['preds'])
    refs = list(data['refs'])
    length = len(preds)
    if length == 0:
        raise ValueError("eval_model needs at least one prediction/reference pair")

    cos_sim_eval_score = cosine_similarity(preds=preds, refs=refs)
    bleu_eval_score = bleu_score(preds=preds, refs=refs, n=bleu_n)
    METEOR_eval_score = meteor_score(preds=preds, refs=refs)
    Bert_eval_score = bert_score(preds=preds, refs=refs)
    rouge_eval_score = rouge_score(preds=preds, refs=refs)

    # Cosine similarity and BERTScore come back per sentence and are averaged
    # here; the other metrics are read directly from the returned dicts.
    eval_scores = {
        'Model name': model_name,
        'Mean Cosine Similarity': sum(cos_sim_eval_score)/length,
        'Mean METEOR' : METEOR_eval_score['meteor'],
        'Mean Bert f1' : sum(Bert_eval_score['f1'])/length,
        'Mean Rouge-L' : rouge_eval_score['rougeL'],
        'Geometric Mean BLEU' : bleu_eval_score['bleu'],
    }

    # One column per n-gram order, taken from the metric's 'precisions' list.
    for i in range(bleu_n):
        eval_scores[f'Mean BLEU-{i+1}'] = bleu_eval_score['precisions'][i]

    return pd.DataFrame([eval_scores])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# **4. Zero-Shot Testing**

___

Evaluating the base models' capabilities to understand gen-z slang (testing against *genz_finetune_combined_dataset*)

## **4.1 Loading the dataset**

In [None]:
# Zero-Shot Evaluation of Gen-Z to Formal English Translation

## Load Required Libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm

## Load Your Dataset
# The combined CSV written by the preprocessing cell has two columns:
# 'input' (the full prompt) and 'target' (the formal-English reference).
# `pd` is the pandas import from the preprocessing cell.
df = pd.read_csv("dataset/genz_finetune_combined_dataset.csv")

In [None]:
# Quick look at the column names of the loaded dataset
print(list(df.columns))

['Slang', 'Description', 'Example', 'Proper English']


## **4.2 Creating the method and running inference**

In [None]:
## Zero-Shot Inference Function
def evaluate_model(model_name, df, limit=100):
    """Generate with `model_name` on the first `limit` rows of `df` and score the output.

    Args:
        model_name: model id or path loadable with AutoModelForSeq2SeqLM.
        df: DataFrame with an 'input' (prompt) and a 'target' (reference) column.
        limit: maximum number of leading rows to run (default 100).

    Returns:
        The one-row DataFrame produced by eval_model for this model.
    """
    print(f"\nEvaluating: {model_name}")
    # Same device selection as the inference cell later in this notebook.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model.eval()

    subset = df.head(limit)
    preds, refs = [], []
    # total is taken from the subset so the bar stays correct when df has
    # fewer than `limit` rows.
    for _, row in tqdm(subset.iterrows(), total=len(subset)):
        inputs = tokenizer(row['input'], return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            output = model.generate(**inputs, max_length=60)
        preds.append(tokenizer.decode(output[0], skip_special_tokens=True))
        refs.append(row["target"])

    # eval_model expects a frame with 'preds' and 'refs' columns.
    data = pd.DataFrame({'refs': refs, 'preds': preds})
    return eval_model(model_name=model_name, data=data, bleu_n=2)

## Evaluate Multiple Models
# Checkpoints compared in the zero-shot benchmark
models_to_test = [
    "google/flan-t5-base",
    "google/flan-t5-large",
    "google/t5-v1_1-base",
    "facebook/bart-large",
]

# One single-row result frame per checkpoint, then stacked into one table
results = [evaluate_model(model_name, df) for model_name in models_to_test]
summary = pd.concat(results, ignore_index=True)
print("\nüìä Summary of Results:")
print(summary)


Evaluating: google/flan-t5-base


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [04:12<00:00,  2.53s/it]


Average Semantic Similarity: 0.4408

Evaluating: google/flan-t5-large


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [08:08<00:00,  4.88s/it]


Average Semantic Similarity: 0.4617

Evaluating: google/t5-v1_1-base


tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:06<10:58,  6.66s/it][A
  2%|‚ñè         | 2/100 [00:12<10:00,  6.13s/it][A
  3%|‚ñé         | 3/100 [00:15<08:00,  4.96s/it][A
  4%|‚ñç         | 4/100 [00:18<06:06,  3.82s/it][A
  5%|‚ñå         | 5/100 [00:20<05:28,  3.46s/it][A
  6%|‚ñå         | 6/100 [00:26<06:20,  4.04s/it][A
  7%|‚ñã         | 7/100 [00:30<06:13,  4.02s/it][A
  8%|‚ñä         | 8/100 [00:32<05:26,  3.55s/it][A
  9%|‚ñâ         | 9/100 [00:35<05:03,  3.33s/it][A
 10%|‚ñà         | 10/100 [00:39<05:25,  3.62s/it][A
 11%|‚ñà         | 11/100 [00:44<05:47,  3.91s/it][A
 12%|‚ñà‚ñè        | 12/100 [00:48<05:46,  3.93s/it][A
 13%|‚ñà‚ñé        | 13/100 [00:53<06:15,  4.31s/it][A
 14%|‚ñà‚ñç        | 14/100 [00:57<06:04,  4.24s/it][A
 15%|‚ñà‚ñå        | 15/100 [01:01<06:02,  4.26s/it][A
 16%|‚ñà‚ñå        | 16/100 [01:04<05:19,  3.81s/it][A
 17%|‚ñà‚ñã        | 17/100 [01:09<05:47,  4.19s/it][A
 18%|‚ñà‚ñä        | 18/100 [01:14<05

Average Semantic Similarity: 0.1447

Evaluating: facebook/bart-large


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:12<21:07, 12.80s/it][A
  2%|‚ñè         | 2/100 [00:26<21:39, 13.26s/it][A
  3%|‚ñé         | 3/100 [00:39<21:37, 13.37s/it][A
  4%|‚ñç         | 4/100 [00:48<18:15, 11.41s/it][A
  5%|‚ñå         | 5/100 [01:02<19:55, 12.59s/it][A
  6%|‚ñå         | 6/100 [01:12<18:05, 11.55s/it][A
  7%|‚ñã         | 7/100 [01:22<17:13, 11.11s/it][A
  8%|‚ñä         | 8/100 [01:31<15:51, 10.34s/it][A
  9%|‚ñâ         | 9/100 [01:41<15:44, 10.38s/it][A
 10%|‚ñà         | 10/100 [01:52<15:38, 10.43s/it][A
 11%|‚ñà         | 11/100 [02:06<16:58, 11.45s/it][A
 12%|‚ñà‚ñè        | 12/100 [02:16<16:29, 11.24s/it][A
 13%|‚ñà‚ñé        | 13/100 [02:28<16:33, 11.42s/it][A
 14%|‚ñà‚ñç        | 14/100 [02:42<17:25, 12.16s/it][A
 15%|‚ñà‚ñå        | 15/100 [02:52<16:20, 11.54s/it][A
 16%|‚ñà‚ñå        | 16/100 [03:06<16:54, 12.08s/it][A
 17%|‚ñà‚ñã        | 17/100 [03:19<17:18, 12.52s/it][A
 18%|‚ñà‚ñä        | 18/100 [03:33<17

Average Semantic Similarity: 0.3745

üìä Summary of Results:
                  Model  Avg Semantic Similarity
0   google/flan-t5-base                 0.440839
1  google/flan-t5-large                 0.461683
2   google/t5-v1_1-base                 0.144678
3   facebook/bart-large                 0.374488


# **5. Finetuning**

___

## **5.1 Dataset Preparation**

Cleaning and splitting the dataset

In [None]:
# Fine-Tuning FLAN-T5 on Gen-Z to Formal English Translation

## Import Libraries
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType

## Load and Prepare Dataset
df = pd.read_csv("dataset/genz_finetune_combined_dataset.csv")

# Strip stray whitespace from the header names
df.columns = df.columns.str.strip()
# Drop rows whose prompt or reference is missing or blank
df = df.dropna(subset=["input", "target"])
df = df[df["input"].str.strip() != ""]
df = df[df["target"].str.strip() != ""]

# Keep only the two training columns and renumber the rows, so the gaps left
# by the filters above do not travel along as an index.
df = df[["input", "target"]].reset_index(drop=True)

# preserve_index=False keeps the pandas index out of the Dataset's columns
dataset = Dataset.from_pandas(df, preserve_index=False)

# Train/Validation split (fixed seed so the split is the same on every run)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m491.2/491.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.1/61.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m183.9/1

## **5.2 Finetuning the model on the *genz_finetune_combined_dataset***

Running inference and evaluating the model's performance at every epoch

In [None]:
import numpy as np

## Tokenization
# Base checkpoint that is fine-tuned below; the same name is used to load the model.
model_name = "google/flan-t5-base"
# AutoTokenizer comes from the imports of the zero-shot section.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    """Tokenize one {'input', 'target'} record for seq2seq training.

    Args:
        example: mapping with an 'input' (prompt) and a 'target' (reference) entry.

    Returns:
        The tokenizer output for the prompt with an added "labels" list, or an
        empty dict when the prompt or the target is blank.
    """
    input_text = str(example['input']).strip()
    target_text = str(example['target']).strip()

    if input_text == "" or target_text == "":
        return {}

    model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=300)
    labels = tokenizer(target_text, truncation=True, padding="max_length", max_length=300)

    # Padding positions in the labels are set to -100 so they are ignored by the
    # loss; otherwise most of each 300-token label would train the model to
    # emit padding. compute_metrics maps -100 back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]
    ]
    return model_inputs

def _tokenize_split(split):
    """Apply preprocess to every row of `split` and keep the rows that carry a "labels" entry."""
    mapped = split.map(preprocess, batched=False)
    return mapped.filter(lambda row: "labels" in row)


tokenized_train = _tokenize_split(train_dataset)
tokenized_val = _tokenize_split(val_dataset)

## Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# LoRA adapter settings for the attention projections named q, k and v
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "k", "v"],
)

model = get_peft_model(model, lora_config)


## Metric Functions
# meteor = load("meteor")
# bertscore = load("bertscore")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the notebook's metric helpers.

    Args:
        eval_preds: (predictions, label_ids) pair handed over by the trainer.

    Returns:
        dict mapping metric name to value. Every return path uses the same
        keys; all values are 0.0 when decoding fails or no non-empty
        (prediction, reference) pair remains.
    """
    bleu_n = 2

    # Key set shared by the normal result and by both fallback returns.
    zero_metrics = {
        "Mean Cosine Similarity": 0.0,
        "Mean METEOR": 0.0,
        "Mean Bert f1": 0.0,
        "Mean Rouge-L": 0.0,
        "Geometric Mean BLEU": 0.0,
    }
    for i in range(bleu_n):
        zero_metrics[f"Mean BLEU-{i+1}"] = 0.0

    preds, labels = eval_preds
    # Predictions may arrive wrapped in a tuple; the token ids are the first item.
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 markers (ignored/padded positions) with the pad id so both
    # arrays can be decoded.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id).astype(np.int32)
    labels = np.where(labels != -100, labels, pad_id)

    # Hard clip values to valid token range
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)
    labels = np.clip(labels, 0, tokenizer.vocab_size - 1)

    # Defensive decode with try/except to avoid corrupt entries
    try:
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    except OverflowError:
        print("‚ö†Ô∏è Skipping decoding due to invalid token ids")
        return dict(zero_metrics)

    # Strip whitespace and keep only pairs where both sides are non-empty
    paired = [
        (p.strip(), l.strip())
        for p, l in zip(decoded_preds, decoded_labels)
        if p.strip() and l.strip()
    ]
    if not paired:
        return dict(zero_metrics)

    filtered_preds, filtered_labels = zip(*paired)
    length = len(filtered_preds)

    bleu_eval = bleu_score(preds=filtered_preds, refs=filtered_labels, n=bleu_n)
    meteor_eval = meteor_score(preds=filtered_preds, refs=filtered_labels)
    bert_eval = bert_score(preds=filtered_preds, refs=filtered_labels)
    rouge_eval = rouge_score(preds=filtered_preds, refs=filtered_labels)
    cos_sim_eval = cosine_similarity(preds=filtered_preds, refs=filtered_labels)

    # Package results
    result = {
        "Mean Cosine Similarity": sum(cos_sim_eval) / length,
        "Mean METEOR": meteor_eval["meteor"],
        "Mean Bert f1": sum(bert_eval["f1"]) / length,
        "Mean Rouge-L": rouge_eval["rougeL"],
        "Geometric Mean BLEU": bleu_eval["bleu"],
    }

    for i in range(bleu_n):
        result[f"Mean BLEU-{i+1}"] = bleu_eval["precisions"][i]

    return result

## Training Arguments
# Evaluation and checkpointing both happen once per epoch; metrics are logged
# every 10 steps.
# NOTE(review): no `report_to` is set; the recorded run prompted for a
# Weights & Biases API key — confirm that wandb logging is wanted.
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-genz",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    # keep only one checkpoint on disk
    save_total_limit=1,
    # evaluate by generating text, so compute_metrics can decode token ids
    predict_with_generate=True,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True
)

## Data Collator
# Batches the tokenized examples; it is given the (PEFT-wrapped) model as well.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## Initialize Trainer
# compute_metrics (defined above) is run on the validation split at each evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

## Model Trainer
trainer.train()

# Replace the PEFT wrapper with the result of merge_and_unload(); the next
# section saves this `model` object.
model = model.merge_and_unload()

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/1601 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1601 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Filter:   0%|          | 0/178 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtangignatius[0m ([33mtangignatius-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Mean cosine similarity,Mean meteor,Mean bert f1,Mean rouge-l,Geometric mean bleu,Mean bleu-1,Mean bleu-2
1,33.8871,33.362797,0.310333,0.153427,0.707244,0.148471,0.091422,0.15493,0.053947
2,23.3505,21.656569,0.253672,0.166204,0.678224,0.13178,0.143217,0.211409,0.148289
3,12.2421,9.501206,0.261301,0.169498,0.663132,0.148214,0.153469,0.252632,0.227848
4,6.2792,5.044898,0.162795,0.042807,0.627086,0.041246,0.019647,0.092593,0.055762
5,4.9327,4.601783,0.183241,0.071663,0.60524,0.063198,0.028786,0.192982,0.166667
6,4.5665,4.383889,0.237088,0.093948,0.661762,0.107674,0.044567,0.293706,0.252427
7,4.3764,4.225906,0.305202,0.138795,0.689021,0.172433,0.087283,0.317365,0.236641
8,4.2477,4.044216,0.323662,0.161257,0.706469,0.193898,0.109076,0.301533,0.185804
9,4.1364,3.902737,0.336236,0.186305,0.708995,0.216511,0.135292,0.323116,0.202454
10,4.1002,3.845811,0.348324,0.19017,0.711993,0.228831,0.143605,0.327251,0.202899


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# **6. Saving the model weights after finetuning**

___

In [None]:
# Write the model and its tokenizer into the same directory; the inference
# section and the Streamlit app load both from this path.
model.save_pretrained("model/flan-t5-genz-final")
tokenizer.save_pretrained("model/flan-t5-genz-final")

('/content/drive/MyDrive/Colab Notebooks/NLP/fine-tuned models/flan-t5-genz-final/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/fine-tuned models/flan-t5-genz-final/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/fine-tuned models/flan-t5-genz-final/spiece.model',
 '/content/drive/MyDrive/Colab Notebooks/NLP/fine-tuned models/flan-t5-genz-final/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP/fine-tuned models/flan-t5-genz-final/tokenizer.json')

# **7. Running inference on the finetuned model**
---

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# === Load Fine-Tuned Model ===
model_path = "model/flan-t5-genz-final"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# === Load Input Prompts for Testing ===
# Five randomly chosen prompts from the fine-tuning dataset
df = pd.read_csv("dataset/genz_finetune_combined_dataset.csv")
test_inputs = df['input'].dropna().sample(5).tolist()

# === Run Inference ===
for inp in test_inputs:
    encoded = tokenizer(inp, return_tensors="pt", truncation=True, padding=True).to(device)
    output = model.generate(**encoded, max_length=60)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"Prompt: {inp}")
    # A real newline ("\n", not "\\n") so samples are separated by a blank
    # line instead of a literal backslash-n being printed after each output.
    print(f"Output: {decoded}\n")


Prompt: Translate this Gen-Z sentence into formal English:
Example: You got double sixes, NR!
Description: Nice roll
Context: A compliment used in dice games when someone gets a good result.
Output: :good\n
Prompt: Translate this Gen-Z sentence into formal English:
Example: BRB, BAK in 5.
Description: Back at keyboard
Context: Used to inform someone that you are back at your computer after being away.
Output: \n
Prompt: Translate this Gen-Z sentence into formal English:
Example: The letter had SWALK written on the envelope.
Description: Sealed with a loving kiss
Context: A more affectionate version of SWAK, often used in love letters.
Output: :SALK written on the envelope.\n
Prompt: Translate this Gen-Z sentence into formal English:
Example: I have such FOMO, everyone‚Äôs going to that concert except me.
Description: Fear of missing out
Context: Refers to the feeling of anxiety or regret over missing an event or experience.
Output: \n
Prompt: Translate this Gen-Z sentence into formal E

# **Streamlit for visualisation**

___

#### **Installing dependencies**

In [None]:
!pip install -q streamlit pyngrok

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.8/9.8 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m79.1/79.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## **Streamlit**

Ensure the saved finetuned model is present in the *"model/"* folder

In [None]:
%%writefile app.py
import streamlit as st

@st.cache_resource
def load_model():
  model_path = "model/flan-t5-genz-final"
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")
  model.eval()
  return model, tokenizer

def translate_genz(model,tokenizer, text):
  prompt = f"Paraphrase this Gen-Z sentence into formal English:\n{text}"
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)
  output = model.generate(**inputs, max_length=100)
  return tokenizer.decode(output[0], skip_special_tokens=True)

if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False

st.title("Gen-Z Translator")
prompt = st.text_input("Enter a Gen-Z sentence")

if st.button("Translate"):
    if not st.session_state.model_loaded:
        st.session_state.model, st.session_state.tokenizer = load_model()
        st.session_state.model_loaded = True
    if st.session_state.model_loaded:
        translation = translate_genz(st.session_state.model, st.session_state.tokenizer, prompt)
        st.write(translation)


Writing app.py


## **Running Streamlit**

Replace the placeholder in the cell below with your own ngrok auth_token before running it

Instructions for how to get the auth_token : https://ngrok.com/docs/agent/

Click on the first link (left) to launch the Streamlit client

In [None]:
from pyngrok import ngrok

# Placeholder only — NOTE(review): avoid saving a real token in the notebook.
auth_token = 'Your auth_token' # Replace with your own auth_token

ngrok.set_auth_token(auth_token)

# Start the Streamlit app in the background on port 80, then open an ngrok
# tunnel to that same port and print the tunnel object (it shows the public URL).
!nohup streamlit run app.py --server.port 80 &
url = ngrok.connect(addr = '80')
print(url)

nohup: appending output to 'nohup.out'
NgrokTunnel: "https://661a-35-230-54-122.ngrok-free.app" -> "http://localhost:80"
