In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/en-np-translation/train.ne
/kaggle/input/en-np-translation/train.en


In [2]:
%%bash
pip install -q datasets sacrebleu torch transformers sentencepiece transformers[sentencepiece]
pip install -q accelerate -U
pip install -q wandb
pip install -q evaluate

In [8]:
import torch
import pandas as pd
import os
import numpy as np
import sacrebleu
import datasets
import warnings
import random
import re
import matplotlib.pyplot as plt
import evaluate
import wandb

from tqdm import tqdm
from collections import Counter

from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
from transformers import pipeline, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

warnings.filterwarnings("ignore")

In [11]:
config = {
    'EPOCHS': 10,
    'BLEU': 'bleu',
    'SRC_LANG_CODE': 'en',
    'TGT_LANG_CODE': 'np',
    'SRC_LANGUAGE': 'English',
    'TGT_LANGUAGE': 'Nepali',
    'SRC_TRAIN_RAW_PATH': '/kaggle/input/en-np-translation/train.en',
    'TGT_TRAIN_RAW_PATH': '/kaggle/input/en-np-translation/train.ne',
    'MAX_SEQ_LEN': 32,
    'BATCH_SIZE': 32,
    'DEVICE': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'MODEL_ID': 't5-small',
    'ENC_TYPE': 'utf-8',
    'VOCAB_SIZE': 32000,
    'MAX_SOURCE_LENGTH': 96,
    'MAX_TARGET_LENGTH': 96,
    'MAX_GEN_LENGTH': 128,
    'WEIGHT_DECAY': 0.01,
    'LR': 5e-5
}

In [10]:
with open(config['SRC_TRAIN_RAW_PATH'], 'r', encoding = config['ENC_TYPE']) as f:
    en_lines = f.readlines()

with open(config['TGT_TRAIN_RAW_PATH'], 'r', encoding = config['ENC_TYPE']) as f:
    ne_lines = f.readlines()

data_df = pd.DataFrame(list(zip(en_lines, ne_lines)), columns = [config['SRC_LANG_CODE'], config['TGT_LANG_CODE']])
data_df[config['SRC_LANG_CODE']] = data_df[config['SRC_LANG_CODE']].str.replace('\n', '', regex = False)
data_df[config['TGT_LANG_CODE']] = data_df[config['TGT_LANG_CODE']].str.replace('\n', '', regex = False)

del en_lines
del ne_lines

# instruction fine-tuning
def generate_translate_prompt(row):
    src_text = row[config['SRC_LANG_CODE']]
    translate_prompt = f"Translate the given sentence from {config['SRC_LANGUAGE']} to {config['TGT_LANGUAGE']}: \"{src_text}\""
    return translate_prompt

data_df['input_prompt'] = data_df.apply(generate_translate_prompt, axis = 1)

train_val_df, test_df = train_test_split(data_df, test_size = 0.05, random_state = 69)
train_df, val_df = train_test_split(train_val_df, test_size = 0.15, random_state = 69)

# huggingface dataset object
train_ds_raw = Dataset.from_pandas(train_df, split = 'train')
validation_ds_raw = Dataset.from_pandas(val_df, split = 'validation')
test_ds_raw = Dataset.from_pandas(test_df, split = 'test')

In [12]:
tokenizer = AutoTokenizer.from_pretrained(config['MODEL_ID'])
model = AutoModelForSeq2SeqLM.from_pretrained(config['MODEL_ID'])
model.to(config['DEVICE'])

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [14]:
t5_translation_prompt = 'translate English to Nepali:'

def tokenize_function(batch):
    source_inputs = [f"{t5_translation_prompt} {example}" for example in batch['en']]
    target_outputs = [example for example in batch['np']]
    return tokenizer(source_inputs, text_target = target_outputs, max_length = 256, truncation = True, padding = 'max_length')

# tokenize dataset in batch for speed
train_dataset = train_ds_raw.map(tokenize_function, batched=True, remove_columns = ['en', 'np', 'input_prompt', '__index_level_0__'])
validation_dataset = validation_ds_raw.map(tokenize_function, batched=True, remove_columns = ['en', 'np', 'input_prompt', '__index_level_0__'])
test_dataset = test_ds_raw.map(tokenize_function, batched=True, remove_columns = ['en', 'np', 'input_prompt', '__index_level_0__'])

train_dataset, validation_dataset, test_dataset

Map:   0%|          | 0/122689 [00:00<?, ? examples/s]

Map:   0%|          | 0/21651 [00:00<?, ? examples/s]

Map:   0%|          | 0/7597 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 122689
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 21651
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 7597
 }))

In [15]:
# import metric
bleu_metric = evaluate.load('bleu')

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    # decode predicted sentence and skip special tokens
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # add padding (-100 = invalid token) and decode predicted target labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # postprocess for bleu metric
    post_decoded_preds = [pred.strip() for pred in decoded_preds]
    post_decoded_labels = [[label.strip()] for label in decoded_labels]
    
    # compute blue score
    result = bleu_metric.compute(predictions=post_decoded_preds, references=post_decoded_labels)
    
    return {'bleu': result['bleu']}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [16]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = model,
    pad_to_multiple_of = 8
)

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir = '/kaggle/working/fine-tuned-translation-en-np',
    optim = 'adamw_torch',
    num_train_epochs = 3,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    load_best_model_at_end=True,
    report_to = 'none',
    fp16 = True,
    predict_with_generate = True,
)

# we setup trainer with all previous variables
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [18]:
trainer.evaluate(test_dataset)

{'eval_loss': 11.13097858428955,
 'eval_bleu': 0.007495560437934589,
 'eval_runtime': 187.9439,
 'eval_samples_per_second': 40.422,
 'eval_steps_per_second': 2.527}

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,0.0281,0.025754,0.031191
2,0.0256,0.024373,0.054622


In [None]:
trainer.save_model("FineTunedTransformer")

In [None]:
from transformers import pipeline

translator = pipeline('translation_en_to_ne', model=model.to('cpu'), tokenizer=tokenizer)

In [None]:
text = "translate English to French: I like this machine learning notebook."
translator(text)