#### 1.0 Importing necessary libraries

In [1]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import warnings
warnings.filterwarnings("ignore")

#### 1.1 loading the dataset

In [2]:
df = load_dataset("csv", data_files="./dataset/enswdataset.csv")

Found cached dataset csv (/home/starlabs/.cache/huggingface/datasets/csv/default-b6e27a0700ebebe3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

#### 1.2 Viewing the dataset information

In [3]:
print("Dataset object:\n\n", df)

Dataset object:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahii Translation'],
        num_rows: 8492
    })
})


#### 1.3 Viewing the split dataset information

In [4]:
split_df = df['train'].train_test_split(train_size=0.9, seed=20)

Loading cached split indices for dataset at /home/starlabs/.cache/huggingface/datasets/csv/default-b6e27a0700ebebe3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-82a620d671faa70e.arrow and /home/starlabs/.cache/huggingface/datasets/csv/default-b6e27a0700ebebe3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-0a0d7c836fcb93ae.arrow


In [5]:
print("\nSplit datasets:\n\n", split_df)


Split datasets:

 DatasetDict({
    train: Dataset({
        features: ['English sentence', 'Swahii Translation'],
        num_rows: 7642
    })
    test: Dataset({
        features: ['English sentence', 'Swahii Translation'],
        num_rows: 850
    })
})


#### 1.4  Loading the tokenizer and model

In [6]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-swc"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

#### 1.5 Set the maximum sequence length and define the preprocessing function


In [7]:
max_length = 128

In [8]:
def preprocess_function(examples):
    inputs = examples['English sentence']
    targets = examples['Swahii Translation']
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True
    )
    return model_inputs

#### 1.6 Preprocess the training and validation sets

In [9]:
train_dataset = split_df['train'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["English sentence", "Swahii Translation"]
)

Loading cached processed dataset at /home/starlabs/.cache/huggingface/datasets/csv/default-b6e27a0700ebebe3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-71dbe1ab21e3eb56_*_of_00004.arrow


In [10]:
eval_dataset = split_df['test'].map(
    preprocess_function, batched=True, num_proc=4, remove_columns=["English sentence", "Swahii Translation"]
)

Loading cached processed dataset at /home/starlabs/.cache/huggingface/datasets/csv/default-b6e27a0700ebebe3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-fb0df2806c8661f8_*_of_00004.arrow


#### 1.7 Define the training arguments and create the trainer

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./models/',
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=500,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

#### 1.8 train the model

In [13]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=956, training_loss=0.10757422347447862, metrics={'train_runtime': 14086.3703, 'train_samples_per_second': 1.085, 'train_steps_per_second': 0.068, 'total_flos': 518102609559552.0, 'train_loss': 0.10757422347447862, 'epoch': 2.0})

#### 1.9 evaluate the model on the validation set

In [14]:
result = trainer.evaluate()
print(result)

{'eval_loss': 0.08861713856458664, 'eval_runtime': 240.7144, 'eval_samples_per_second': 3.531, 'eval_steps_per_second': 0.224, 'epoch': 2.0}


#### 2.0 export the trained model

In [15]:
model.save_pretrained("./models")
tokenizer.save_pretrained("./models")

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/vocab.json',
 './models/source.spm',
 './models/target.spm',
 './models/added_tokens.json')

#### 2.1 creating a pipeline for translation

In [16]:
translator = pipeline(
    "text2text-generation",
    model="./models",
    tokenizer="./models",
)


#### 2.2 prompt the user to enter a sentence for translation

In [18]:
while True:
    text = input("Enter an English sentence for translation to Swahili (type 'exit' to quit): ")
    if text == "exit":
        break
    translated_text = translator(text, max_length=max_length)[0]['generated_text']
    print(f"Translated text: {translated_text}")

Enter an English sentence for translation to Swahili (type 'exit' to quit): prints a feedback message before breaking
Translated text: linachapisha ujumbe wa haraka wa mama kabla ya kuvunja
Enter an English sentence for translation to Swahili (type 'exit' to quit): exit
