<a href="https://colab.research.google.com/github/ibrahimt0140/Final-Project-Generative-AI/blob/main/%C4%B0brahim_Ta%C5%9Fk%C4%B1n_210208963.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets sentencepiece --quiet

from transformers import MarianMTModel, MarianTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import torch

# Model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Small data set (only 3 examples)
data = [
    {"en": "The patient has a fever.", "de": "Der Patient hat Fieber."},
    {"en": "Take this medicine twice a day.", "de": "Nehmen Sie dieses Medikament zweimal täglich."},
    {"en": "You need to rest.", "de": "Sie müssen sich ausruhen."}
]
dataset = Dataset.from_list(data)

# Tokenizer
def tokenize(batch):
    inputs = tokenizer(batch['en'], padding="max_length", truncation=True, max_length=64)
    targets = tokenizer(batch['de'], padding="max_length", truncation=True, max_length=64)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(tokenize, batched=True)

# Training options
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=1,
    report_to="none"
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Sample before training
def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

print("Before fine-tuning:")
print("English: I have a math exam tomorrow.")
print("German:", translate("I have a math exam tomorrow."))

# Training
trainer.train()

print("\nAfter fine-tuning:")
print("English: I have a math exam tomorrow.")
print("German:", translate("I have a math exam tomorrow."))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Before fine-tuning:
English: I have a math exam tomorrow.


model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

German: Ich habe morgen eine Matheprüfung.


Step,Training Loss
1,8.462
2,6.342





After fine-tuning:
English: I have a math exam tomorrow.
German: Ich habe morgen eine Matheprüfung.
