In [None]:
# ==========================================================
# 1. Gereken Paketleri Yükle
# ==========================================================
!pip install -q transformers datasets evaluate nltk accelerate

import os
import json
import shutil
import numpy as np
import nltk
nltk.download("punkt")

import torch
from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Seq2SeqTrainer
)
import evaluate

from google.colab import drive
drive.mount('/content/drive')


In [None]:
# ==========================================================
# 2. Copy the data file from Google Drive into the Colab VM
# ==========================================================
original_data_path = "/content/drive/My Drive/akış 1000 veri.jsonl"
local_data_path = "/content/akış_1000_veri.jsonl"

# Copy step. Only filesystem failures (OSError and its subclasses) are
# reported as a failed copy; the original error is chained as the cause so
# the real reason (missing file, permissions, ...) stays visible. Any other
# exception type propagates unchanged instead of being relabelled.
try:
    shutil.copy(original_data_path, local_data_path)
except OSError as e:
    raise FileNotFoundError(f"Dosya kopyalanamadı: {e}") from e
else:
    print(f"{original_data_path} başarıyla kopyalandı.")


In [None]:
# ==========================================================
# 3. Load the data set
# ==========================================================
from datasets import Dataset

# Read the JSONL file (one JSON object per line) with pandas
import pandas as pd
df = pd.read_json(local_data_path, lines=True)

# Convert the DataFrame into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# 80% train / 20% test split. The seed is fixed so that re-running the
# notebook produces the same split and evaluation numbers stay comparable
# between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

print(dataset)



In [None]:
# ==========================================================
# 4. Model and tokenizer
# ==========================================================
# Both objects are loaded from the same checkpoint name so that the
# tokenizer vocabulary matches the model.
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


In [None]:
# Inspect which columns are available after the train/test split.
print(dataset.column_names)


In [None]:
# ==========================================================
# 5. Tokenization
# ==========================================================
def tokenize(example):
    """Turn one chat record into encoder inputs and decoder labels.

    Args:
        example: A dataset row whose "messages" field is a list of dicts
            with "role" and "content" keys.

    Returns:
        The tokenizer output for the prompt (system and user turns, one
        "role: content" line each, truncated to 512 tokens) with an extra
        "labels" entry holding the token ids of the last assistant message
        (truncated to 128 tokens; an empty string is used when the record
        has no assistant turn).

    No padding is applied here. Padding the labels to a fixed length with the
    pad token id would make the loss count every padding position as a real
    target; leaving the sequences unpadded lets DataCollatorForSeq2Seq pad
    each batch and fill the label padding with -100, which the loss ignores.
    """
    messages = example["messages"]

    # Prompt: every system/user turn in its original order.
    prompt_lines = [
        f"{msg['role']}: {msg['content']}"
        for msg in messages
        if msg["role"] in ("system", "user")
    ]
    chat_input = "\n".join(prompt_lines).strip()

    # Target: the most recent assistant reply.
    assistant_msg = next(
        (msg["content"] for msg in reversed(messages) if msg["role"] == "assistant"),
        "",
    )

    inputs = tokenizer(
        chat_input,
        max_length=512,
        truncation=True,
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding decoder targets.
    labels = tokenizer(
        text_target=assistant_msg,
        max_length=128,
        truncation=True,
    )

    inputs["labels"] = labels["input_ids"]
    return inputs



In [None]:
# ==========================================================
# 6. Data collator
# ==========================================================
# Batches tokenized examples for the seq2seq trainer; it is given both the
# tokenizer and the model defined earlier in the notebook.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
# Extra packages for the metrics section below.
# NOTE(review): evaluate, nltk and datasets are already installed by the first
# cell; presumably only rouge_score is new here - confirm and consider moving
# it into the first install line.
!pip install rouge_score
!pip install evaluate nltk datasets


In [None]:
# ==========================================================
# 7. Metrics
# ==========================================================
# Load every metric in one pass, in the same order as the names on the left.
rouge, bleu, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_id)
    for metric_id in ("rouge", "bleu", "precision", "recall", "f1")
)

def compute_metrics(eval_preds):
    """Score generated text against the reference answers.

    Args:
        eval_preds: A (predictions, labels) pair as passed by the trainer.
            Predictions may be generated token ids, raw logits, or a tuple
            whose first element is one of those; labels are token ids in
            which ignored positions are marked with -100.

    Returns:
        A dict with "rouge1", "rougeL", "bleu", "precision", "recall" and
        "f1", each expressed as a percentage rounded to two decimals.
        Precision, recall and F1 are micro-averaged over whitespace tokens:
        the number of tokens shared by a prediction and its reference
        (counted with multiplicity) is summed over all examples and divided
        by the total number of predicted / reference tokens.
    """
    from collections import Counter

    preds, labels = eval_preds

    # Some models return a tuple of outputs; the first entry holds the
    # predictions.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # Raw logits (batch, seq, vocab) are reduced to the most likely token id.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks positions to ignore and is not a valid token id, so it is
    # swapped for the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line, as in the original preprocessing for ROUGE.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # BLEU divides by the total prediction/reference length and fails when
    # that length is zero (for example, when every prediction is empty).
    try:
        bleu_score = bleu.compute(
            predictions=decoded_preds,
            references=[[l] for l in decoded_labels],
        )["bleu"]
    except ZeroDivisionError:
        bleu_score = 0.0

    # Token-overlap precision / recall / F1. The classification metrics from
    # `evaluate` expect one class label per example and cannot consume
    # variable-length lists of words, so the overlap is counted directly.
    overlap = pred_total = label_total = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        pred_counts = Counter(pred.split())
        label_counts = Counter(label.split())
        overlap += sum((pred_counts & label_counts).values())
        pred_total += sum(pred_counts.values())
        label_total += sum(label_counts.values())

    precision = overlap / pred_total if pred_total else 0.0
    recall = overlap / label_total if label_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "rouge1": round(rouge_result["rouge1"] * 100, 2),
        "rougeL": round(rouge_result["rougeL"] * 100, 2),
        "bleu": round(bleu_score * 100, 2),
        "precision": round(precision * 100, 2),
        "recall": round(recall * 100, 2),
        "f1": round(f1 * 100, 2),
    }


In [None]:
# ==========================================================
# 8. Training settings
# ==========================================================
# NOTE(review): `training_args` is assigned again (as Seq2SeqTrainingArguments,
# with num_train_epochs=3) in section 9 before the trainer is built, so the
# values in this cell are not the ones the trainer receives.
training_args = TrainingArguments(
    output_dir="./flan-t5-finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    push_to_hub=False,
)


In [None]:
# Apply `tokenize` to each example individually (batched=False), for every
# split of the dataset.
tokenized_dataset = dataset.map(tokenize, batched=False)

In [None]:
# NOTE(review): transformers was already imported and used above; an upgrade at
# this point presumably only takes effect after the runtime is restarted -
# confirm, and consider moving this into the first install cell.
!pip install --upgrade transformers


In [None]:
# ==========================================================
# 9. Trainer and training
# ==========================================================
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    push_to_hub=False,
    # compute_metrics decodes token ids into text, so evaluation has to run
    # generate() rather than hand over teacher-forced logits.
    predict_with_generate=True,
    # Same cap as the 128-token label length used in `tokenize`.
    generation_max_length=128,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # processing_class=tokenizer,  # add if needed, otherwise leave out
)

trainer.train()



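# SECURITY: a Weights & Biases API key was stored on this line in plain text and has been removed.
# Revoke that key, and provide a new one at runtime via the WANDB_API_KEY environment variable or wandb.login().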