<a href="https://colab.research.google.com/github/hselino/complaint_analysis/blob/main/summ_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nbformat

from google.colab import drive

# The notebook file lives on Google Drive, so Drive must be mounted before it
# can be opened. Mounting an already-mounted Drive is a no-op, which keeps this
# cell runnable both on a fresh kernel and after the dedicated mount cell.
drive.mount('/content/drive')

path = "/content/drive/MyDrive/Colab Notebooks/summ_fine_tune.ipynb"  # replace with your file

with open(path, "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

# Remove invalid widgets metadata (nothing happens when the key is absent).
nb["metadata"].pop("widgets", None)

# Optionally, remove cell outputs too: every cell that carries run state gets
# it blanked. The tuple is rebuilt per cell, so each cell receives its own
# fresh empty list rather than sharing one.
for cell in nb.cells:
    for field, blank in (("outputs", []), ("execution_count", None)):
        if field in cell:
            cell[field] = blank

# Overwrite the original file in place with the cleaned notebook.
with open(path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print("✅ Notebook cleaned and ready for GitHub.")


In [None]:
!pip install transformers[torch] datasets evaluate accelerate rouge_score nltk -q

In [None]:
from google.colab import drive
# Attach Google Drive at /content/drive; the CSV and the saved checkpoint used
# by later cells are read from and written to paths under this mount point.
drive.mount("/content/drive")

In [None]:
from datasets import Dataset
import pandas as pd

# Load the summarisation pairs and discard every row with a missing value.
df = pd.read_csv('/content/drive/MyDrive/prj/segmented_summ.csv')
df = df.dropna()

# dropna() leaves gaps in the pandas index; preserve_index=False stops that
# index from being carried into the Dataset as an extra column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# A fixed seed makes the 90/10 split -- and therefore every score computed on
# the held-out part -- reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub identifier of the pretrained checkpoint that fine-tuning starts from.
model_checkpoint = "google/mt5-base"

# Model and tokenizer are both resolved from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Token budgets: source documents are cut at 512 tokens, summaries at 128.
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize one batch of documents together with their reference summaries.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            the ``"content"`` column is read as the source text and the
            ``"abstract"`` column as the target summary.

    Returns:
        The tokenizer output for the sources, with an added ``"labels"`` entry
        holding the token ids of the truncated summaries.
    """
    model_inputs = tokenizer(
        examples["content"], max_length=max_input_length, truncation=True
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. It is a separate call
    # because targets use a different length limit than the inputs.
    labels = tokenizer(
        text_target=examples["abstract"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits; the original text columns are kept alongside the ids.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [None]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer's evaluation loop. ``predictions`` may also be
            a tuple whose first element holds the generated ids.

    Returns:
        Dict mapping each ROUGE metric name to its score scaled to 0-100 and
        rounded to two decimals.
    """
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used to pad label (and gathered prediction)
    # arrays. It is not a valid vocabulary id, so decoding it fails; swap it
    # for the pad token, which skip_special_tokens then drops.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v * 100, 2) for k, v in result.items()}

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Same per-device batch size for training and evaluation.
batch_size = 8

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    # --- output / logging ---
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # --- schedule ---
    num_train_epochs=10,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,
)

# Pads each batch dynamically; the model is passed along to the collator.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# NOTE(review): no `compute_metrics` callback is passed here, so the ROUGE
# function defined in this notebook is not invoked by the per-epoch evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Fine-tune, then write the model weights and the tokenizer files into the same
# Drive directory so both can be reloaded from one path.
trainer.train()

summarizer_dir = "/content/drive/MyDrive/prj/fine_tuned_model/summarizer"
trainer.save_model(summarizer_dir)
tokenizer.save_pretrained(summarizer_dir)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned summarizer (weights and tokenizer) from Drive, replacing
# the in-memory `model` / `tokenizer` objects.
checkpoint_dir = "/content/drive/MyDrive/prj/fine_tuned_model/summarizer"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

In [None]:
import evaluate

import torch

rouge = evaluate.load("rouge")

# Take a few examples from test set (at most 10, fewer if the split is smaller).
test_split = tokenized_datasets["test"]
examples = test_split.select(range(min(10, len(test_split))))

predictions = []
references = []

model.eval()
for example in examples:
    # Feed the already-tokenized (and already-truncated) ids directly instead
    # of decoding them to text and tokenizing again, and place them on the
    # same device as the model.
    input_ids = torch.tensor([example["input_ids"]], device=model.device)

    # Without an explicit length, generate() falls back to a very short
    # default and cuts summaries off long before the reference length; use the
    # same budget the labels were tokenized with, plus the beam-search settings
    # used by generate_summary.
    output = model.generate(
        input_ids,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )

    predictions.append(tokenizer.decode(output[0], skip_special_tokens=True))
    references.append(tokenizer.decode(example["labels"], skip_special_tokens=True))

results = rouge.compute(predictions=predictions, references=references)
print(results)


In [None]:
# Echo the scores returned by rouge.compute in the evaluation cell as this cell's output.
results

In [None]:
# Text file (relative to the working directory) holding the document to summarize.
file_path = "asd.txt"

# Read the whole document into memory as UTF-8 text.
with open(file_path, "r", encoding="utf-8") as source:
    input_text = source.read()

# Preview first 500 chars
preview = input_text[:500]
print("Original Text:\n", preview)

In [None]:
def generate_summary(text, max_input_length=512, max_summary_length=128, num_beams=4):
    """Summarize ``text`` with the currently loaded model and tokenizer.

    Args:
        text: Source document to summarize.
        max_input_length: Token limit for the source; longer input is truncated.
        max_summary_length: Maximum length (in tokens) of the generated summary.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded summary string of the first generated sequence, with
        special tokens removed.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_input_length,
    )
    # Keep the input tensors on the same device as the model; otherwise
    # generate() fails when the model sits on a GPU.
    inputs = inputs.to(model.device)

    output = model.generate(
        **inputs,
        max_length=max_summary_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("Summary:", generate_summary(input_text))