In [42]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")

In [65]:
import pandas as pd
import datasets
from sklearn.model_selection import train_test_split

file_name = '../data/processed/summarized_insights.csv'

dataset = datasets.load_dataset("csv", data_files=file_name)


In [66]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["Combined Comments"]]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Summary"], max_length=128, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]



In [67]:
# Split the tokenized dataset into training and temp datasets (70% training, 30% temp)
train_dataset, temp_dataset = tokenized_datasets['train'].train_test_split(test_size=0.3).values()

# Split the temp dataset into evaluation and test datasets (50% evaluation, 50% test)
eval_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

In [68]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)


In [72]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    tokenizer=tokenizer,
    eval_dataset=eval_dataset
)

trainer.train()


  0%|          | 0/675 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 2.750612258911133, 'eval_runtime': 2.3853, 'eval_samples_per_second': 56.596, 'eval_steps_per_second': 14.254, 'epoch': 1.0}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 2.6488096714019775, 'eval_runtime': 2.0044, 'eval_samples_per_second': 67.35, 'eval_steps_per_second': 16.962, 'epoch': 2.0}
{'loss': 3.1384, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


  0%|          | 0/34 [00:00<?, ?it/s]

{'eval_loss': 2.624072790145874, 'eval_runtime': 2.035, 'eval_samples_per_second': 66.34, 'eval_steps_per_second': 16.708, 'epoch': 3.0}
{'train_runtime': 228.8495, 'train_samples_per_second': 11.798, 'train_steps_per_second': 2.95, 'train_loss': 3.094055447048611, 'epoch': 3.0}


TrainOutput(global_step=675, training_loss=3.094055447048611, metrics={'train_runtime': 228.8495, 'train_samples_per_second': 11.798, 'train_steps_per_second': 2.95, 'train_loss': 3.094055447048611, 'epoch': 3.0})

In [79]:
example = eval_dataset[1]
inputs = tokenizer(example["Combined Comments"], return_tensors="pt", padding=True, truncation=True, max_length=512)
outputs = model.generate(input_ids=inputs["input_ids"].to(model.device), attention_mask=inputs["attention_mask"].to(model.device), max_length=128)
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(prediction)




In [74]:
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_metric
import torch

# Evaluate the model
eval_results = trainer.evaluate(eval_dataset=eval_dataset)

# Load metrics
bleu_metric = load_metric('bleu')
rouge_metric = load_metric('rouge')

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    model.eval()
    predictions = []
    references = []
    for example in dataset:
        inputs = tokenizer(example["Combined Comments"], return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model.generate(input_ids=inputs["input_ids"].to(model.device), attention_mask=inputs["attention_mask"].to(model.device), max_length=128)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append(example["Summary"])
    return predictions, references

# Generate predictions
predictions, references = generate_predictions(model, tokenizer, eval_dataset)
print(predictions)

# Calculate BLEU and ROUGE scores
bleu_score = bleu_metric.compute(predictions=[nltk.word_tokenize(pred) for pred in predictions], references=[[nltk.word_tokenize(ref)] for ref in references])
rouge_score = rouge_metric.compute(predictions=predictions, references=references)

# Visualize the results
plt.figure(figsize=(10, 5))
plt.bar(['BLEU', 'ROUGE-L'], [bleu_score['bleu'], rouge_score['rougeL'].mid.fmeasure], color=['blue', 'green'])
plt.xlabel('Metric')
plt.ylabel('Score')
plt.title('Model Performance Metrics')
plt.show()

# Save the model
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


  0%|          | 0/34 [00:00<?, ?it/s]

KeyboardInterrupt: 