In [42]:
import numpy as np
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load your dataset
file_name = '../data/processed/summarized_insights.csv'
df = pd.read_csv(file_name)

# Add special tokens in front of 'Combined Comments' and 'Summary'
df['training_example'] = "<comments> " + df['Combined Comments'] + " <summary> " + df['Summary'] + " </summary>"

# Split the dataset into train, validate, and test sets
train, validate, test = np.split(df.sample(frac=1, random_state=42),
                                 [int(.8*len(df)), int(.9*len(df))])

# Save split datasets to files (if necessary)
train['training_example'].to_csv('train.txt', index=False, header=False)
validate['training_example'].to_csv('validate.txt', index=False, header=False)
test['training_example'].to_csv('test.txt', index=False, header=False)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Ensure tokenizer adds the special tokens
special_tokens_dict = {'additional_special_tokens': ['<comments>', '<summary>', '</summary>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

# Create a TextDataset for each dataset
train_dataset = TextDataset(tokenizer=tokenizer, file_path='train.txt', block_size=128)
valid_dataset = TextDataset(tokenizer=tokenizer, file_path='validate.txt', block_size=128)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    save_steps=1000,
    warmup_steps=500,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('./model')





  0%|          | 0/1083 [00:00<?, ?it/s]

{'loss': 12.2133, 'learning_rate': 5e-05, 'epoch': 1.39}
{'loss': 3.0446, 'learning_rate': 7.11835334476844e-06, 'epoch': 2.77}
{'train_runtime': 311.2221, 'train_samples_per_second': 13.9, 'train_steps_per_second': 3.48, 'train_loss': 7.267366728073783, 'epoch': 3.0}


In [49]:
data = []
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                             
# Looping over the first 10 examples in the test set
for i in range(10):
    example = test.iloc[i]
    # Splitting the example by "<comments>"
    split_example = example["Combined Comments"].split("<comments>")
    comment_part = split_example[1] if len(split_example) > 1 else example["Combined Comments"]

    # Tokenizing the comment part
    inputs = tokenizer(comment_part, padding=True, truncation=True, return_tensors="pt", max_length=512)
    outputs = model.generate(input_ids=inputs["input_ids"].to(model.device), attention_mask=inputs["attention_mask"].to(model.device), max_length=128)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    actual_summary = example.get("Summary", "No summary provided")

    # Adding the data to the container
    if len(prediction) > len(example["Combined Comments"]) + 1:
        data.append([example["Combined Comments"], prediction[len(example["Combined Comments"]):], actual_summary])

# Define the headers
headers = ["Example", "Generated Prediction", "Actual Summary"]

# Print the table with text wrapping
from tabulate import tabulate
print(tabulate(data, headers=headers, tablefmt="grid", maxcolwidths=[50, 50, 50]))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


+----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+
| Example                                            | Generated Prediction                               | Actual Summary                                     |
| They change prices on you all the time. The food   | - Address the quality of the food, as it was       | - Monitor and stabilize menu pricing to avoid      |
| is not bat at all, but the new owner is all about  | described as ""not good"" and ""not good."" -      | frequent changes that can upset customers. -       |
| money. Not going back.                             | Review the pricing strategy, as it was described   | Ensure food quality remains consistently high to   |
|                                                    | as ""not good."" - Consider the customer's         | maintain a positive reputation. - Review           |
|                                 