In [1]:
import numpy as np
import pandas as pd

# Load your dataset
file_name = '../data/processed/summarized_insights.csv'
df = pd.read_csv(file_name)

# Assuming your CSV has columns 'Combined Comments' for input and 'Summary' for output
# Concatenate them with a special separator (e.g., "<|summary|>")
df['training_example'] = df['Combined Comments'] + " <|summary|> " + df['Summary']

# Optionally, shuffle the dataframe
df = df.sample(frac=1).reset_index(drop=True)

# Split the dataset (e.g., 80% train, 10% validation, 10% test)
train, validate, test = np.split(df.sample(frac=1, random_state=42), 
                                 [int(.8*len(df)), int(.9*len(df))])

# Save to text files
train['training_example'].to_csv('train.txt', index=False, header=False)
validate['training_example'].to_csv('validate.txt', index=False, header=False)
test['training_example'].to_csv('test.txt', index=False, header=False)



In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Assuming you have 'train.txt', 'validation.txt', and 'test.txt' files with preprocessed text
train_path = 'train.txt'
validation_path = 'validate.txt'

# Create text datasets
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128
)

validation_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=validation_path,
    block_size=128
)

# Data collator used for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./gpt2_finetuned',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model('./gpt2_finetuned')



  0%|          | 0/1074 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

{'eval_loss': 3.100850820541382, 'eval_runtime': 2.7504, 'eval_samples_per_second': 70.899, 'eval_steps_per_second': 17.816, 'epoch': 1.0}
{'loss': 3.2369, 'learning_rate': 2.672253258845438e-05, 'epoch': 1.4}


  0%|          | 0/49 [00:00<?, ?it/s]

{'eval_loss': 3.058077096939087, 'eval_runtime': 3.0534, 'eval_samples_per_second': 63.864, 'eval_steps_per_second': 16.048, 'epoch': 2.0}
{'loss': 2.8733, 'learning_rate': 3.445065176908752e-06, 'epoch': 2.79}


  0%|          | 0/49 [00:00<?, ?it/s]

{'eval_loss': 3.056661367416382, 'eval_runtime': 2.7142, 'eval_samples_per_second': 71.844, 'eval_steps_per_second': 18.053, 'epoch': 3.0}
{'train_runtime': 311.6287, 'train_samples_per_second': 13.776, 'train_steps_per_second': 3.446, 'train_loss': 3.0371584901152597, 'epoch': 3.0}


In [8]:
# Load the fine-tuned model and tokenizer
model_path = './gpt2_finetuned'  # Adjust the path to your fine-tuned model
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

# Function to generate summary
def generate_summary(text, max_length=512):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt")
    summary_ids = model.generate(inputs, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Path to the test file
test_file_path = 'test.txt'

# Read a few examples from the test set
num_examples = 3
with open(test_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

full_text, actual_summary = lines[0].split("<|summary|>")
predicted_summary = generate_summary(full_text.strip())

print(f"Example {i+1}")
print("Full Text:", full_text.strip())
print("Actual Summary:", actual_summary.strip())
print("Predicted Summary:", predicted_summary)
print("\n" + "-"*50 + "\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example 2
Full Text: "Was a huge fan of their Pinto location on Christopher Street location, which had way more Thai focus and less fusion. This new spot really confuses me. As a loyal patron of the former now-closed location, I wanted to like it, and yet this is a place where somehow, everything had something off and wrong with it, and you sit there wondering what the heck went wrong. From the noodle dishes to the crab in a coconut, everything felt over-salted, under, not fresh. The best part was actually scraping the coconut out of the coconut shell - that was delicious! But I could have gotten that elsewhere. Disappointing :/
Actual Summary: - The customer misses the Thai-focused menu and less fusion approach of the old Pinto location on Christopher Street.
Predicted Summary: summarize: "Was a huge fan of their Pinto location on Christopher Street location, which had way more Thai focus and less fusion. This new spot really confuses me. As a loyal patron of the former now-closed loc