In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw train/test CSVs from the working directory
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Carve a 20% validation hold-out out of the training rows.
# random_state is fixed so the same rows land in each split on every run.
train_data, val_data = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# Persist both parts of the split (the DataFrame index is not written)
train_data.to_csv('new_train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)

# Report the row count of every set, one per line
print(
    f"New training set size: {len(train_data)}",
    f"Validation set size: {len(val_data)}",
    f"Test set size: {len(test_data)}",
    sep="\n",
)

New training set size: 287150
Validation set size: 71788
Test set size: 89735


In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Read the full training CSV from the working directory
train_data = pd.read_csv('train_data.csv')

# Hold out 20% of the rows for validation; random_state pins the shuffle
# so the split is repeatable across runs.
train_data, val_data = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# Custom dataset class
class BookRecommendationDataset(Dataset):
    """Causal-LM dataset that renders every book row as one line of text.

    Each row of ``dataframe`` is formatted as
    ``Title: ... Author: ... Year: ... Publisher: ... Rating: ...`` and is
    tokenized eagerly in ``__init__`` with truncation and padding to
    ``max_length``.

    Args:
        dataframe: pandas DataFrame providing the columns ``Title``,
            ``Author``, ``Year``, ``Publisher`` and ``Rating``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
            that returns a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is truncated / padded to.

    Items returned by ``__getitem__`` are dicts with ``input_ids``,
    ``attention_mask`` and ``labels`` (all ``torch.long`` tensors).
    """

    # Label value used for positions that must not contribute to the loss
    # (-100 is the ignore index of CrossEntropyLoss).
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for _, row in dataframe.iterrows():
            text = f"Title: {row['Title']} Author: {row['Author']} Year: {row['Year']} Publisher: {row['Publisher']} Rating: {row['Rating']}"

            encodings_dict = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids'], dtype=torch.long))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask'], dtype=torch.long))

    def __len__(self):
        """Return the number of tokenized rows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the model inputs and loss targets for row ``idx``.

        ``labels`` is a copy of ``input_ids`` in which every padding position
        (``attention_mask == 0``) is replaced by ``IGNORE_INDEX``. Without
        this, the padded tail of each sequence would be scored as ordinary
        prediction targets.
        """
        input_ids = self.input_ids[idx]
        attention_mask = self.attn_masks[idx]
        # masked_fill returns a new tensor, so the stored input_ids are never
        # modified and labels no longer alias them.
        labels = input_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

# Pre-trained GPT-2 tokenizer; the EOS token is reused as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Model configuration carrying the tokenizer's pad id, then the LM-head model
config = GPT2Config.from_pretrained('gpt2', pad_token_id=tokenizer.pad_token_id)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# Tokenize both splits up front; every sequence is truncated/padded to 128 tokens
train_dataset, val_dataset = (
    BookRecommendationDataset(split_frame, tokenizer, max_length=128)
    for split_frame in (train_data, val_data)
)

# Custom collate function
def collate_fn(batch):
    """Collate dataset items into one right-padded batch.

    Args:
        batch: Sequence of dicts, each holding 1-D tensors under
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
            Assumes the three tensors of an item share the same length.

    Returns:
        Dict with batch-first tensors ``input_ids`` (padded with the
        module-level ``tokenizer.pad_token_id``), ``attention_mask`` (padded
        with 0) and ``labels`` (padded with -100).
    """
    input_ids = pad_sequence(
        [item['input_ids'] for item in batch],
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    attention_mask = pad_sequence(
        [item['attention_mask'] for item in batch],
        batch_first=True,
        padding_value=0,
    )
    # -100 is the ignore index for CrossEntropyLoss
    labels = pad_sequence(
        [item['labels'] for item in batch],
        batch_first=True,
        padding_value=-100,
    )
    # Items may arrive already padded to a fixed length, in which case
    # pad_sequence adds nothing and the padded tail of `labels` still holds
    # real token ids. Mask by the attention mask so padding is always ignored
    # by the loss (no-op for positions that are already -100).
    labels = labels.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Hyper-parameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # checkpoint location; existing contents may be overwritten
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    # optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    # batch sizes per device
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluation / checkpoint cadence, counted in steps
    # NOTE(review): no evaluation strategy is passed here, so eval_steps may
    # have no effect — confirm against the installed transformers version.
    eval_steps=400,
    save_steps=800,
    # only the loss is computed during evaluation/prediction
    prediction_loss_only=True,
)

# Wire model, arguments, both datasets and the batch collator together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Debug aid: show tensor shapes and dtypes for the first three training samples
print("Inspecting the first few samples of the training data:")
for i, sample in zip(range(3), train_dataset):
    print(
        f"Sample {i}:",
        f"Input IDs shape: {sample['input_ids'].shape}",
        f"Input IDs dtype: {sample['input_ids'].dtype}",
        f"Attention mask shape: {sample['attention_mask'].shape}",
        f"Attention mask dtype: {sample['attention_mask'].dtype}",
        f"Labels shape: {sample['labels'].shape}",
        f"Labels dtype: {sample['labels'].dtype}",
        sep="\n",
    )

# Run the fine-tuning loop
trainer.train()

# Write the fine-tuned weights and the tokenizer files to the same directory
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

print("Training completed and model saved.")



Inspecting the first few samples of the training data:
Sample 0:
Input IDs shape: torch.Size([128])
Input IDs dtype: torch.int64
Attention mask shape: torch.Size([128])
Attention mask dtype: torch.int64
Labels shape: torch.Size([128])
Labels dtype: torch.int64
Sample 1:
Input IDs shape: torch.Size([128])
Input IDs dtype: torch.int64
Attention mask shape: torch.Size([128])
Attention mask dtype: torch.int64
Labels shape: torch.Size([128])
Labels dtype: torch.int64
Sample 2:
Input IDs shape: torch.Size([128])
Input IDs dtype: torch.int64
Attention mask shape: torch.Size([128])
Attention mask dtype: torch.int64
Labels shape: torch.Size([128])
Labels dtype: torch.int64


Step,Training Loss


KeyboardInterrupt: 