In [12]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import os
import json

In [13]:
class BookDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable, sized collection of texts. Each item is passed
            through ``str()`` before tokenization.
        labels: Indexable, sized collection of class indices aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its
            result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length every item is padded / truncated to. Defaults to
            512, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail early: a silent mismatch would otherwise surface as an
        # IndexError mid-epoch or as samples that are never visited.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label at position ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer tensors flattened to 1-D) and ``'labels'`` (a
            ``torch.long`` tensor built from the stored label).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # flatten() drops the leading batch axis added by return_tensors='pt'
            # so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [14]:
# Run configuration: everything a reader might want to tune lives here.
output_dir = 'models'
dataset_path = 'book_genre_prediction.csv'
epochs = 10
seed = 42

# Seed the torch and numpy global RNGs so that classifier-head initialisation
# and DataLoader shuffling repeat across a fresh Restart & Run All.
# NOTE(review): some GPU kernels may still be nondeterministic.
torch.manual_seed(seed)
np.random.seed(seed)

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Define genres; the position of each name in this list is its class index.
genres = [
    'fantasy', 'science', 'crime', 'history',
    'horror', 'thriller', 'psychology', 'romance',
    'sports', 'travel'
]

# Save genres list next to the model so the index -> name mapping can be
# recovered later. The encoding is pinned so the file does not depend on the
# platform default.
with open(os.path.join(output_dir, 'genres.json'), 'w', encoding='utf-8') as f:
    json.dump(genres, f)

# Pick the compute device and load the tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
# Load and prepare data
print("Loading dataset...")
df = pd.read_csv(dataset_path)
print(df['genre'].unique())

# Map genre names to class indices; names missing from `genres` become NaN.
genre_to_idx = {genre: idx for idx, genre in enumerate(genres)}
mapped_labels = df['genre'].map(genre_to_idx)

unknown_genre = mapped_labels.isna()
if unknown_genre.any():
    print("Found NaN labels! These rows will be dropped.")
    print(df[unknown_genre])

# A missing summary would otherwise reach the tokenizer as the literal string
# 'nan' (BookDataset calls str() on every text), so drop those rows as well.
missing_summary = df['summary'].isna()
if missing_summary.any():
    print(f"Dropping {int(missing_summary.sum())} row(s) with a missing summary.")

keep = ~(unknown_genre | missing_summary)
texts = df.loc[keep, 'summary'].values
# After filtering no NaN remains, so the labels can be stored as real integers
# instead of the float64 array that Series.map produces when any value is unmapped.
labels = mapped_labels[keep].astype(np.int64).values

# Split data (fixed random_state keeps the split identical between runs)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Create datasets
train_dataset = BookDataset(train_texts, train_labels, tokenizer)
val_dataset = BookDataset(val_texts, val_labels, tokenizer)

# Create dataloaders; only the training set is shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Loading dataset...
['fantasy' 'science' 'crime' 'history' 'horror' 'thriller' 'psychology'
 'romance' 'sports' 'travel']


In [16]:
# Build the sequence classifier with one output label per genre,
# then place it on the selected device.
print("Initializing model...")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(genres)
)
model = model.to(device)

# Optimisation setup: AdamW plus a linear schedule with zero warmup steps,
# sized to cover one scheduler step per training batch across all planned epochs.
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

Initializing model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Training loop with early stopping on the validation loss
print("Starting training...")
patience = 2  # non-improving epochs tolerated before stopping
epochs_no_improve = 0
best_val_loss = float('inf')
best_model_dir = os.path.join(output_dir, 'book_analyzer_model')
checkpoint_saved = False


def _forward_batch(batch):
    """Move one batch to `device` and run it through `model`.

    Shared by the training and validation passes. Keeping the tensors local to
    this function also stops the loop from rebinding notebook-level names such
    as `labels` (the full label array built when the data was loaded).

    Args:
        batch: Mapping with 'input_ids', 'attention_mask' and 'labels' tensors.

    Returns:
        The model output; its `.loss` attribute is what the loops consume.
    """
    return model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device),
    )


for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    for batch in train_loader:
        loss = _forward_batch(batch).loss
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}')

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            val_loss += _forward_batch(batch).loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Validation Loss: {avg_val_loss:.4f}')

    # Early stopping check: checkpoint only when the validation loss improves.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        print("Saving best model...")
        model.save_pretrained(best_model_dir)
        tokenizer.save_pretrained(best_model_dir)
        checkpoint_saved = True
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epoch(s).")

    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

print("Training completed!")
# NOTE: the checkpoint on disk is the best epoch; the in-memory `model` holds
# the weights from the last epoch that ran.
if checkpoint_saved:
    print(f"Model saved to {best_model_dir}")
else:
    # Reached when the validation loss never compared below +inf (e.g. NaN)
    # or when `epochs` is 0; claiming a saved model would be misleading.
    print("No checkpoint was saved: validation loss never improved.")

Starting training...
Epoch 1/10, Average Loss: 1.3625
Validation Loss: 0.8873
Saving best model...
Epoch 2/10, Average Loss: 0.6793
Validation Loss: 0.7746
Saving best model...
Epoch 3/10, Average Loss: 0.4322
Validation Loss: 0.8211
No improvement for 1 epoch(s).
Epoch 4/10, Average Loss: 0.2714
Validation Loss: 0.8398
No improvement for 2 epoch(s).
Early stopping triggered after 4 epochs.
Training completed!
Model saved to models/book_analyzer_model
