In [1]:
# import nltk
# nltk.download('brown')

from nltk.corpus import brown
NUM_CLASSES = len(brown.categories())
print(f"Num_Classes: {NUM_CLASSES}, Classes: {brown.categories()}")

from datasets import Dataset
data_dict = {
    "text": [
        " ".join(sent)
        for file_id in brown.fileids()
        # Corrected line: 'fileid' changed to 'fileids'
        for sent in brown.sents(fileids=file_id)
    ],
    "label": [
        brown.categories(fileids=file_id)[0]
        for file_id in brown.fileids()
        # Corrected line: 'fileid' changed to 'fileids'
        for sent in brown.sents(fileids=file_id)
    ]
}

full_dataset = Dataset.from_dict(data_dict)

from datasets import DatasetDict
train_val_split = full_dataset.train_test_split(test_size=0.1, seed=42)

train_val_ds = train_val_split['train'] # This is 90% of the data
test_ds = train_val_split['test']     # This is 10% of the data

val_split = train_val_ds.train_test_split(test_size=(1/9), seed=42)

train_ds = val_split['train']
val_ds = val_split['test']

final_splits = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

Num_Classes: 15, Classes: ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [3]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = final_splits.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.class_encode_column("label")

label_names = tokenized_dataset["train"].features["label"].names
print(label_names)

Map: 100%|██████████| 45872/45872 [00:07<00:00, 6383.70 examples/s]
Map: 100%|██████████| 5734/5734 [00:00<00:00, 6336.86 examples/s]
Map: 100%|██████████| 5734/5734 [00:00<00:00, 6756.65 examples/s]
Casting to class labels: 100%|██████████| 45872/45872 [00:00<00:00, 300525.78 examples/s]
Casting to class labels: 100%|██████████| 5734/5734 [00:00<00:00, 381717.95 examples/s]
Casting to class labels: 100%|██████████| 5734/5734 [00:00<00:00, 337006.60 examples/s]

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']





In [4]:
print(tokenized_dataset["train"][0])

{'text': 'But exactly how far it will go toward improving conditions is another question because there is so much that needs doing .', 'label': 2, 'input_ids': [101, 2021, 3599, 2129, 2521, 2009, 2097, 2175, 2646, 9229, 3785, 2003, 2178, 3160, 2138, 2045, 2003, 2061, 2172, 2008, 3791, 2725, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [5]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_CLASSES
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch.nn as nn
from transformers import BertModel

class RegisterClassifier(nn.Module):
    def __init__(self, n_classes):
        super(RegisterClassifier, self).__init__()
        # Load the pretrained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Dropout layer for regularization
        self.drop = nn.Dropout(p=0.3)
        
        # Fully-connected layer for classification
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        # Pass inputs through BERT
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Use the pooled output of the token for classification
        pooled_output = outputs.pooler_output
        
        # Apply dropout and the final classification layer
        output = self.drop(pooled_output)
        return self.out(output)

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [8]:
trainer.train()

import torch
# Assuming 'model' is your trained PyTorch model
# and 'output_dir' is the directory to save the model
output_dir = "model_weights"
model_save_path = f"{output_dir}/final_model_weights.pth"
torch.save(model.state_dict(), model_save_path)

Epoch,Training Loss,Validation Loss
1,1.4704,1.345485
2,1.0093,1.223019
3,0.7306,1.264068


In [9]:
from transformers import AutoModelForSequenceClassification
import torch

# Instantiate the same model architecture
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_CLASSES
)

# Load the saved weights
output_dir = "model_weights"
model_save_path = f"{output_dir}/final_model_weights.pth"
model.load_state_dict(torch.load(model_save_path))

# Set the model to evaluation mode
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_save_path))


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
# print("\nTest Set Evaluation Results:")
# for key, value in test_results.items():
#     print(f"{key}: {value:.4f}")

import numpy as np
predictions_output = trainer.predict(tokenized_dataset["test"])
predicted_labels = np.argmax(predictions_output.predictions, axis=1)
true_labels = tokenized_dataset["test"]["label"]

from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=label_names, zero_division=0))


Classification Report:
                 precision    recall  f1-score   support

      adventure       0.49      0.58      0.53       471
 belles_lettres       0.57      0.63      0.60       722
      editorial       0.41      0.38      0.39       298
        fiction       0.54      0.46      0.49       476
     government       0.72      0.75      0.73       303
        hobbies       0.76      0.71      0.73       387
          humor       0.34      0.23      0.28        94
        learned       0.79      0.76      0.78       727
           lore       0.55      0.59      0.57       504
        mystery       0.57      0.54      0.55       408
           news       0.67      0.66      0.67       449
       religion       0.68      0.59      0.63       182
        reviews       0.62      0.52      0.57       181
        romance       0.44      0.52      0.48       440
science_fiction       0.69      0.41      0.52        92

       accuracy                           0.60      5734
     

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertModel
from datasets import Dataset, DatasetDict
from nltk.corpus import brown
from sklearn.metrics import classification_report
import numpy as np
import os

# --- 2. Tokenization ---
print("\nTokenizing the dataset...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LENGTH = 128

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

tokenized_dataset = final_splits.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.class_encode_column("label")
label_names = tokenized_dataset["train"].features["label"].names

# Prepare dataset for PyTorch
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# --- 3. Hyperparameters and Setup ---
HIDDEN_DIM = 256
NUM_EPOCHS = 3 # Fewer epochs needed as BERT features are very strong
BATCH_SIZE = 32 # Smaller batch size for BERT
LEARNING_RATE = 0.001
DROPOUT_PROB = 0.5

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Load Pre-trained BERT model for embedding generation
print("Loading pre-trained BERT model for embeddings...")
bert_embedder = BertModel.from_pretrained('bert-base-uncased')
# Freeze BERT parameters to use it only as a feature extractor
for param in bert_embedder.parameters():
    param.requires_grad = False
bert_embedder.to(device)

BERT_HIDDEN_SIZE = bert_embedder.config.hidden_size

# DataLoaders
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(tokenized_dataset["validation"], batch_size=BATCH_SIZE)
test_dataloader = DataLoader(tokenized_dataset["test"], batch_size=BATCH_SIZE)

# --- 4. Model Definitions with BERT Embeddings ---

# Simple Neural Network (NN) with BERT Embeddings
class SimpleNN(nn.Module):
    def __init__(self, bert_embedder, hidden_dim, output_dim, dropout_prob):
        super(SimpleNN, self).__init__()
        self.bert_embedder = bert_embedder
        self.fc1 = nn.Linear(BERT_HIDDEN_SIZE, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert_embedder(input_ids=input_ids, attention_mask=attention_mask)
            embedded = bert_output.last_hidden_state
        pooled = embedded.mean(dim=1)
        hidden = self.relu(self.fc1(pooled))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

# Convolutional Neural Network (CNN) with BERT Embeddings
class TextCNN(nn.Module):
    def __init__(self, bert_embedder, n_filters, filter_sizes, output_dim, dropout_prob):
        super().__init__()
        self.bert_embedder = bert_embedder
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=BERT_HIDDEN_SIZE, out_channels=n_filters, kernel_size=fs)
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert_embedder(input_ids=input_ids, attention_mask=attention_mask)
            embedded = bert_output.last_hidden_state.permute(0, 2, 1)
        
        conved = [torch.relu(conv(embedded)) for conv in self.convs]
        pooled = [torch.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

# Bidirectional LSTM with BERT Embeddings
class TextLSTM(nn.Module):
    def __init__(self, bert_embedder, hidden_dim, output_dim, n_layers, bidirectional, dropout_prob):
        super().__init__()
        self.bert_embedder = bert_embedder
        self.lstm = nn.LSTM(
            BERT_HIDDEN_SIZE,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout_prob if n_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert_embedder(input_ids=input_ids, attention_mask=attention_mask)
            embedded = bert_output.last_hidden_state

        lstm_out, (hidden, cell) = self.lstm(embedded)
        
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
            
        return self.fc(hidden)

# --- 5. Training and Evaluation Functions ---
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        predictions = model(input_ids, attention_mask)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            predictions = model(input_ids, attention_mask)
            loss = criterion(predictions, labels)
            total_loss += loss.item()
            
            _, predicted_labels = torch.max(predictions, dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / len(dataloader.dataset)
    return avg_loss, accuracy

def test_model(model, dataloader, device, label_names):
    model.eval()
    predictions_list, true_labels_list = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            predictions = model(input_ids, attention_mask)
            _, predicted_labels = torch.max(predictions, dim=1)
            
            predictions_list.extend(predicted_labels.cpu().numpy())
            true_labels_list.extend(labels.cpu().numpy())
            
    print("\nClassification Report:")
    print(classification_report(true_labels_list, predictions_list, target_names=label_names, zero_division=0))

# --- 6. Run Training and Testing ---
def run_experiment(model, model_name):
    print(f"\n----- Training {model_name} -----")
    model.to(device)
    # Only train the parameters of the classification head, not the BERT embedder
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(NUM_EPOCHS):
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)
        print(f'Epoch {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc*100:.2f}%')

    print(f"\n----- Testing {model_name} -----")
    test_model(model, test_dataloader, device, label_names)
    print(f"----- End of {model_name} Experiment -----")

# Run for Simple NN
nn_model = SimpleNN(bert_embedder, HIDDEN_DIM, NUM_CLASSES, DROPOUT_PROB)
run_experiment(nn_model, "Simple Neural Network (NN) with BERT Embeddings")

# Run for Text CNN
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
cnn_model = TextCNN(bert_embedder, N_FILTERS, FILTER_SIZES, NUM_CLASSES, DROPOUT_PROB)
run_experiment(cnn_model, "Text CNN with BERT Embeddings")

# Run for Bidirectional LSTM
LSTM_LAYERS = 2
lstm_model = TextLSTM(bert_embedder, HIDDEN_DIM, NUM_CLASSES, LSTM_LAYERS, True, DROPOUT_PROB)
run_experiment(lstm_model, "Bidirectional LSTM with BERT Embeddings")


Tokenizing the dataset...


Map: 100%|██████████| 45872/45872 [00:02<00:00, 17884.16 examples/s]
Map: 100%|██████████| 5734/5734 [00:00<00:00, 19397.07 examples/s]
Map: 100%|██████████| 5734/5734 [00:00<00:00, 14442.62 examples/s]
Casting to class labels: 100%|██████████| 45872/45872 [00:00<00:00, 475783.44 examples/s]
Casting to class labels: 100%|██████████| 5734/5734 [00:00<00:00, 457279.14 examples/s]
Casting to class labels: 100%|██████████| 5734/5734 [00:00<00:00, 476816.34 examples/s]



Using device: cuda
Loading pre-trained BERT model for embeddings...

----- Training Simple Neural Network (NN) with BERT Embeddings -----
Epoch 01 | Train Loss: 1.828 | Val. Loss: 1.630 | Val. Acc: 43.91%
Epoch 02 | Train Loss: 1.692 | Val. Loss: 1.594 | Val. Acc: 45.17%
Epoch 03 | Train Loss: 1.644 | Val. Loss: 1.550 | Val. Acc: 47.04%

----- Testing Simple Neural Network (NN) with BERT Embeddings -----

Classification Report:
                 precision    recall  f1-score   support

      adventure       0.40      0.32      0.36       471
 belles_lettres       0.40      0.65      0.50       722
      editorial       0.36      0.29      0.32       298
        fiction       0.35      0.15      0.21       476
     government       0.59      0.55      0.57       303
        hobbies       0.58      0.59      0.58       387
          humor       0.25      0.01      0.02        94
        learned       0.68      0.65      0.66       727
           lore       0.50      0.26      0.35       

In [12]:
# --- 7. Save Model Weights ---
print("\n----- Saving Final Model Weights -----")
output_dir = "model_weights"
os.makedirs(output_dir, exist_ok=True)

# Define paths for the weights
nn_save_path = os.path.join(output_dir, "simple_nn_weights.pth")
cnn_save_path = os.path.join(output_dir, "text_cnn_weights.pth")
lstm_save_path = os.path.join(output_dir, "bidirectional_lstm_weights.pth")

# Save the state dictionaries
torch.save(nn_model.state_dict(), nn_save_path)
print(f"Simple NN weights saved to {nn_save_path}")

torch.save(cnn_model.state_dict(), cnn_save_path)
print(f"Text CNN weights saved to {cnn_save_path}")

torch.save(lstm_model.state_dict(), lstm_save_path)
print(f"Bidirectional LSTM weights saved to {lstm_save_path}")


----- Saving Final Model Weights -----
Simple NN weights saved to model_weights\simple_nn_weights.pth
Text CNN weights saved to model_weights\text_cnn_weights.pth
Bidirectional LSTM weights saved to model_weights\bidirectional_lstm_weights.pth
