In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install evaluate
!pip install transformers
!pip install datasets
!pip install wandb

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import joblib
import evaluate
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [None]:
# Load the `evaluate` accuracy metric once at module level; compute_metrics() reads this
# global every time it is called.
accuracy_metric = evaluate.load("accuracy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

For RNN Model

In [None]:
# Tokenization function
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level ``tokenizer``.

    Truncation and padding are enabled and the maximum length is 512.
    The global ``tokenizer`` must already be bound when this is called
    (``fine_tune_model`` assigns it before mapping this function over a dataset).
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 512}
    texts = example["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Define PyTorch Dataset class
class TextDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes one text sample per integer index.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose: the bare
    name ``Dataset`` in this notebook is imported from Hugging Face ``datasets``.
    Subclassing that class made ``DataLoader`` pass a whole list of indices to
    ``__getitem__``, which failed on ``self.labels[idx]`` with
    ``TypeError: list indices must be integers or slices, not list``.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of shape ``(1, max_len)``.
        max_len: Length every sample is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of 1-D tensors plus its label."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            # squeeze(0) removes only the leading batch axis added by return_tensors='pt';
            # a bare squeeze() would also collapse the sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [None]:
# Define RNN Model
class RNNClassifier(nn.Module):
    """LSTM text classifier: token embedding -> LSTM -> linear head.

    The class logits are computed from the last layer's final hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Map a batch of token-id sequences ``x`` to per-class logits."""
        embedded = self.embedding(x)
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state h_n is needed.
        _, (h_n, _) = self.lstm(embedded)
        final_hidden = h_n[-1]
        return self.fc(final_hidden)

In [None]:
# Train RNN model
def train_rnn_model(train_dataset, test_dataset, vocab_size, batch_size=32, epochs=5,
                    num_classes=4, device=None):
    """Train an ``RNNClassifier`` and evaluate it on the test set.

    Args:
        train_dataset: Dataset whose items are dicts with ``'input_ids'`` and ``'label'``.
        test_dataset: Dataset with the same item layout, used for the final evaluation.
        vocab_size: Size of the embedding table (number of token ids).
        batch_size: Batch size for both loaders.
        epochs: Number of passes over the training data.
        num_classes: Number of output classes (defaults to 4, the previous fixed value).
        device: Device to train on (string or ``torch.device``). When ``None``, CUDA is
            used if available, otherwise the CPU.

    Returns:
        Tuple ``(model, avg_eval_loss, accuracy)``. The model is returned on the CPU so
        that saving it does not tie the artifact to a GPU machine.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    model = RNNClassifier(vocab_size, embed_dim=128, hidden_dim=256, num_classes=num_classes)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            # Batches are produced on the CPU; move them to wherever the model lives.
            input_ids = batch['input_ids'].to(device)
            targets = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}: Avg Train Loss = {avg_train_loss}")

    # Evaluation on Test Data
    model.eval()
    total_eval_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            targets = batch['label'].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, targets)
            total_eval_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = targets.cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels)

    avg_eval_loss = total_eval_loss / len(test_loader)
    accuracy = accuracy_score(true_labels, predictions)

    print(f"Evaluation Loss: {avg_eval_loss}, Accuracy: {accuracy:.4f}")

    # Hand the model back on the CPU, matching what callers received before GPU support.
    model.to("cpu")

    return model, avg_eval_loss, accuracy

Transfer Learning Model

In [None]:
# Function to compute accuracy
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into ``{"accuracy": value}``.

    The predicted class is the arg-max over the last axis of the logits; the score is
    computed by the module-level ``accuracy_metric``.
    """
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    result = accuracy_metric.compute(predictions=predicted_ids, references=references)
    return dict(accuracy=result["accuracy"])

In [None]:
# Fine-tuning function
def fine_tune_model(model_name, train_dataset, test_dataset):
    """Fine-tune a pretrained checkpoint for 4-class sequence classification and evaluate it.

    Args:
        model_name: Hugging Face Hub checkpoint id, e.g. ``"bert-base-uncased"``.
        train_dataset: pandas DataFrame with a ``"text"`` column plus the label column.
        test_dataset: pandas DataFrame with the same columns, used as the evaluation set.

    Returns:
        Tuple ``(model, eval_loss, eval_accuracy)`` taken from ``trainer.evaluate()``
        after training finishes.

    Side effects:
        Rebinds the module-level ``tokenizer`` (``tokenize_function`` reads it) and
        writes checkpoints under the Drive ``results/<model_name>`` directory.
    """
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Convert datasets to Hugging Face format
    train_dataset = Dataset.from_pandas(train_dataset)
    test_dataset = Dataset.from_pandas(test_dataset)

    # Tokenize datasets
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)
    tokenized_train = tokenized_train.remove_columns(["text"])
    tokenized_test = tokenized_test.remove_columns(["text"])
    tokenized_train.set_format("torch")
    tokenized_test.set_format("torch")

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir= f"/content/drive/MyDrive/Guvi Project/News_Classification/results/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f"./logs/{model_name}",
        logging_steps=10,
        # Disable W&B (and every other reporting integration) the supported way; the
        # WANDB_DISABLED environment variable is deprecated.
        report_to="none",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics
    )

    # Train model
    trainer.train()

    # Evaluate model
    eval_results = trainer.evaluate()
    print(f"Evaluation Results for {model_name}:", eval_results)

    return model, eval_results["eval_loss"], eval_results["eval_accuracy"]

In [None]:
# Saving the model and Finding the best model
def save_and_find_model(name, model, eval_loss, acc):
    """Persist ``model`` under ``name`` and update the global ``best`` record.

    Args:
        name: Short model name; used for the file name ``<name>.pkl`` and in ``best``.
        model: Trained model object to serialize with ``joblib``.
        eval_loss: Evaluation loss of the model.
        acc: Evaluation accuracy of the model.

    The file is written into the module-level ``model_save_dir``. ``best`` is replaced
    only when the model has BOTH a lower loss and a higher accuracy than the current
    best entry.
    """

    # Saving the Model
    # Use the `name` argument, not the notebook-global loop variable `model_name`,
    # which is undefined (or stale) when this is called outside the fine-tuning loop.
    model_filename = os.path.join(model_save_dir, f"{name}.pkl")
    joblib.dump(model, model_filename)
    print(f"Saved {name} model to {model_filename}")

    # Finding the best model
    if eval_loss < best["best_loss"] and acc > best["best_accuracy"]:
        best["best_loss"] = eval_loss
        best["best_model"] = model
        best["best_model_name"] = name
        best["best_accuracy"] = acc

The train and test datasets were pre-processed and each split into two parts in the AG_News_data_preparation file. They are:
1. Train_part_1
2. Train_part_2
3. Test_part_1
4. Test_part_2

In [None]:
# Load dataset (part 1 of the pre-split train / test CSV files on Drive)
train_path = "/content/drive/MyDrive/Guvi Project/News_Classification/Dataset/train_part_1.csv"
test_path = "/content/drive/MyDrive/Guvi Project/News_Classification/Dataset/test_part_1.csv"

# Read train first, then test, into pandas DataFrames.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Checkpoints to fine-tune: short display name -> Hugging Face Hub checkpoint id
model_ckpts = dict(
    BERT="bert-base-uncased",
    RoBERTa="roberta-base",
)

In [None]:
# Dictionary to store best model details (starts "empty": no model, infinite loss, zero accuracy)
best = dict(
    best_model_name=None,
    best_model=None,
    best_loss=float("inf"),
    best_accuracy=0,
)

In [None]:
# Model file save location
model_save_dir = "/content/drive/MyDrive/Guvi Project/News_Classification/Model"

# Best model save location
best_model_dir = "/content/drive/MyDrive/Guvi Project/News_Classification/"

# Make sure both locations exist; exist_ok avoids an error on re-runs.
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(best_model_dir, exist_ok=True)

In [None]:
# Train RNN Model
# The BERT tokenizer is reused here only to turn text into token ids and to size the embedding table.
rnn_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = TextDataset(
    texts=train_data["text"].tolist(),
    labels=train_data["label"].tolist(),
    tokenizer=rnn_tokenizer,
)
test_dataset = TextDataset(
    texts=test_data["text"].tolist(),
    labels=test_data["label"].tolist(),
    tokenizer=rnn_tokenizer,
)
rnn_model, eval_loss, accuracy = train_rnn_model(
    train_dataset,
    test_dataset,
    vocab_size=rnn_tokenizer.vocab_size,
)

# Save the RNN model
save_and_find_model(name="RNN", model=rnn_model, eval_loss=eval_loss, acc=accuracy)

<class 'str'>


TypeError: list indices must be integers or slices, not list

In [None]:
!pip install wandb



In [None]:
# Train and evaluate multiple models
for model_name, model_ckpt in model_ckpts.items():
    print(f"Training model: {model_name}")
    model, eval_loss, model_acc = fine_tune_model(model_ckpt, train_data, test_data)
    save_and_find_model(name=model_name, model=model, eval_loss=eval_loss, acc=model_acc)

# Best Model
# `best` keeps its initial None entries when no model was ever selected (e.g. nothing was
# trained, or every loss was NaN); in that case there is nothing meaningful to pickle.
if best["best_model"] is None:
    print("No best model was selected; nothing was saved.")
else:
    best_model_filename = os.path.join(best_model_dir, f"best_model_{best['best_model_name']}.pkl")
    joblib.dump(best['best_model'], best_model_filename)
    print(f"Best model {best['best_model_name']} was saved in {best_model_filename}")

Training model: BERT


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1771,0.199386,0.933684
2,0.0624,0.1871,0.943684
3,0.063,0.225704,0.944737


Evaluation Results for bert-base-uncased: {'eval_loss': 0.2257043719291687, 'eval_accuracy': 0.9447368421052632, 'eval_runtime': 41.1639, 'eval_samples_per_second': 92.314, 'eval_steps_per_second': 5.782, 'epoch': 3.0}
Saved BERT model to /content/drive/MyDrive/Guvi Project/News_Classification/Model/BERT.pkl
Best model BERT was saved in /content/drive/MyDrive/Guvi Project/News_Classification/best_model_BERT.pkl


In [None]:
# Show only the summary fields; echoing the model object itself prints its entire architecture.
{key: value for key, value in best.items() if key != "best_model"}

{'best_model_name': 'BERT',
 'best_model': BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768,

In [None]:
# Load the model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only load
# artifacts that this notebook itself produced.
model = joblib.load("/content/drive/MyDrive/Guvi Project/News_Classification/best_model_BERT.pkl")

# Ensure the model is in evaluation mode.
# The trailing ';' stops the cell from echoing the whole model architecture.
model.eval();


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Load the tokenizer (use the same one as in training)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Read one news text from the user
new_texts = input("Enter the News: ")

# Tokenize the new data
encoded_inputs = tokenizer(
    new_texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Pick the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# The input tensors must live on the same device as the model
encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}

# Perform inference (no gradients needed)
with torch.no_grad():
    outputs = model(**encoded_inputs)

# Get predicted class labels: index of the largest logit for each input
predicted_labels = outputs.logits.argmax(dim=1)

# Position i in this list is the display name for class id i
class_labels = ["World", "Sports", "Business", "Sci/Tech"]
predicted_class = class_labels[predicted_labels.item()]
print(f"Predicted Class: {predicted_class}")


Enter the News: Delta Air Lines Prepares Chapter 11 Filing Delta Air Lines Inc. could file for Chapter 11 bankruptcy protection as soon as next week, a source familiar with the matter said yesterday.
Predicted Class: Business
