In [None]:
#downloading neccessarily libraries
!pip install transformers[torch] accelerate -U
!pip install transformers datasets torch accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86

In [None]:
# Import the necessary libraries
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Module-level tokenizer and 4-label BERT classifier, both created from the
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

In [None]:
# Load the dataset and apply tokenization on a subset
def load_and_preprocess_data(subset_ratio=0.1):
    """Load AG News, keep a random fraction of each split, and tokenize it for BERT.

    Args:
        subset_ratio: Fraction of the 'train' and 'test' splits to keep.
            Must satisfy 0 < subset_ratio <= 1.

    Returns:
        A DatasetDict with 'train' and 'test' splits containing only the
        sampled rows, after `tokenize_function` has been mapped over them.

    Raises:
        ValueError: If subset_ratio is outside the interval (0, 1].
    """
    # Validate first so a bad ratio fails before any download starts.
    if not 0 < subset_ratio <= 1:
        raise ValueError(f"subset_ratio must be in (0, 1], got {subset_ratio!r}")

    # Local import keeps this function independent of the notebook's import cell.
    from datasets import DatasetDict

    dataset = load_dataset('ag_news')
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def sample_split(split):
        # Fixed seed so the same rows are sampled on every run.
        keep = int(split.num_rows * subset_ratio)
        return split.shuffle(seed=42).select(range(keep))

    reduced_datasets = DatasetDict({
        'train': sample_split(dataset['train']),
        'test': sample_split(dataset['test']),
    })

    def tokenize_function(examples):
        # Tokenize the text to be suitable for BERT (padded/truncated to 128 tokens)
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

    # Tokenize only the sampled splits. Previously the full dataset was mapped
    # and the sampled splits were discarded, so subset_ratio had no effect.
    return reduced_datasets.map(tokenize_function, batched=True)




In [None]:
# Load the model
def load_model():
    """Return a new 'bert-base-uncased' sequence classifier with 4 output labels."""
    return BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=4,
    )

In [None]:
from transformers import Trainer, TrainingArguments

def train_model(tokenized_datasets, model, train_size=1000, eval_size=500):
    """Fine-tune `model` on a slice of the tokenized data and return the Trainer.

    Args:
        tokenized_datasets: Mapping with 'train' and 'test' splits; each split
            must provide `.num_rows` and `.select(...)`.
        model: Model passed to `Trainer`.
        train_size: Maximum number of leading 'train' examples to train on.
        eval_size: Maximum number of leading 'test' examples to evaluate on.

    Returns:
        The `Trainer` instance after `train()` has finished.
    """
    train_split = tokenized_datasets['train']
    test_split = tokenized_datasets['test']

    # Clamp the requested sizes so a split smaller than the request does not
    # make `select` fail with an out-of-range index.
    train_subset = train_split.select(range(min(train_size, train_split.num_rows)))
    test_subset = test_split.select(range(min(eval_size, test_split.num_rows)))

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        # A fixed 500-step warmup is longer than the whole default run
        # (1000 examples / batch 8 * 3 epochs = 375 steps on a single device),
        # so the learning rate never left warmup. A ratio scales with the run.
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir='./logs',
        # Log the training loss once per epoch; the default step interval was
        # never reached in a run this short ("No log" in the results table).
        logging_strategy='epoch',
        # NOTE(review): newer transformers releases appear to rename this
        # argument to `eval_strategy` -- confirm against the installed version.
        evaluation_strategy='epoch'
    )

    def compute_accuracy(eval_pred):
        # Fraction of examples whose arg-max prediction equals the label.
        predicted = np.argmax(eval_pred.predictions, axis=1)
        return {"accuracy": (predicted == eval_pred.label_ids).mean()}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_subset,
        eval_dataset=test_subset,
        compute_metrics=compute_accuracy
    )

    trainer.train()
    return trainer



In [None]:
def save_model(model, tokenizer, save_path='AG_news_bertsentiment_model'):
    """Write the model, then the tokenizer, to `save_path` via `save_pretrained`."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)

In [None]:
# End-to-end run: tokenize the data, build a fresh classifier, fine-tune it,
# then evaluate and print the resulting metrics dict.
tokenized_datasets = load_and_preprocess_data()
# Rebinds the module-level `model` created in the earlier cell.
model = load_model()
trainer = train_model(tokenized_datasets, model)
# evaluate() is called without arguments; the Trainer was built with an
# eval_dataset inside train_model().
eval_results = trainer.evaluate()
print(eval_results)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.048827,0.544
2,No log,0.412282,0.858
3,No log,0.675446,0.826


{'eval_loss': 0.6754461526870728, 'eval_accuracy': 0.826, 'eval_runtime': 219.6369, 'eval_samples_per_second': 2.276, 'eval_steps_per_second': 0.287, 'epoch': 3.0}


In [None]:
save_model(model, tokenizer)  # Save the fine-tuned model and the module-level tokenizer to the default save_path

In [None]:
import torch

# Reload the saved model and tokenizer from disk. The directory name matches
# the default `save_path` of save_model(), resolved relative to the working
# directory. This rebinds the module-level `model` and `tokenizer`.
model_path = './AG_news_bertsentiment_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Prepare the model for inference (set to evaluation mode)
model.eval()

# Load only the 'test' split of AG News
dataset = load_dataset('ag_news', split='test')

# Function to prepare input data for the model
def prepare_data(texts, tokenizer):
    """Tokenize `texts` and return the tokenizer's output as PyTorch tensors.

    Uses the same 128-token truncation limit as training, but `padding=True`
    instead of the `'max_length'` padding applied when the training data was
    tokenized.
    """
    return tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )

# Function to perform inference
def predict(model, inputs):
    """Run `model` on `inputs` without gradient tracking and return, for each
    row of the output logits, the index of the largest value."""
    with torch.no_grad():
        logits = model(**inputs).logits
        return torch.argmax(logits, dim=1)

# Take the first 10 test examples for a quick demonstration and pull out
# their raw texts.
sample_data = dataset.select(range(10))
texts = sample_data['text']

# Tokenize the sampled texts and run them through the reloaded model.
inputs = prepare_data(texts, tokenizer)
predictions = predict(model, inputs)

# AG News label ids -> category names
label_map = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))

# Show every sentence next to its true and predicted category.
print("Inference Results:")
for text, label, pred in zip(texts, sample_data['label'], predictions):
    print(f"Sentence: '{text}'")
    print(f"True Label: {label_map[label]} - Predicted Category: {label_map[pred.item()]}")
    print()

Inference Results:
Sentence: 'Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.'
True Label: Business - Predicted Category: Business

Sentence: 'The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.'
True Label: Sci/Tech - Predicted Category: Sci/Tech

Sentence: 'Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.'
True Label: Sci/Tech - Predicted Category: Sci/Tech

Sentence: 'Prediction Unit Helps Forecast