<a href="https://colab.research.google.com/github/hamnakhan11/hamna/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries if not already installed
!pip install evaluate

import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

def run_training_script():
    """Fine-tune ``bert-base-uncased`` on AG News and save the result.

    The pipeline runs end to end with fixed settings:

    1. Load the ``sh0416/ag_news`` dataset (``train`` and ``test`` splits).
    2. Tokenize ``title + " " + description`` to 128 tokens and shift the
       dataset's 1-based labels to the 0-based ids the model expects.
    3. Fine-tune a four-class BERT classifier for three epochs with the
       Hugging Face ``Trainer``, evaluating on the test split each epoch.
    4. Print the final evaluation metrics (accuracy and weighted F1) and
       write the model and tokenizer to ``./fine-tuned-bert-ag-news``.

    Returns:
        None. Progress is printed; checkpoints go to ``./results`` and the
        final model/tokenizer to ``./fine-tuned-bert-ag-news``.
    """
    # 1. Load the AG News dataset
    print("Loading dataset...")
    dataset = load_dataset("sh0416/ag_news")
    print("Dataset loaded successfully.")
    print(dataset)
    # Example of a single data point
    print("\nExample data point:")
    print(dataset['train'][0])

    # Class names in model-index order. The dataset stores labels as 1..4,
    # so they are shifted down by one during preprocessing and the mappings
    # handed to the model are 0-indexed.
    class_names = ["World", "Sports", "Business", "Sci/Tech"]
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}

    # 2. Tokenize and preprocess the dataset
    print("\nLoading tokenizer and tokenizing dataset...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def tokenize_function(examples):
        """Tokenize one batch and attach 0-indexed ``labels``."""
        # Title and description are classified together as a single text.
        texts = [t + " " + d for t, d in zip(examples['title'], examples['description'])]
        encoded = tokenizer(texts, padding='max_length', truncation=True, max_length=128)
        # The Trainer looks for a column named 'labels'; subtract 1 so the
        # ids line up with the model's 0..3 output indices.
        encoded['labels'] = [label - 1 for label in examples['label']]
        return encoded

    # One pass does tokenization, the label shift, and drops the raw columns
    # that are no longer needed once the text has been encoded.
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["label", "title", "description"],
    )
    tokenized_datasets.set_format("torch")
    print("Dataset tokenized and formatted for PyTorch.")
    print(tokenized_datasets)

    # 3. Fine-tune the BERT model
    print("\nLoading BERT model for sequence classification...")
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(class_names),
        id2label=id2label,
        label2id=label2id,
    )
    print("Model loaded.")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        # save_strategy must match eval_strategy for load_best_model_at_end.
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",  # Disable logging to external services
    )

    # Load both metrics once, up front, so each evaluation pass only has to
    # compute them rather than re-load the metric scripts.
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Return accuracy and weighted F1 for a ``(logits, labels)`` pair."""
        logits, references = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = accuracy_metric.compute(
            predictions=predictions, references=references
        )['accuracy']
        f1 = f1_metric.compute(
            predictions=predictions, references=references, average="weighted"
        )['f1']
        return {"accuracy": accuracy, "f1": f1}

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        compute_metrics=compute_metrics,
    )

    # Train the model
    print("\nStarting model training...")
    trainer.train()
    print("Training complete.")

    # Evaluate the model on the test set
    print("\nEvaluating the model...")
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    # 4. Save the fine-tuned model and tokenizer side by side so the
    # directory can be loaded later with from_pretrained().
    print("\nSaving the fine-tuned model and tokenizer...")
    model.save_pretrained("./fine-tuned-bert-ag-news")
    tokenizer.save_pretrained("./fine-tuned-bert-ag-news")
    print("Model and tokenizer saved to ./fine-tuned-bert-ag-news.")

# Entry point: run the full fine-tuning pipeline only when this code is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    run_training_script()

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Loading dataset...


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/33.7M [00:00<?, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset loaded successfully.
DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 7600
    })
})

Example data point:
{'label': 3, 'title': 'Wall St. Bears Claw Back Into the Black (Reuters)', 'description': "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."}

Loading tokenizer and tokenizing dataset...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset tokenized and formatted for PyTorch.
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

Loading BERT model for sequence classification...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.


Downloading builder script: 0.00B [00:00, ?B/s]


Starting model training...




Epoch,Training Loss,Validation Loss


In [5]:
# Install required libraries
# !pip install gradio transformers torch

import gradio as gr
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Load the fine-tuned model and tokenizer.
# The directory is resolved relative to the current working directory and is
# the one the training script writes with save_pretrained(); if training has
# not finished (or ran from a different directory), from_pretrained() raises
# OSError because the files are missing.
model_path = "fine-tuned-bert-ag-news/" # local directory, not a Hub model id
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Create a Hugging Face pipeline for inference.
# device=0 selects the first CUDA GPU when one is available; -1 runs on CPU.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Define the prediction function for Gradio
def predict_news_topic(text):
    """Classify a news snippet and format the prediction as HTML.

    Args:
        text: Raw text from the Gradio textbox; may be empty or ``None``.

    Returns:
        An HTML fragment containing the predicted label and its confidence
        score, or the plain prompt ``"Please enter some text."`` when there
        is nothing to classify.
    """
    # Treat None, "" and whitespace-only input alike: nothing to classify.
    if not text or not text.strip():
        return "Please enter some text."

    # Truncate so that inputs longer than the model's maximum sequence
    # length are clipped instead of making inference fail.
    result = pipe(text, truncation=True)[0]
    label = result['label']
    score = result['score']

    # The interface renders this value with gr.HTML, so emit HTML tags;
    # Markdown "**" markers would be displayed literally.
    return f"<b>Predicted Topic:</b> {label}<br><b>Confidence Score:</b> {score:.4f}"

# Define the Gradio interface: a five-line textbox feeds predict_news_topic,
# and its return value is rendered by an HTML output component.
iface = gr.Interface(
    fn=predict_news_topic,
    inputs=gr.Textbox(lines=5, label="Enter a news headline or article snippet"),
    outputs=gr.HTML(label="Classification Result"),
    title="AG News Topic Classifier",
    description="Fine-tuned BERT model to classify news headlines into four categories: World, Sports, Business, and Sci/Tech.",
    # Sample inputs offered in the UI; each inner list holds the value for
    # the single textbox input.
    examples=[
        ["Apple is set to release its new iPhone model at a major event next week."],
        ["The World Cup final will be held in Paris this year."],
        ["Stocks fall sharply as inflation concerns grow."],
        ["Scientists discover a new exoplanet with potential for life."],
    ]
)

# Launch the Gradio app only when this code is executed directly, so that
# importing it does not start a server.
if __name__ == "__main__":
    iface.launch()

OSError: Can't load tokenizer for 'fine-tuned-bert-ag-news/'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'fine-tuned-bert-ag-news/' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.