In [None]:
import sys
# Install/upgrade huggingface_hub and fsspec with the pip module of the interpreter
# running this kernel (sys.executable), not whichever `pip` is first on PATH.
!"{sys.executable}" -m pip install -U huggingface_hub fsspec




In [None]:
from datasets import load_dataset

# NOTE(review): the following cell reassigns `ds` from load_dataset("ag_news"),
# so this "sh0416/ag_news" copy looks unused — confirm whether this cell is still needed.
ds = load_dataset("sh0416/ag_news")

In [None]:
from datasets import load_dataset, DatasetDict

# AG News comes as a DatasetDict with only 'train' and 'test' splits
ds = load_dataset("ag_news")

# Carve a label-stratified 10% dev split out of the original training set;
# the official test split is passed through unchanged.
split = ds["train"].train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="label",
)
ds = DatasetDict(
    {
        "train": split["train"],
        "dev": split["test"],
        "test": ds["test"],
    }
)

# Class names as stored on the 'label' feature of the training split
label_names = ds["train"].features["label"].names
print("Labels:", label_names)
print(ds)


Labels: ['World', 'Sports', 'Business', 'Sci/Tech']
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 108000
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 12000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})



*   Load AG News DONE
*   Create train/dev/test splits (dev carved from train) DONE
* Implement preprocessing (tokenization, normalization) and document it.
*   Train two classical models
    - TF-IDF + Logistic Regression
    - TF-IDF + Linear SVM
*   Report accuracy + Macro-F1 + confusion matrix
* Collect >=20 misclassified examples from test AND categorise them into 3-5 error types.






# Pre-processing!

In [None]:
import re
import unicodedata
from typing import List

# Any run of whitespace characters (spaces, tabs, newlines, ...)
WHITESPACE_RE = re.compile(r"\s+")
# A run of lowercase ASCII letters/digits, optionally followed by one
# apostrophe-joined run, so that "don't" stays a single token
TOKEN_RE = re.compile(r"[a-z0-9]+(?:'[a-z0-9]+)?")


def normalize_text(text: str) -> str:
    """
    Canonicalise raw text before tokenization.

    Steps, in order: Unicode NFKC normalization, lowercasing, collapsing every
    whitespace run into a single space, and trimming leading/trailing whitespace.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    lowered = unicodedata.normalize("NFKC", text).lower()
    collapsed = WHITESPACE_RE.sub(" ", lowered)
    return collapsed.strip()


def tokenize_words(text: str) -> List[str]:
    """
    Split text into word tokens using TOKEN_RE.

    Only lowercase ASCII letters and digits are matched (plus one inner
    apostrophe), so the input is expected to have gone through normalize_text
    first; every other character acts as a separator.

    Args:
        text: Normalized input string.

    Returns:
        The tokens in order of appearance (empty list if nothing matches).
    """
    return [match.group(0) for match in TOKEN_RE.finditer(text)]


# **TRAINING THE TWO CLASSICAL MODELS AND EVALUATION**

In [None]:
"""
TF-IDF baselines for AG News
1) TF-IDF + Logistic Regression
2) TF-IDF + Linear SVM (LinearSVC)

This cell covers:
- loading AG News
- creating train/dev/test splits (dev carved from train)
- building + fitting the two sklearn pipelines
- evaluating both models (accuracy, macro-F1, confusion matrix)
- printing misclassified test examples for error analysis
"""

from __future__ import annotations

from typing import Dict, Tuple, List

from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np


def load_and_split_ag_news(
    dev_size: float = 0.1,
    seed: int = 42,
) -> Tuple[List[str], List[int], List[str], List[int], List[str], List[int]]:
    """
    Fetch AG News and return texts and labels for the train, dev and test splits.

    The Hugging Face dataset only ships 'train' and 'test', so the dev split is
    carved out of 'train' with a label-stratified split; 'test' is passed
    through unchanged.

    Args:
        dev_size: Share of the original training set that becomes the dev split.
        seed: Seed forwarded to the split so it is reproducible.

    Returns:
        X_train, y_train, X_dev, y_dev, X_test, y_test
        Each X_* is a list of the 'text' column and each y_* is the matching
        list of the 'label' column for that split.
    """
    raw = load_dataset("ag_news")
    train_dev = raw["train"].train_test_split(
        test_size=dev_size,
        seed=seed,
        stratify_by_column="label",
    )

    # The order here fixes the order of the returned (X, y) pairs.
    partitions = (train_dev["train"], train_dev["test"], raw["test"])

    # scikit-learn is fed plain Python lists, so materialise each column.
    flattened = []
    for part in partitions:
        flattened.append(list(part["text"]))
        flattened.append(list(part["label"]))

    return tuple(flattened)


def _build_tfidf_vectorizer() -> TfidfVectorizer:
    """
    Build the word-level TF-IDF vectorizer shared by both baselines.

    Keeping the configuration in one place guarantees that the Logistic
    Regression and Linear SVM pipelines see identically built features.

    Returns:
        An unfitted TfidfVectorizer.
    """
    return TfidfVectorizer(
        analyzer="word",
        preprocessor=normalize_text,
        tokenizer=tokenize_words,
        token_pattern=None,  # not used because a custom tokenizer is supplied
        lowercase=False,  # normalize_text already lowercases
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=2,  # drop terms seen in fewer than 2 documents
        max_df=0.9,  # drop terms seen in more than 90% of documents
        sublinear_tf=True,
    )


def build_tfidf_logreg_pipeline(seed: int = 42) -> Pipeline:
    """
    Build a word-level TF-IDF + Logistic Regression pipeline.

    Args:
        seed: Random seed for the classifier. The 'saga' solver shuffles the
            data, so fixing the seed makes training reproducible.

    Returns:
        Unfitted sklearn Pipeline ready to .fit(X, y) on raw text inputs.
    """
    return Pipeline(
        steps=[
            # TF-IDF converts raw text -> sparse numeric feature vectors
            ("tfidf", _build_tfidf_vectorizer()),
            # Logistic regression classifier on top of TF-IDF features
            (
                "clf",
                LogisticRegression(
                    solver="saga",  # handles sparse features well
                    max_iter=2000,  # increase to ensure convergence
                    n_jobs=-1,  # use all CPU cores when possible
                    random_state=seed,
                ),
            ),
        ]
    )


def build_tfidf_linear_svm_pipeline(seed: int = 42) -> Pipeline:
    """
    Build a word-level TF-IDF + Linear SVM (LinearSVC) pipeline.

    Args:
        seed: Random seed for the classifier, so that repeated runs give the
            same model when the solver shuffles the data.

    Returns:
        Unfitted sklearn Pipeline ready to .fit(X, y) on raw text inputs.
    """
    return Pipeline(
        steps=[
            ("tfidf", _build_tfidf_vectorizer()),
            # Linear SVM is a strong baseline for text classification with TF-IDF
            ("clf", LinearSVC(C=1.0, random_state=seed)),
        ]
    )


def train_two_baselines(
    dev_size: float = 0.1,
    seed: int = 42,
) -> Dict[str, Pipeline]:
    """
    Load AG News, split it, and fit both baseline pipelines on the train split.

    Only the train split is used for fitting; the dev and test splits are
    produced by the loader but ignored here, and nothing is evaluated.

    Args:
        dev_size: Fraction of the original train set held out as dev.
        seed: Random seed forwarded to the split.

    Returns:
        A dict of fitted pipelines keyed by model name:
            {
              "tfidf_logreg": fitted Pipeline,
              "tfidf_linear_svm": fitted Pipeline
            }
    """
    # Keep the first two returned values (train texts and labels); discard the rest.
    X_train, y_train, *_ = load_and_split_ag_news(dev_size=dev_size, seed=seed)

    # Build both pipelines first, then fit them in insertion order.
    fitted = {
        "tfidf_logreg": build_tfidf_logreg_pipeline(),
        "tfidf_linear_svm": build_tfidf_linear_svm_pipeline(),
    }
    for pipeline in fitted.values():
        pipeline.fit(X_train, y_train)

    return fitted


def model_evaluation(model: Pipeline, x: List[str], y: List[int]) -> None:
    """
    Evaluate a fitted model and print its accuracy, macro-F1 and confusion matrix.

    Args:
        model (Pipeline): fitted model exposing ``predict``.
        x (List[str]): raw input text samples.
        y (List[int]): ground-truth integer class labels aligned with ``x``
            (the label lists returned by ``load_and_split_ag_news``).

    Returns:
        None. All results are printed.
    """
    # Predict the labels for every input sample
    y_pred = model.predict(x)

    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    # Macro averaging weights every class equally, regardless of its size
    macro_f1 = f1_score(y, y_pred, average="macro")
    print(f"Macro F1: {macro_f1:.4f}")

    # Rows are true classes, columns are predicted classes
    conf_matrix = confusion_matrix(y, y_pred)
    print("Confusion Matrix")
    print(conf_matrix)


def misclassified_text(
    model: Pipeline,
    x: List[str],
    y: List[int],
    label_names: List[str],
    n_examples: int = 20,
) -> None:
    """
    Print the total number of misclassified samples and the first few of them.

    Args:
        model (Pipeline): fitted model exposing ``predict``.
        x (List[str]): raw input text samples.
        y (List[int]): ground-truth integer class labels aligned with ``x``.
        label_names (List[str]): class names, indexed by integer label.
        n_examples (int): maximum number of misclassified examples to print.

    Raises:
        ValueError: if ``x`` and ``y`` differ in length. Without this check
            ``zip`` would silently drop the unmatched tail.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )

    y_pred = model.predict(x)

    # Keep every (text, true, predicted) triple where the prediction is wrong
    misclassified = [
        (text, true_label, pred_label)
        for text, true_label, pred_label in zip(x, y, y_pred)
        if true_label != pred_label
    ]

    print(f"Total misclassified text: {len(misclassified)}")

    if not misclassified:
        print("No misclassifications found")
        return

    for rank, (text, true_label, pred_label) in enumerate(
        misclassified[:n_examples], start=1
    ):
        print(rank)
        print("TRUE:", label_names[true_label])
        print("PRED:", label_names[pred_label])

        # Only the first 300 characters are shown to keep the output readable
        print("TEXT:", text[:300])

        print("-" * 70)


if __name__ == "__main__":
    models = train_two_baselines(dev_size=0.1, seed=42)

    # Sanity check: both fitted vectorizers must use the custom preprocessing functions
    vec_lr = models["tfidf_logreg"].named_steps["tfidf"]
    vec_svm = models["tfidf_linear_svm"].named_steps["tfidf"]
    for vec_tag, vectorizer in (("LR", vec_lr), ("SVM", vec_svm)):
        print(f"{vec_tag} preprocessor:", vectorizer.preprocessor == normalize_text)
        print(f"{vec_tag} tokenizer:", vectorizer.tokenizer == tokenize_words)

    # train_two_baselines only returns the fitted pipelines, so the splits are
    # rebuilt here with the same dev_size and seed.
    x_train, y_train, x_dev, y_dev, x_test, y_test = load_and_split_ag_news(
        dev_size=0.1,
        seed=42,
    )

    # Pipeline key -> name used in the printed headings (logistic regression first)
    display_names = {
        "tfidf_logreg": "Logistic Regression",
        "tfidf_linear_svm": "Linear SVM",
    }

    # Accuracy, macro-F1 and confusion matrix of each model on the test split
    for model_key, display_name in display_names.items():
        print(f"Evaluation of {display_name} Model")
        model_evaluation(models[model_key], x_test, y_test)

    label_names = load_dataset("ag_news")["train"].features["label"].names

    # Up to 20 misclassified test examples per model, for error analysis
    for model_key, display_name in display_names.items():
        print(f"Misclassified {display_name} examples")
        misclassified_text(
            models[model_key],
            x_test,
            y_test,
            label_names,
            n_examples=20,
        )


LR preprocessor: True
LR tokenizer: True
SVM preprocessor: True
SVM tokenizer: True
Evaluation of Logistic Regression Model
Accuracy: 0.9170
Macro F1: 0.9168
Confusion Matrix
[[1720   55   74   51]
 [  12 1860   14   14]
 [  51   18 1675  156]
 [  44   22  120 1714]]
Evaluation of Linear SVM Model
Accuracy: 0.9259
Macro F1: 0.9257
Confusion Matrix
[[1736   52   66   46]
 [  11 1873   11    5]
 [  50   15 1697  138]
 [  47   16  106 1731]]
Misclassified Logistic Regression examples
Total misclassified text: 631
1
TRUE: Sci/Tech
PRED: Sports
TEXT: Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and
----------------------------------------------------------------------
2
TRUE: Sci/Tech
PRED: Business
TEXT: IBM to hire even more new workers By t