## AG News Dataset

The AG News dataset contains news articles from the AG corpus, categorized into four classes: World, Sports, Business, and Science/Technology. Each article is represented as a text string, and the corresponding label indicates the category to which the article belongs.

This dataset is often used for text classification tasks, where the goal is to predict the category of a given news article based on its content.


In [None]:
!pip install datasets
!pip install evaluate
!pip install transformers[torch]



In [None]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from datasets import load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import evaluate
import torch


In [None]:
def create_subset(dataset_dict, num_samples):
    """Return the first ``num_samples`` rows of every split in ``dataset_dict``.

    Args:
        dataset_dict: Mapping of split name to a dataset object that supports
            ``len()`` and ``select(indices)``.
        num_samples: Maximum number of leading rows to keep per split.

    Returns:
        A plain ``dict`` mapping each split name to its truncated dataset.
        A split that holds fewer than ``num_samples`` rows is returned at its
        full length instead of failing on out-of-range indices.
    """
    return {
        # Clamp per split: splits usually differ in size, so one global
        # num_samples may exceed the smaller ones.
        split: data.select(range(min(num_samples, len(data))))
        for split, data in dataset_dict.items()
    }

def tokenize(batch):
    """Tokenize the ``"text"`` column of ``batch`` with the module-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` entry. The entry may be a plain
            sequence of strings or an object exposing ``to_list()`` (for
            example a pandas Series from a pandas-formatted dataset).

    Returns:
        Whatever ``tokenizer(...)`` returns for the texts, called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = batch["text"]
    # Only pandas-style columns need converting; a plain list has no
    # ``to_list`` and is passed through unchanged.
    if hasattr(texts, "to_list"):
        texts = texts.to_list()
    return tokenizer(texts, padding=True, truncation=True)

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        ``{"accuracy": ..., "f1": ...}`` as computed by scikit-learn's
        ``accuracy_score`` and ``f1_score(average="weighted")``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget in the notebook.
# Presumably required because the training arguments further down set
# push_to_hub=True — confirm if Hub upload is actually wanted.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# SECURITY: a Hugging Face access token used to be pasted here in plain text.
# It has been removed. Treat that token as compromised: revoke it in the
# Hugging Face account settings and authenticate through notebook_login()
# or the HF_TOKEN environment variable instead of committing credentials.

# Download (or load from cache) the AG News dataset with its predefined splits.
ag_news_dataset = load_dataset("ag_news")

# Materialize the full training split as a DataFrame for quick inspection.
df = pd.DataFrame(ag_news_dataset["train"][:])
# Last expression of the cell: displays the distinct label ids.
df.label.unique()

array([2, 3, 1, 0])

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


# Hold out 10% of the original AG News training split for validation.
# random_state is fixed so the split is reproducible between runs.
# NOTE(review): no `stratify=` argument is passed, so the class balance of
# the two parts is left to chance.
train_data = ag_news_dataset["train"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data["text"], train_data["label"], test_size=0.1, random_state=42
)

# Create Datasets for train and validation
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
validation_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# Combine train and validation datasets into a DatasetDict
dataset_dict = DatasetDict({"train": train_dataset, "validation": validation_dataset})

In [None]:
# Position (within each class) of the example article to show.
num = 2

# Print one example article per class; headings are listed in label-id
# order (0..3) and separated from the previous article by a blank line.
for class_id, heading in enumerate(['world', 'sports', 'business', 'science']):
    if class_id > 0:
        print()
    print(heading, '*'*10)
    print()
    print(df[df.label == class_id].iloc[num]['text'])

world **********

Palestinians in Israeli Jails Start Hunger Strike (Reuters) Reuters - Thousands of Palestinian\prisoners in Israeli jails began a hunger strike for better\conditions Sunday, but Israel's security minister said he\didn't care if they starved to death.

sports **********

Dreaming done, NBA stars awaken to harsh Olympic reality (AFP) AFP - National Basketball Association players trying to win a fourth consecutive Olympic gold medal for the United States have gotten the wake-up call that the "Dream Team" days are done even if supporters have not.

business **********

Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.

science **********

AOL to Sell Cheap PCs to Minorities and Seniors (Reuters) Reuters - America Online on Thursday said it\plans to sell a low-priced PC targeting low-income an

In [None]:
# ag_news_dataset = load_dataset("ag_news")
# Handle on the raw training split (not referenced again in the visible cells).
train_ds = ag_news_dataset["train"]

# Switch ag_news_dataset to pandas output format. Later cells call
# `.to_list()` on columns taken from it, which relies on this setting.
ag_news_dataset.set_format(type="pandas")
# NOTE(review): this `df` is overwritten a few lines below by the
# special-token table, so the assignment has no lasting effect.
df = ag_news_dataset["train"][:]

# Checkpoint whose tokenizer (and, further down, model) is used.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)



#hide_input
# Build a small table of the tokenizer's special tokens, sorted by token id.
tokens2ids = list(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))
data = sorted(tokens2ids, key=lambda x : x[-1])
df = pd.DataFrame(data, columns=["Special Token", "Special Token ID"])
# NOTE(review): a bare expression is only rendered by the notebook when it is
# the last statement of a cell; here more statements follow.
df.T


# Labels of the first 100 examples of each split.
# train/validation come from the split built with train_test_split;
# test comes straight from ag_news_dataset (pandas-formatted at this point).
train_labels = dataset_dict['train']['label'][:100]
validation_labels = dataset_dict['validation']['label'][:100]
test_labels = ag_news_dataset['test']['label'][:100]

# The matching first 100 rows of each split (text and label together).
train_data = dataset_dict['train'][:100]
validation_data = dataset_dict['validation'][:100]
test_data = ag_news_dataset['test'][:100]


# ag_news_dataset_encoded = ag_news_dataset.map(tokenize, batched=True, batch_size=5568)



In [None]:
# Tokenize the 100-example slice of each split in a single tokenizer call,
# requesting padding and truncation.
# train_data / validation_data are sliced from dataset_dict and their "text"
# entry is passed as-is; test_data is sliced from ag_news_dataset after
# set_format(type="pandas"), hence the extra `.to_list()`.
train_encoded = tokenizer(train_data['text'], padding=True, truncation=True)
validation_encoded = tokenizer(validation_data['text'], padding=True, truncation=True)
test_encoded = tokenizer(test_data['text'].to_list(), padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Wrap each tokenizer output in a Dataset and attach the matching labels
# as a 'label' column, one split at a time.
train_encoded = Dataset.from_dict(train_encoded).add_column('label', train_labels)
validation_encoded = Dataset.from_dict(validation_encoded).add_column(
    'label', validation_labels
)
test_encoded = Dataset.from_dict(test_encoded).add_column('label', test_labels)


In [None]:
# Rebind dataset_dict to the three encoded splits, keyed by split name.
dataset_dict = DatasetDict(
    train=train_encoded,
    validation=validation_encoded,
    test=test_encoded,
)

In [None]:

# ag_news_dataset_encoded['train'] = ag_news_dataset_encoded['train'].add_column('label',train_labels)
# ag_news_dataset_encoded['validation'] = ag_news_dataset_encoded['validation'].add_column('label',validation_labels)
# ag_news_dataset_encoded['test'] = ag_news_dataset_encoded['test'].add_column('label',test_labels)


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# hide_output
# Class id <-> class name mappings stored in the model config.
id2label = {
    "0": "World",
    "1": "Sports",
    "2": "Business",
    "3": "Science/Technology",
}

label2id = {
    "World": 0,
    "Sports": 1,
    "Business": 2,
    "Science/Technology": 3,
}
# Derived from the mapping so the two cannot drift apart.
num_labels = len(id2label)

# Load the same checkpoint the tokenizer was created from (model_ckpt) with a
# fresh classification head sized for the four AG News classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

# NOTE(review): compute_metrics uses scikit-learn directly; this metric object
# is not referenced in the visible cells.
metric = evaluate.load("accuracy")

batch_size = 64
# NOTE(review): with 100 training examples and batch_size=64 a 2-epoch run has
# only 4 optimizer steps, so a logging interval of 10 never fires (the
# training-loss column shows "No log").
logging_steps = 10
# Output directory / Hub repository name. The suffix names the dataset this
# notebook fine-tunes on (AG News); the earlier "-finetuned-emotion" suffix
# was a leftover from a different task and would have mislabeled the model.
model_name = f"{model_ckpt}-finetuned-ag-news"
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)



# subset_size = 100
# subset_dataset_dict = create_subset(dataset_dict, subset_size)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer
# Wire the model, the training configuration, the metric callback and the
# encoded train/validation subsets into a Trainer.
# NOTE(review): newer transformers releases replace the `tokenizer=` argument
# with `processing_class=` — confirm against the installed version.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_encoded,
                  eval_dataset=validation_encoded,
                  tokenizer=tokenizer)



In [None]:

# Fine-tune the model with the configuration given to the Trainer.
trainer.train()

# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded on its own (model weights alone are not enough for inference).
model.save_pretrained("fine_tuned_bert")
tokenizer.save_pretrained("fine_tuned_bert")

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.380291,0.33,0.243798
2,No log,1.374324,0.36,0.285795


In [None]:
# Run the fine-tuned model on the encoded test subset; the result object
# carries the metrics displayed in the next cell.
predicted_output = trainer.predict(test_encoded)
#predicted_output

In [None]:
# Display the test-set metrics (loss, accuracy, F1 and runtime statistics).
predicted_output.metrics

{'test_loss': 1.3601614236831665,
 'test_accuracy': 0.42,
 'test_f1': 0.35972849018528713,
 'test_runtime': 48.5723,
 'test_samples_per_second': 2.059,
 'test_steps_per_second': 0.041}