In [9]:
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv


In [10]:
import pandas as pd


def load_agnews_split(csv_path):
    """Read one AG News CSV and prepare it for tokenization.

    The file is read without a header row; its three columns are named
    ``label``, ``title`` and ``description``.

    Args:
        csv_path: Path to the CSV file to read.

    Returns:
        A DataFrame with the three original columns plus ``text``
        (``"<title>. <description>"``, each part stripped of surrounding
        whitespace). ``label`` is shifted down by one so that labels stored
        as 1-4 become 0-3.
    """
    frame = pd.read_csv(csv_path, header=None, names=["label", "title", "description"])

    # A missing title/description would turn the whole concatenation into NaN,
    # which the tokenizer cannot handle; treat missing pieces as empty strings.
    title = frame["title"].fillna("").str.strip()
    description = frame["description"].fillna("").str.strip()
    frame["text"] = title + ". " + description

    # Shift labels from 1-4 to 0-3 (zero-based class ids).
    frame["label"] = frame["label"] - 1
    return frame


# Load & merge title+description for both splits with the same preparation.
train_df = load_agnews_split("train.csv")
test_df = load_agnews_split("test.csv")
print("Labels now range:", train_df["label"].min(), "to", train_df["label"].max())


Labels now range: 0 to 3


In [11]:
from datasets import Dataset
from transformers import DistilBertTokenizerFast

# Build one Hugging Face Dataset per split, carrying over only the "text"
# and "label" columns of each DataFrame.
hf_train, hf_test = (
    Dataset.from_pandas(frame[["text", "label"]])
    for frame in (train_df, test_df)
)

# Fast tokenizer loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
def tokenize_batch(batch):
    """Encode the ``"text"`` entries of ``batch`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` with
    ``max_length=128``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = batch["text"]
    return tokenizer(texts, **encoding_options)

# Tokenize train and test independently (batched), dropping the raw "text"
# column from the result of each map.
tokenized_train, tokenized_test = (
    split.map(tokenize_batch, batched=True, remove_columns=["text"])
    for split in (hf_train, hf_test)
)

# Switch both datasets to torch output, restricted to the listed columns.
tokenized_train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)
tokenized_test.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [12]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
)
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Smoke test: train for one epoch on 100 shuffled training rows and evaluate
# on 20 shuffled test rows, to check that the pipeline runs end to end.

# --- Prerequisites ----------------------------------------------------------
# The label mapping, the model and the metric function are defined here so the
# cell works on a fresh kernel (Restart & Run All) instead of relying on names
# left over from an earlier session.
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {name: idx for idx, name in id2label.items()}

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    NOTE(review): reconstructed from the metric keys and values in this
    notebook's recorded output (weighted averaging matches them). If the
    project has a shared definition, use that one instead.

    Args:
        eval_pred: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = eval_pred.label_ids
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    # zero_division=0 scores an undefined precision/recall as 0 without warnings.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# --- Tiny training run ------------------------------------------------------
training_args = TrainingArguments(
    output_dir                = "debug-agnews",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 16,
    num_train_epochs         = 1,
    logging_steps            = 10,
)

# Fixed shuffle seed so the same rows are selected on every run.
small_train = tokenized_train.shuffle(seed=0).select(range(100))
small_eval  = tokenized_test.shuffle(seed=0).select(range(20))

trainer_test = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = small_train,
    eval_dataset    = small_eval,
    compute_metrics = compute_metrics,
)

trainer_test.train()
metrics = trainer_test.evaluate()
print(metrics)

# --- Per-class report -------------------------------------------------------
preds_output = trainer_test.predict(small_eval)
preds = np.argmax(preds_output.predictions, axis=-1)
print(classification_report(
    # Ground truth as returned by predict(), already a NumPy array.
    preds_output.label_ids,
    preds,
    # Pin the class ids explicitly: with only 20 rows a class may be missing,
    # and then `target_names` alone would not match the observed classes.
    labels=list(id2label.keys()),
    target_names=list(id2label.values()),
    zero_division=0,
))

Step,Training Loss
10,1.3268


{'eval_loss': 1.1955326795578003, 'eval_accuracy': 0.8, 'eval_precision': 0.9142857142857143, 'eval_recall': 0.8, 'eval_f1': 0.8175000000000001, 'eval_runtime': 0.1105, 'eval_samples_per_second': 181.023, 'eval_steps_per_second': 18.102, 'epoch': 1.0}
              precision    recall  f1-score   support

       World       1.00      0.60      0.75         5
      Sports       1.00      1.00      1.00         6
    Business       0.43      1.00      0.60         3
    Sci/Tech       1.00      0.67      0.80         6

    accuracy                           0.80        20
   macro avg       0.86      0.82      0.79        20
weighted avg       0.91      0.80      0.82        20



In [13]:
from transformers import DistilBertConfig, DistilBertForSequenceClassification

# Lookup tables between class index and class name (and the reverse).
id2label = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))
label2id = {name: idx for idx, name in id2label.items()}

# Checkpoint configuration extended with the four labels and both mappings.
config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
)

# NOTE: this rebinds `model` to a newly loaded classifier. A Trainer that was
# built earlier keeps the model object it was given, not this one.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", config=config
)

# Run the existing trainer over the evaluation subset and keep its raw output.
preds_output = trainer_test.predict(small_eval)

# Highest-scoring class index per example (argmax over the last axis).
logits = preds_output.predictions
pred_ids = np.argmax(logits, axis=-1)

# Translate class indices into their names via id2label and show them.
pred_labels = [id2label[class_id] for class_id in pred_ids]
print(pred_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['Business', 'World', 'Business', 'Sports', 'Sci/Tech', 'Sports', 'Business', 'Sci/Tech', 'Sports', 'Business', 'Sports', 'Sci/Tech', 'Business', 'Business', 'Sports', 'World', 'Sci/Tech', 'World', 'Sports', 'Business']
