In [10]:
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
!wget -q https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv

In [11]:
!pip install --quiet huggingface_hub pandas datasets

In [12]:
import pandas as pd

def _read_ag_news_csv(csv_path):
    """Read one header-less AG News CSV and append a merged ``text`` column.

    The file is parsed with the column names ``label``, ``title`` and
    ``description``. ``text`` is the whitespace-stripped title, followed by
    ". ", followed by the whitespace-stripped description.
    """
    frame = pd.read_csv(csv_path, header=None, names=["label", "title", "description"])
    headline = frame["title"].str.strip()
    body = frame["description"].str.strip()
    frame["text"] = headline + ". " + body
    return frame


# Both splits go through the same loader so they end up with identical columns.
train_df = _read_ag_news_csv("train.csv")
test_df = _read_ag_news_csv("test.csv")

In [13]:
from transformers import DistilBertTokenizerFast

# Fetch the pretrained fast tokenizer published as "distilbert-base-uncased".
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Sanity check on the first eight merged texts: pad/truncate every sequence to
# exactly 128 tokens and request PyTorch tensors back.
batch = train_df["text"].head(8).tolist()
encoding = tokenizer(
    batch,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding="max_length",
)

print("input_ids:", encoding["input_ids"].shape)

input_ids: torch.Size([8, 128])


In [14]:
import torch

# Save the sample as a plain dict of tensors instead of the tokenizer's
# BatchEncoding wrapper. Pickling the wrapper ties the file to the transformers
# package and makes it unreadable by torch.load() when only weights-style
# content is allowed (its restricted `weights_only` mode); a dict of tensors
# loads everywhere and is still indexed the same way, e.g. obj["input_ids"].
torch.save(dict(encoding), "tokenized_sample.pt")


In [15]:
# Load the AG News train/test splits through the `datasets` library
from datasets import load_dataset
# load_dataset returns a mapping of split name -> dataset; keep a handle on
# each of the two splits used below.
dataset = load_dataset("ag_news")
train_ds = dataset["train"]
test_ds = dataset["test"]

# Apply the tokenizer over each entire split
def tokenize_batch(batch):
    """Tokenize the ``"text"`` entries of one batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens, so all rows
    of the result have the same length.

    Args:
        batch: Mapping that holds the raw strings under the key ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded

from datasets import DatasetDict

# Tokenize each split batch-wise; the raw "text" column is dropped from the result.
tokenized_train = train_ds.map(
    tokenize_batch,
    batched=True,
    remove_columns=["text"],
)
tokenized_test = test_ds.map(
    tokenize_batch,
    batched=True,
    remove_columns=["text"],
)

# Set the output format to PyTorch tensors for the model inputs and the label.
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Bundle both splits and write the full tokenized dataset to disk.
tok = DatasetDict({"train": tokenized_train, "test": tokenized_test})
tok.save_to_disk("/content/tokenized_ag_news")

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/120000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7600 [00:00<?, ? examples/s]