# 1. Loading 'ag_news' dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset('ag_news')
train_set = dataset['train'].shuffle(seed=42).select([i for i in range(2000)])
validation_set = dataset['test'].shuffle(seed=42).select([i for i in range(400)])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [14]:
train_set

Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})

In [15]:
train_set[5]

{'text': 'A Cosmic Storm: When Galaxy Clusters Collide Astronomers have found what they are calling the perfect cosmic storm, a galaxy cluster pile-up so powerful its energy output is second only to the Big Bang.',
 'label': 3}

In [16]:
set(train_set['label'])

{0, 1, 2, 3}

# 2. Tokenizing the dataset

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)

tokenized_train_set = train_set.map(preprocess_function, batched=True)
tokenized_validation_set = validation_set.map(preprocess_function, batched=True)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [18]:
tokenized_validation_set

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 400
})

# 3. Training the model

In [19]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=50,
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_validation_set,
)

trainer.train()

`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


Step,Training Loss
50,0.882973
100,0.370276
150,0.420053
200,0.38193
250,0.359886


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=250, training_loss=0.4830237503051758, metrics={'train_runtime': 2882.6193, 'train_samples_per_second': 0.694, 'train_steps_per_second': 0.087, 'total_flos': 132472123392000.0, 'train_loss': 0.4830237503051758, 'epoch': 1.0})

# 4. Evaluation

In [21]:
trainer.evaluate()



{'eval_loss': 0.44407859444618225,
 'eval_runtime': 179.4179,
 'eval_samples_per_second': 2.229,
 'eval_steps_per_second': 0.279,
 'epoch': 1.0}

In [22]:
import torch
import torch.nn.functional as F

In [23]:
train_set.features

{'text': Value('string'),
 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])}

In [28]:
def predict_texts(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        predictions = torch.argmax(probs, dim=1)
        label_map = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
    for text, pred, prob in zip(texts, predictions, probs):
        print(f'{text}  {label_map[pred.item()]}  {prob[pred.item()].item():.4f}')

In [29]:
import random
texts = random.sample(validation_set['text'], 5)

predict_texts(texts)

Woods on Top at Rain-Soaked Dunlop Phoenix  MIYAZAKI, Japan (Reuters) - Tiger Woods fired a superb  five-under-par 65 in torrential rain to take a three-stoke lead  after the first round of the Dunlop Phoenix tournament  Thursday.  Sports  0.9790
Schumacher in uncharted territory MICHAEL Schumacher doesn #39;t need to win the Belgian Grand Prix on Sunday to nail his unprecedented seventh Formula One drivers title.  Sports  0.9766
Rivals Try to Turn Tables on Charles Schwab By MICHAEL LIEDTKE     SAN FRANCISCO (AP) -- With its low prices and iconoclastic attitude, discount stock broker Charles Schwab Corp. (SCH) represented an annoying stone in Wall Street's wing-tipped shoes for decades...  Business  0.9632
Iraq #39;s Government in Talks with Sunni, Shi #39;ite Leaders Iraq #39;s interim government is engaged in cease-fire talks with Sunni and Shi #39;ite leaders in an effort to restore calm to violent parts of Iraq before January #39;s scheduled election.  World  0.9828
America #39;s 

# 5. Gradio Interface

In [32]:
def predict_news_category(text):
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
    label_map = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}
    return label_map[prediction]

In [33]:
import gradio as gr

interface = gr.Interface(
    fn=predict_news_category,
    inputs=gr.Textbox(lines=5, placeholder="Enter news text here..."),
    outputs="text",
    title="News Category Prediction",
    description="Enter the news text here to predict its category (World, Sports, Business, Sci/Tech)."
)

interface.launch(share=True)    # public url

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dc7c8b6af43ae27161.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# 6. Save the model

In [34]:
torch.save(model.state_dict(), 'news_model.pth')