# Topic classification
## Downloading and cleaning data

In [1]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset
SEED = 42

In [2]:
# Load the "Fraser/news-category-dataset" dataset; its "train" and "test"
# splits are used below.
dataset = load_dataset("Fraser/news-category-dataset")
# Number of distinct values in the train split's "category_num" column.
# NOTE(review): this counts the original categories, not the reduced topics
# defined in the next cell, and num_labels is re-assigned to a literal in the
# training section.
num_labels = len(set(dataset["train"]["category_num"]))


Using custom data configuration default
Reusing dataset news_category (/Users/franz/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c)


In [3]:
# Coarse topic -> list of original dataset categories folded into it.
# Key order matters: the position of each key becomes its integer label id.
reduced_categories = {
    "CULTURE & ARTS": ["ARTS", "ARTS & CULTURE", "CULTURE & ARTS"],
    "BUSINESS": ["BUSINESS", "MONEY"],
    "EDUCATION": ["EDUCATION", "COLLEGE"],
    "COMEDY & ENTERTAINMENT": ["COMEDY", "ENTERTAINMENT", "MEDIA"],
    "HEALTH & LIVING": [
        "WELLNESS",
        "HEALTHY LIVING",
        "STYLE & BEAUTY",
        "HOME & LIVING",
        "PARENTS",
        "STYLE",
        "FOOD & DRINK",
        "TASTE",
        "PARENTING",
        "DIVORCE",
        "WEDDINGS",
    ],
    "RELIGION": ["RELIGION"],
    "POLITICS": [
        "POLITICS",
        "BLACK VOICES",
        "LATINO VOICES",
        "QUEER VOICES",
        "WOMEN",
    ],
    "SPORTS": ["SPORTS"],
    "TRAVEL": ["TRAVEL"],
    "NEWS": [
        "GOOD NEWS",
        "THE WORLDPOST",
        "WORLDPOST",
        "WORLD NEWS",
        "WEIRD NEWS",
        "CRIME",
    ],
    "ENVIRONMENT": ["GREEN", "ENVIRONMENT"],
    "SCIENCE": ["SCIENCE"],
    "TECH": ["TECH"],
    "OTHER": ["IMPACT", "FIFTY"],
}



In [4]:
# Original category name -> integer id of the reduced topic it belongs to.
# The id is the position of the topic among the keys of reduced_categories.
label_map = {
    category: topic_id
    for topic_id, members in enumerate(reduced_categories.values())
    for category in members
}

In [5]:
# Training split: headline text paired with the reduced topic id.
# Each category is looked up directly in label_map, so a category missing from
# the map raises KeyError here instead of silently surviving as a string label
# (which is what Series.replace did with unmatched values).
train_dataset = Dataset.from_dict({
    "text": dataset["train"]["headline"],
    "label": [label_map[category] for category in dataset["train"]["category"]],
})

In [6]:

# Test split: headline text paired with the reduced topic id.
# Each category is looked up directly in label_map, so a category missing from
# the map raises KeyError here instead of silently surviving as a string label
# (which is what Series.replace did with unmatched values).
test_dataset = Dataset.from_dict({
    "text": dataset["test"]["headline"],
    "label": [label_map[category] for category in dataset["test"]["category"]],
})

In [7]:
# Persist both splits under ./tmp; the training section reloads them from
# these directories.
train_dataset.save_to_disk("./tmp/train_dataset")
test_dataset.save_to_disk("./tmp/test_dataset")

## Training the model

In [8]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import load_metric
import numpy as np
# Accuracy metric object; it is handed to compute_metrics when the Trainer is built.
metric = load_metric("accuracy")

In [9]:
# Reload the splits written by save_to_disk. load_from_disk takes the saved
# directory, so this does not depend on the name of the Arrow file(s) that
# save_to_disk happens to create inside it.
train_dataset = Dataset.load_from_disk("./tmp/train_dataset")
test_dataset = Dataset.load_from_disk("./tmp/test_dataset")
# Size of the classification head.
# NOTE(review): the label mapping built in the cleaning section only produces
# ids for its 14 reduced topics; 41 is kept because the inference section
# hard-codes the same value when reloading the checkpoint - change both together.
num_labels = 41

In [None]:
# Name of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "microsoft/xtremedistil-l6-h256-uncased"

# Tokenizer and sequence-classification model are both loaded from the same
# checkpoint; the model is built with num_labels output classes.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
def preprocess_function(examples, tokenizer):
    """Tokenize the ``"text"`` field of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to encode.
        tokenizer: Callable tokenizer; invoked with the text(s) plus fixed
            padding/truncation options.

    Returns:
        Whatever ``tokenizer`` returns for the given text(s).
    """
    # Every sequence is padded and truncated to exactly 201 tokens.
    tokenizer_options = dict(padding="max_length", max_length=201, truncation=True)
    return tokenizer(examples["text"], **tokenizer_options)

def compute_metrics(eval_pred, metric):
    """Score predictions against reference labels with ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.
        metric: Object exposing ``compute(predictions=..., references=...)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred
    # Highest-scoring class along axis 1 is the predicted label.
    return metric.compute(predictions=np.argmax(scores, axis=1), references=references)

In [None]:
# Tokenize both splits in batches. Extra arguments for the mapped function must
# go through fn_kwargs: the second positional parameter of Dataset.map is
# with_indices, so passing the tokenizer positionally would not reach
# preprocess_function as its tokenizer.
encoded_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
)
encoded_test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
)

In [None]:
# Metric used to pick the best checkpoint, and the per-device batch size.
metric_name = "accuracy"
batch_size = 16

# Evaluate and checkpoint once per epoch; the checkpoint with the best
# `metric_name` is reloaded when training finishes.
args = TrainingArguments(
    "finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [None]:
validation_key = "validation"
# Trainer over the encoded splits; the test split doubles as the evaluation set.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    # Trainer expects a callable taking one evaluation-prediction argument.
    # Bind the metric in a lambda rather than calling compute_metrics(metric)
    # eagerly, which fails because the function requires two arguments.
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, metric),
)

In [None]:
# Run fine-tuning with the arguments and datasets configured on the Trainer.
trainer.train()

In [None]:
# Trainer exposes save_model(), not save(). Write the model to the directory
# that the inference section loads from.
trainer.save_model("./finetuned/best-model/")

## Inference with the model

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import numpy as np  # module name was misspelled as "nump", which fails on import
# Number of output classes passed to from_pretrained when the model is reloaded below.
num_labels = 41

In [None]:
# Directory holding the fine-tuned model and its tokenizer files.
best_model = "./finetuned/best-model/"

# Tokenizer and classifier are both restored from that directory.
tokenizer = AutoTokenizer.from_pretrained(best_model, use_fast=True)
model_test = AutoModelForSequenceClassification.from_pretrained(best_model, num_labels=num_labels)


In [None]:
#source: https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines/text_classification.py
def softmax(_outputs):
    """Return the softmax of ``_outputs`` along its last axis.

    The per-row maximum is subtracted before exponentiating so that large
    scores do not overflow; the result along the last axis sums to 1.
    """
    exponentials = np.exp(_outputs - np.max(_outputs, axis=-1, keepdims=True))
    return exponentials / exponentials.sum(axis=-1, keepdims=True)

def test_interference(trained_model, text, tokenizer):
    """Return softmax class probabilities for a single ``text``.

    Args:
        trained_model: Callable model; invoked with the tokenizer output
            unpacked as keyword arguments.
        text: The string to classify.
        tokenizer: Callable tokenizer; asked for PyTorch tensors padded and
            truncated to 201 tokens.

    Returns:
        The softmax of ``trained_model(...)[0][0]`` as a NumPy array.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        max_length=201,
        truncation=True,
        return_tensors="pt",
    )
    model_output = trained_model(**encoded)
    # First output element, first row - presumably the logits for the single
    # input text; verify against the model's output type.
    first_row = model_output[0][0]
    return softmax(first_row.detach().numpy())

In [None]:

# Index of the highest-probability class for a sample sentence.
np.argmax(test_interference(model_test, "A bird was flying today", tokenizer))

