## Imports, downloading the data and cleaning
We use the `Fraser/news-category-dataset` dataset from the Hugging Face Hub.


In [None]:
# Requirements for Colab: uncomment the line below to install the packages
# this notebook imports (transformers, hackernews-client, datasets).
#!pip install transformers hackernews-client datasets -q

[K     |████████████████████████████████| 3.5 MB 12.7 MB/s 
[K     |████████████████████████████████| 312 kB 24.1 MB/s 
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
[K     |████████████████████████████████| 596 kB 47.3 MB/s 
[K     |████████████████████████████████| 895 kB 45.9 MB/s 
[K     |████████████████████████████████| 6.8 MB 39.8 MB/s 
[K     |████████████████████████████████| 243 kB 38.7 MB/s 
[K     |████████████████████████████████| 133 kB 35.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 35.2 MB/s 
[K     |████████████████████████████████| 144 kB 37.6 MB/s 
[K     |████████████████████████████████| 94 kB 2.5 MB/s 
[K     |████████████████████████████████| 271 kB 38.9 MB/s 
[?25h  Building wheel for hackernews-client (setup.py) ... [?25l[?25hdone


# Topic classification
## Downloading and cleaning data

In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import StratifiedShuffleSplit
# Fixed random seed, kept in one place so the notebook's stochastic steps can share it.
SEED = 42

In [None]:
# Load the news-category dataset; the cells below read its "train" and "test"
# splits and the "headline", "short_description" and "category" columns.
dataset = load_dataset("Fraser/news-category-dataset")

Using custom data configuration default
Reusing dataset news_category (/root/.cache/huggingface/datasets/Fraser___news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Coarse topic -> the original dataset categories folded into it.
# A topic's position in this dict is the integer class id used as its label.
reduced_categories = {
    "CULTURE & ARTS": [
        "ARTS", "ARTS & CULTURE", "CULTURE & ARTS",
        "COMEDY", "ENTERTAINMENT", "MEDIA",
    ],
    "EDUCATION": ["EDUCATION", "COLLEGE"],
    "BUSINESS": ["BUSINESS", "MONEY"],
    "HEALTH & LIVING": [
        "WELLNESS", "HEALTHY LIVING", "TRAVEL", "IMPACT", "FIFTY",
        "STYLE & BEAUTY", "HOME & LIVING", "GREEN",
        "PARENTS", "STYLE", "FOOD & DRINK", "TASTE", "PARENTING", "DIVORCE", "WEDDINGS",
    ],
    "SPORTS": ["SPORTS"],
    "NEWS & POLITICS": [
        "POLITICS", "BLACK VOICES", "LATINO VOICES", "QUEER VOICES", "WOMEN", "RELIGION",
        "GOOD NEWS", "THE WORLDPOST", "WORLDPOST", "WORLD NEWS", "WEIRD NEWS", "CRIME",
    ],
    "TECH & SCIENCE": ["SCIENCE", "ENVIRONMENT", "TECH"],
}

# Flatten to: original category name -> class id of the topic that contains it.
label_map = {
    category: class_id
    for class_id, members in enumerate(reduced_categories.values())
    for category in members
}

In [None]:
# One output class per coarse topic.
num_labels = len(reduced_categories)

In [None]:
# Pool the train and test splits and emit three text variants per article:
#   1. headline + "\n" + short_description
#   2. short_description only
#   3. headline only
# Every variant keeps the article's label, so the label column is the pooled
# label series repeated three times in the same order as the text blocks.
splits = ("train", "test")

# pd.concat replaces the chained Series.append calls (Series.append was
# removed in pandas 2.0); ignore_index gives one clean RangeIndex so the
# element-wise string concatenation below lines up row by row.
headlines = pd.concat(
    [pd.Series(dataset[split]["headline"]) for split in splits],
    ignore_index=True,
)
descriptions = pd.concat(
    [pd.Series(dataset[split]["short_description"]) for split in splits],
    ignore_index=True,
)
labels = pd.concat(
    [pd.Series(dataset[split]["category"]) for split in splits],
    ignore_index=True,
).replace(label_map)

combined_dataset = pd.DataFrame(
    {
        "text": pd.concat(
            [headlines + "\n" + descriptions, descriptions, headlines],
            ignore_index=True,
        ),
        "label": pd.concat([labels] * 3, ignore_index=True),
    }
)

In [None]:
# Build `df` from the combined frame (it was previously read before ever being
# assigned) and keep only the first occurrence of each distinct text.
df_unique = combined_dataset.drop_duplicates(subset=['text'])
# We want to balance the dataset: downsample every label to the size of the
# rarest one. The seed makes the sample identical across runs.
df = df_unique.groupby("label").sample(
    n=df_unique.label.value_counts().min(),
    random_state=SEED,
)

In [None]:

# One stratified shuffle with 20% of the rows held out; the splitter yields
# positional indices, which the next cell feeds to df.iloc.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
fold_indices = split.split(df["text"], df["label"])
train_index, test_index = next(fold_indices)



In [None]:

# Materialise each partition once, then wrap its two columns in a Dataset.
train_rows = df.iloc[train_index]
test_rows = df.iloc[test_index]

train_dataset = Dataset.from_dict({"text": train_rows["text"], "label": train_rows["label"]})
test_dataset = Dataset.from_dict({"text": test_rows["text"], "label": test_rows["label"]})

In [None]:
# Persist both partitions under ./tmp so the training section can reload them.
train_dataset.save_to_disk("./tmp/train_dataset")
test_dataset.save_to_disk("./tmp/test_dataset")

## Training the model

In [None]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from datasets import load_metric
import numpy as np
# Accuracy metric object; compute_metrics calls metric.compute on it.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")

In [None]:
# Reload the directories written by save_to_disk through the matching loader
# rather than opening a hard-coded Arrow file inside them: the names of the
# files save_to_disk writes depend on the installed `datasets` version.
train_dataset = Dataset.load_from_disk("./tmp/train_dataset")
test_dataset = Dataset.load_from_disk("./tmp/test_dataset")


In [None]:
# Load the tokenizer and a sequence-classification model from the same
# checkpoint. num_labels sizes the classification head, which is newly
# initialised and therefore has to be trained below.
model_checkpoint = "microsoft/xtremedistil-l6-h256-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples, tokenizer, max_length=201):
    """Tokenize the ``"text"`` field of a batch to a fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to tokenize.
        tokenizer: Callable tokenizer accepting ``padding``, ``max_length`` and
            ``truncation`` keyword arguments.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 201, the value this notebook trains with.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
def _tokenize_batch(batch):
    """Tokenize one batch with the notebook's tokenizer."""
    return preprocess_function(batch, tokenizer)


# Tokenize both partitions batch-wise.
encoded_train_dataset = train_dataset.map(_tokenize_batch, batched=True)
encoded_test_dataset = test_dataset.map(_tokenize_batch, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Sanity check on the first test example: print its label id, its text, and the
# source categories behind that label. The category list is looked up from the
# example's own label instead of a hard-coded index, so it stays correct when
# the shuffle puts a different example first.
sample = encoded_test_dataset[0]
print("Cat:",sample["label"],"\n",sample["text"],"\n",list(reduced_categories.values())[sample["label"]])

Cat: 6 
 The Supreme Court Let A Man Die. He Was Executed With The Wrong Drug.
The court placed far too much faith in Oklahoma's disastrous lethal injection protocol in January and in June. 
 ['POLITICS', 'BLACK VOICES', 'LATINO VOICES', 'QUEER VOICES', 'WOMEN']


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then restore
# the checkpoint with the best value of `metric_name` when training ends.
metric_name = "accuracy"
batch_size = 16
args = TrainingArguments(
    output_dir="finetuned",
    num_train_epochs=3,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [None]:

# NOTE(review): the test partition is also the evaluation set used to pick the
# best checkpoint, so the accuracy reported on it is likely optimistic.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


Epoch,Training Loss,Validation Loss,Accuracy
1,1.9904,1.652683,0.512333
2,1.5713,1.481552,0.577333
3,1.418,1.440536,0.588667


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 16
Saving model checkpoint to finetuned/checkpoint-625
Configuration saved in finetuned/checkpoint-625/config.json
Model weights saved in finetuned/checkpoint-625/pytorch_model.bin
tokenizer config file saved in finetuned/checkpoint-625/tokenizer_config.json
Special tokens file saved in finetuned/checkpoint-625/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 16
Saving model checkpoint to finetuned/checkpoint-1250
Configuration saved in finetuned/checkpoint-1250/config.json
Model weights saved in finetuned/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in finetu

TrainOutput(global_step=1875, training_loss=1.60347255859375, metrics={'train_runtime': 330.8285, 'train_samples_per_second': 90.681, 'train_steps_per_second': 5.668, 'total_flos': 173970155160000.0, 'train_loss': 1.60347255859375, 'epoch': 3.0})

In [None]:
# Write the trainer's current model to ./best_model. The training arguments set
# load_best_model_at_end=True, so this is expected to be the best-accuracy
# checkpoint rather than the last one.
trainer.save_model("best_model")

Saving model checkpoint to best_model
Configuration saved in best_model/config.json
Model weights saved in best_model/pytorch_model.bin
tokenizer config file saved in best_model/tokenizer_config.json
Special tokens file saved in best_model/special_tokens_map.json


## Inference


In [None]:
# Reload the fine-tuned weights from disk so inference runs on the saved
# artefact rather than on the in-memory training model.
saved_model_dir = "./best_model/"
model_test = AutoModelForSequenceClassification.from_pretrained(saved_model_dir, num_labels=num_labels)


#source: https://github.com/huggingface/transformers/blob/master/src/transformers/pipelines/text_classification.py
def softmax(_outputs):
    maxes = np.max(_outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(_outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

def test_interference(trained_model, text, tokenizer):
    text_pt = tokenizer([text],
                    padding="max_length",max_length=201,
                    truncation=True,return_tensors="pt")
    return softmax(trained_model(**text_pt)[0][0].detach().numpy())

loading configuration file ./best_model/config.json
Model config BertConfig {
  "_name_or_path": "./best_model/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  }

In [None]:
# Smoke test: classify one hand-written headline and show the source
# categories of the most probable class.
beer_probs = test_interference(model_test, "A new beer was introduced today", tokenizer)
list(reduced_categories.values())[np.argmax(beer_probs)]

['WELLNESS',
 'HEALTHY LIVING',
 'STYLE & BEAUTY',
 'HOME & LIVING',
 'PARENTS',
 'STYLE',
 'FOOD & DRINK',
 'TASTE',
 'PARENTING',
 'DIVORCE',
 'WEDDINGS']

## Fetch Hacker News
Fetch the headline and the first comment of each story.

In [None]:
# Client used below to fetch the stories.
from hackernews import hn
news_client = hn.NewsClient()

def get_comm(x, fetch_item=None):
    """Return the text of the first comment on story ``x``, or ``""``.

    Args:
        x: Story object; its ``kids`` attribute is read as the list of
            top-level comment ids.
        fetch_item: Optional callable ``item_id -> dict | None`` used to
            retrieve the comment. Defaults to an HTTP GET against the Hacker
            News item endpoint.

    Returns:
        The comment's ``"text"`` field, or ``""`` when the story has no
        comments, the comment has no text, or the request fails.
    """
    # Function-scope imports keep this definition self-contained. The previous
    # version called `requests`, which was never imported; its bare `except`
    # hid the resulting NameError, so every comment came back empty.
    import json
    import urllib.request

    def _download_item(item_id):
        url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json?print=pretty"
        with urllib.request.urlopen(url, timeout=10) as response:
            return json.load(response)

    kids = getattr(x, "kids", None)
    if not kids:
        return ""

    if fetch_item is None:
        fetch_item = _download_item

    try:
        item = fetch_item(kids[0])
    except (OSError, ValueError):
        # Best effort: network failures (OSError covers urllib errors and
        # timeouts) and malformed JSON (ValueError) yield an empty comment.
        return ""

    if not item:
        return ""
    return item.get("text") or ""

# Ask the client for best stories (fetchMax=200).
h = news_client.get_best_story(fetchMax=200)

# One string per story: its title, a space, then whatever get_comm returns for it.
headline_and_comment  =  [x.title + " " + get_comm(x) for x in h]


## Test Hacker News Model



In [None]:
# Show one fetched example (index 26) before classifying it.
headline_and_comment[26]

"The Banality of Genius: Notes on Peter Jackson's Get Back "

In [None]:
# Classify the example shown above. `lookup_label` was never defined and the
# tokenizer argument was missing; use the same category lookup and call
# signature as the earlier inference cell.
example_probs = test_interference(model_test, headline_and_comment[26], tokenizer)
list(reduced_categories.values())[np.argmax(example_probs)]

['COMEDY', 'ENTERTAINMENT', 'MEDIA']