In [None]:
import numpy as np
import pandas as pd
import re
import os
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, precision_recall_curve, confusion_matrix, precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt

try:
  import transformers
except ImportError:
  !pip install transformers
  import transformers

# for data wrangling
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

!pip install spacy==3.0
import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")
import en_core_web_sm
spacy_nlp = en_core_web_sm.load()

import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 26.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

True

# Stage 0
## **Data Import and Cleaning**

In [None]:
# update to your path as necessary
# df_raw is later merged with df_labels on their shared 'url' column;
# presumably df_raw carries the article text and df_labels the class label - verify against your CSVs
df_raw = pd.read_csv('/content/raw_news.csv')
df_labels = pd.read_csv('/content/labels.csv')

In [None]:
# remove duplicate scrapes: keep only the first row seen for each URL, in both frames
df_raw, df_labels = (
    frame.drop_duplicates(subset=['url'], keep='first') for frame in (df_raw, df_labels)
)
df_raw.shape, df_labels.shape

((436, 2), (436, 2))

In [None]:
# attach each article's label via the shared URL key; rows without a match on both sides are dropped
df_full = pd.merge(df_raw, df_labels, how='inner', left_on='url', right_on='url')
df_full = df_full.astype({'class': int})
print(df_full.shape)
df_full.head()

(436, 3)


Unnamed: 0,text,url,class
0,Scott Morrison: Australian PM rejects 'sledgin...,https://www.bbc.com/news/world-australia-59129048,0
1,Nigeria building collapse: Race to find surviv...,https://www.bbc.com/news/world-africa-59125228,0
2,Handball federation changes uniform rules afte...,https://www.bbc.com/news/world-59119321,0
3,Nicaragua accused of running internet troll fa...,https://www.bbc.com/news/world-latin-america-5...,0
4,Military officers condemn CIA torture as a sta...,https://www.bbc.com/news/world-us-canada-59124419,0


In [None]:
# label distribution of the merged dataset, before any regrouping of classes
df_full['class'].value_counts()

0    364
1     47
2     25
Name: class, dtype: int64

We will build a two stage model.
1. The first stage will classify whether a news article describes Topic A (label 1) or an unrelated topic (label 0). 
2. The second stage will then classify whether a Topic A article describes subtopic A1 (label 1) or A2 (label 2).

In [None]:
# Topic A1 (label 1) and Topic A2 (label 2) are pooled into a single Topic A class (label 1)
# so that the stage 1 model sees more positive examples of Topic A
df_stage1 = df_full.assign(**{'class': df_full['class'].replace({2: 1})})
df_stage1['class'].value_counts()

0    364
1     72
Name: class, dtype: int64

In [None]:
# function for removing obvious textual cues to news source
# we do not want the model to classify text based on its source but on its content
def clean_text(text):
    """Strip obvious news-source cues from an article and normalise its whitespace.

    Each cue is replaced by a single space (case-insensitively), then every run of
    whitespace is collapsed to one space and leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text.
    """
    # raw strings: '\(' in a normal string literal is an invalid escape sequence;
    # the dots of the domain are escaped so they only match a literal '.'
    patterns = (
        r'\(Reuters\)',
        r'\(AP\)',
        r'bbc\.co\.uk',
        r'bbc',
        r'Our Standards:  The Thomson Reuters Trust Principles',
    )
    for pat in patterns:
        text = re.sub(pat, ' ', text, flags=re.IGNORECASE)
    # collapse the gaps left behind by the removed cues
    return ' '.join(text.split())

In [None]:
# strip source cues from every stage 1 article
df_stage1['text'] = df_stage1['text'].map(clean_text)

## Data split into train, validation and test sets

In [None]:
# split data into train, val and test sets before data augmentation
# both splits are stratified on the label; a fixed random_state keeps the split
# (and therefore the exported train/val/test CSVs) identical across notebook runs
X_full_train, X_test, y_full_train, y_test = train_test_split(
    df_stage1[['text', 'url']], df_stage1['class'],
    stratify=df_stage1['class'], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train,
    stratify=y_full_train, test_size=0.2, random_state=42)

In [None]:
# shape of each feature split, then of each label split
print(*(part.shape for part in (X_train, X_val, X_test)))
print(*(part.shape for part in (y_train, y_val, y_test)))

(278, 2) (70, 2) (88, 2)
(278,) (70,) (88,)


In [None]:
# class balance of the train, validation and test labels (one table per split)
print(*(labels.value_counts() for labels in (y_train, y_val, y_test)), sep='\n')

0    232
1     46
Name: class, dtype: int64
0    59
1    11
Name: class, dtype: int64
0    73
1    15
Name: class, dtype: int64


## Data Augmentation

Augmentation for NLP tasks differs from augmentation in other domains. Augmented text has to remain plausible and realistic so that it does not introduce additional noise into the model.

SMOTE works in the feature space and by simply generating synthetic forms of tokenized documents, these may not be realistic representations of the true text document distribution.

We attempt 3 different text augmentation methods here:
1. Splitting a document into constituent sentences and shuffling them to make new documents. (Applied twice per minority text)
2. Replacing words with synonyms in a document (Applied once per minority text)
3. Roundtrip translation - translating the English news article into German and then back into English. This method preserves the meaning and semantics while not using the exact same words and phrasing. (Applied once per minority text)

Finally, we note that we should only apply augmentation to the training set. The validation/test sets should resemble the real world distributions as closely as possible, in order to properly assess the model's ability to generalize to unseen news articles.

In [None]:
# augmentation 1 - sentence shuffling
def shuffle_doc(text):
  """Return `text` with its sentences in a random order.

  Sentence boundaries come from the spaCy pipeline `spacy_nlp`; the sentences are
  shuffled with `np.random.shuffle` and re-joined with single spaces.
  """
  reordered = [span.text for span in spacy_nlp(text).sents]
  # in-place shuffle of the freshly built list (the input text is untouched)
  np.random.shuffle(reordered)
  return ' '.join(reordered)

def create_shuffled_text(df, classes=[1]):
  """Build two sentence-shuffled variants of every row whose 'class' is in `classes`.

  Parameters: `df` needs 'text', 'url' and 'class' columns; `classes` lists the labels to augment.
  Returns a new frame with 'class', 'url' and 'text' columns holding only the
  shuffled texts (rows of other classes are dropped); `df` itself is not modified.
  """
  frame = df.copy()
  selected = frame['class'].isin(classes)
  variant_cols = ['shuffled_text1', 'shuffled_text2']
  # one independent shuffle pass over the selected rows per variant column
  for col in variant_cols:
    frame.loc[selected, col] = frame.loc[selected, 'text'].apply(shuffle_doc)
  # wide -> long: one row per (article, variant); unselected rows are NaN and get dropped
  melted = frame.drop(['text'], axis=1).melt(
      id_vars=['class', 'url'], value_vars=variant_cols, value_name='text')
  melted = melted.drop('variable', axis=1).dropna(subset=['text'])
  return melted.reset_index(drop=True)

In [None]:
# augmentation 2 - synonym replacement
def get_random_synonym(word, pos):
  """Return one random WordNet synonym of `word` for the given part of speech.

  Parameters
  ----------
  word : str
      Surface form to look up.
  pos : str
      Coarse part-of-speech tag; 'NOUN', 'VERB', 'ADV' and 'ADJ' are supported.

  Returns
  -------
  str
      A lower-cased synonym containing only the letters a-z and spaces, or `word`
      unchanged when `pos` is unsupported or no usable synonym exists.
  """
  wn_pos = {'NOUN': wn.NOUN,
            'VERB': wn.VERB,
            'ADV': wn.ADV,
            'ADJ': wn.ADJ}
  # unsupported tags fall back to "no synonym" instead of raising KeyError
  if pos not in wn_pos:
    return word

  allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')
  synonyms = set()
  for synset in wn.synsets(word, wn_pos[pos]):
    for lemma in synset.lemmas():
      candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
      candidate = "".join(char for char in candidate if char in allowed_chars).strip()
      # lemmas made only of digits/symbols are empty after filtering - skip them
      if candidate:
        synonyms.add(candidate)

  # a word is not considered a synonym of itself; candidates are lower-cased,
  # so the comparison has to be case-insensitive too
  synonyms.discard(word.lower())

  if not synonyms:
    # return word unchanged if no synonyms found
    return word
  # sort before sampling: set iteration order varies between interpreter runs,
  # which would defeat any numpy random seed
  return str(np.random.choice(sorted(synonyms)))

def replace_synonyms(text, n_replace=15, pos=['VERB','ADV','ADJ']):
  """Return `text` with some words swapped for random WordNet synonyms.

  Parameters
  ----------
  text : str
      Document to augment.
  n_replace : int
      Replacement budget. Unique eligible words are processed in random order and
      every occurrence of a chosen word is replaced; processing stops after the
      first word that brings the number of replaced occurrences to `n_replace` or more.
  pos : list of str
      Part-of-speech tags eligible for replacement. Nouns are left out by default
      because WordNet synsets include proper nouns that cannot be filtered out.

  Returns
  -------
  str
      The augmented document. The whitespace that followed each original token is
      kept, so punctuation is not separated from the preceding word.
  """
  doc = spacy_nlp(text)

  # positions of every eligible token, grouped by (surface text, POS);
  # a dict keeps first-seen order, so the candidate list is deterministic before shuffling
  occurrences = {}
  for i, token in enumerate(doc):
    if token.pos_ in pos and not token.is_stop:
      occurrences.setdefault((token.text, token.pos_), []).append(i)
  candidates = list(occurrences)
  np.random.shuffle(candidates)

  # start from the original tokens together with their trailing whitespace
  pieces = [token.text_with_ws for token in doc]
  count_replaced = 0
  for token_text, token_pos in candidates:
    for i in occurrences[(token_text, token_pos)]:
      # a fresh synonym is drawn for every occurrence
      pieces[i] = get_random_synonym(token_text, token_pos) + doc[i].whitespace_
      count_replaced += 1

    # stop once the replacement budget has been reached
    if count_replaced >= n_replace:
      break

  return ''.join(pieces)

def create_synonym_text(df, classes=[1]):
  """Build one synonym-replaced variant of every row whose 'class' is in `classes`.

  Parameters: `df` needs 'text', 'url' and 'class' columns; `classes` lists the labels to augment.
  Returns a new frame with 'class', 'url' and 'text' columns containing only the
  augmented texts; `df` itself is not modified.
  """
  frame = df.copy()
  selected = frame['class'].isin(classes)
  frame.loc[selected, 'synonym_text1'] = frame.loc[selected, 'text'].apply(replace_synonyms)
  # wide -> long; rows outside `classes` hold NaN in the new column and are dropped
  melted = frame.drop(['text'], axis=1).melt(
      id_vars=['class', 'url'], value_vars=['synonym_text1'], value_name='text')
  melted = melted.drop('variable', axis=1).dropna(subset=['text'])
  return melted.reset_index(drop=True)

In [None]:
# augmentation 3 - round-trip translation
# "fwd" = English -> German and "bwd" = German -> English WMT19 FSMT checkpoints;
# both are fetched when this cell runs (the cell output shows a ~1 GB download per model)
tokenizer_fwd = FSMTTokenizer.from_pretrained("facebook/wmt19-en-de")
model_fwd = FSMTForConditionalGeneration.from_pretrained("facebook/wmt19-en-de")
tokenizer_bwd = FSMTTokenizer.from_pretrained("facebook/wmt19-de-en")
model_bwd = FSMTForConditionalGeneration.from_pretrained("facebook/wmt19-de-en")

def translate_roundtrip(sentence):
  """Translate `sentence` from English to German and back to English.

  Uses the module-level FSMT tokenizer/model pairs and returns the decoded
  English text produced by the second (backward) pass.
  """
  # NOTE(review): truncation=True cuts inputs that exceed the tokenizer's maximum length and
  # generate() runs with its default length settings, so long articles may come back
  # shortened - confirm this is acceptable for whole-document inputs
  current = sentence
  # first pass: forward translation to de; second pass: backward translation to en
  for tok, mdl in ((tokenizer_fwd, model_fwd), (tokenizer_bwd, model_bwd)):
    token_ids = tok.encode(current, truncation=True, return_tensors='pt')
    generated = mdl.generate(token_ids)
    current = tok.decode(generated[0], skip_special_tokens=True)
  return current

def create_translate_rt_text(df, classes=[1]):
  """Build one round-trip-translated variant of every row whose 'class' is in `classes`.

  Parameters: `df` needs 'text', 'url' and 'class' columns; `classes` lists the labels to augment.
  Returns a new frame with 'class', 'url' and 'text' columns containing only the
  translated texts; `df` itself is not modified.
  """
  frame = df.copy()
  selected = frame['class'].isin(classes)
  # future improvement: batch processed with the FSMT model instead of using .apply
  frame.loc[selected, 'translate_text1'] = frame.loc[selected, 'text'].apply(translate_roundtrip)
  # wide -> long; rows outside `classes` hold NaN in the new column and are dropped
  melted = frame.drop(['text'], axis=1).melt(
      id_vars=['class', 'url'], value_vars=['translate_text1'], value_name='text')
  melted = melted.drop('variable', axis=1).dropna(subset=['text'])
  return melted.reset_index(drop=True)

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

In [None]:
def augment_df(df, classes=[1]):
  """Return `df` followed by all augmented copies of the rows whose 'class' is in `classes`.

  The augmenters run in the order: sentence shuffling, synonym replacement,
  round-trip translation. The result has a fresh 0..n-1 index.
  """
  augmenters = (create_shuffled_text, create_synonym_text, create_translate_rt_text)
  parts = [df] + [augment(df, classes) for augment in augmenters]
  return pd.concat(parts).reset_index(drop=True)

In [None]:
# put features and label back into one frame so augment_df can filter on the 'class' column
# NOTE: the next code cell rebinds X_train / y_train to the augmented data, so re-running
# this cell afterwards would augment rows that are already augmented
df_s1 = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
# only the minority class (default classes=[1]) is augmented
df_s1_final = augment_df(df_s1)
df_s1_final['class'].value_counts()

0    232
1    230
Name: class, dtype: int64

In [None]:
# post-augmentation dataset shapes
# the training features / labels now refer to the augmented frame
X_train, y_train = df_s1_final[['text', 'url']], df_s1_final['class']

print(*(part.shape for part in (X_train, X_val, X_test)))
print(*(part.shape for part in (y_train, y_val, y_test)))

(462, 2) (70, 2) (88, 2)
(462,) (70,) (88,)


In [None]:
# class balance per split after augmenting the training set
print(*(labels.value_counts() for labels in (y_train, y_val, y_test)), sep='\n')

0    232
1    230
Name: class, dtype: int64
0    59
1    11
Name: class, dtype: int64
0    73
1    15
Name: class, dtype: int64


In [None]:
# export processed datasets for quick reloading for future runs
# files are written to the current working directory; index=False drops the row index,
# so rows are identified only by their order in each file
X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_val.to_csv('y_val.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

For future runs without lengthy data processing, import dataset directly

In [None]:
# read in exported datasets
# NOTE: read_csv returns DataFrames, so from here on the labels are accessed as y_*['class']
X_train = pd.read_csv('X_train.csv')
X_val = pd.read_csv('X_val.csv')
X_test = pd.read_csv('X_test.csv')

y_train = pd.read_csv('y_train.csv')
y_val = pd.read_csv('y_val.csv')
y_test = pd.read_csv('y_test.csv')

In [None]:
# visual inspection of whether labels are correctly matching the text after data wrangling
# (shows the training rows labelled 1)
X_train.loc[y_train['class'].eq(1)]

Unnamed: 0,text,url
6,Analyzing the Varsity Blues verdict October 18...,https://www.reuters.com/legal/legalindustry/an...
8,South Africa to extradite Mozambique’s ex-fina...,https://apnews.com/article/africa-south-africa...
13,Trading of Benfica shares temporarily suspende...,https://apnews.com/article/europe-sports-arres...
14,Italian alleged mobster linked to Van Goghs is...,https://apnews.com/article/europe-middle-east-...
27,Fake exotic dancer companies forfeit £5m in la...,https://www.bbc.com/news/uk-england-manchester...
...,...,...
457,"MANAGUA, Nicaragua - Once Nicaragua presidenti...",https://apnews.com/article/trials-nicaragua-da...
458,Former Honduran presidential candidate Salvado...,https://apnews.com/article/salvador-nasralla-t...
459,Hushpuppi: Nigerian influencer pleads guilty t...,https://www.bbc.com/news/world-africa-58002932
460,LONDON - British bank NatWest faces a fine for...,https://apnews.com/article/business-bradford-e...


# Stage 1

In stage 1, we build a classifier to differentiate between Topic A and other unrelated topics.

## Model Fine Tuning
Training a language model from scratch requires a large amount of data. 
We instead use transfer learning to fine tune a pre-trained large language model on our news dataset. We use the RoBERTa language model, which has been pre-trained on a larger corpus than BERT and more robustly optimized. (https://arxiv.org/abs/1907.11692)

In [None]:
# import pretrained RoBERTa tokenizer (same 'roberta-base' checkpoint as the model fine-tuned below)
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
# tokenize all text inputs
# each split is padded to its own longest sequence and truncated to the tokenizer's maximum length
train_encoding, val_encoding, test_encoding = (
    tokenizer(split['text'].tolist(), padding=True, truncation=True)
    for split in (X_train, X_val, X_test)
)

In [None]:
# convert encodings, labels into pytorch Datasets
class newsmlDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer class labels.

    Parameters
    ----------
    encodings : mapping
        Field name (e.g. 'input_ids') -> per-example sequences, as returned by the tokenizer.
    labels : sequence
        One label per example, in the same order as the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors: every encoding field plus 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)


# the class was originally defined under this misspelled name while the code below
# instantiates `newsmlDataset`; the alias keeps the old name usable
newslDataset = newsmlDataset

# wrap each split's encodings together with its labels (plain Python ints) in a torch Dataset
train_dataset, val_dataset, test_dataset = (
    newsmlDataset(encoding, labels['class'].tolist())
    for encoding, labels in (
        (train_encoding, y_train),
        (val_encoding, y_val),
        (test_encoding, y_test),
    )
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# used if random initialization across multiple runs is desired
def get_random_seed():
  """Return a fresh seed: an int built from 4 bytes of OS randomness (0 <= seed < 2**32)."""
  # the hex digits read left to right are the big-endian value of the bytes
  return int(os.urandom(4).hex(), 16)

def compute_metrics(pred):
  """Compute accuracy, F1, precision and recall for the Trainer's evaluation loop.

  `pred` unpacks into (logits, labels); the predicted class is the arg-max over the
  logits' second axis. Precision/recall/F1 use average='binary'.
  """
  logits, labels = pred
  predicted = np.argmax(logits, axis=1)
  # the fourth element (support) is not reported
  prf = precision_recall_fscore_support(labels, predicted, average='binary')
  return {
      'accuracy': accuracy_score(labels, predicted),
      'f1': prf[2],
      'precision': prf[0],
      'recall': prf[1],
  }

# keyword arguments forwarded to TrainingArguments
args_dict = dict(
    # evaluate and checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    lr_scheduler_type='linear',
    num_train_epochs=5,
    logging_first_step=True,
    save_total_limit=1,
    fp16=True,
    dataloader_num_workers=1,
    # after training, restore the checkpoint selected by eval_loss
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # seed=get_random_seed(),
)

# binary classification head on top of the pretrained roberta-base encoder
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2, return_dict=True)
# checkpoints go to a stage-specific folder: with output_dir='/content/' both stages wrote
# identically named checkpoint-<step> folders (e.g. checkpoint-58) into the same place, where
# the save_total_limit rotation of one run could overwrite or delete the other run's checkpoints
training_args = TrainingArguments(output_dir='/content/stage1_checkpoints', **args_dict)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from 

In [None]:
# fine-tune on the augmented stage 1 training set; the validation set is evaluated after every epoch
trainer.train()

***** Running training *****
  Num examples = 462
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 290


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6768,0.012849,1.0,1.0,1.0,1.0
2,0.6768,0.214331,0.971429,0.916667,0.846154,1.0
3,0.6768,0.158097,0.971429,0.916667,0.846154,1.0
4,0.6768,0.011618,0.985714,0.952381,1.0,0.909091
5,0.6768,0.004851,1.0,1.0,1.0,1.0


  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 70
  Batch size = 8
Saving model checkpoint to /content/checkpoint-58
Configuration saved in /content/checkpoint-58/config.json
Model weights saved in /content/checkpoint-58/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-58/tokenizer_config.json
Special tokens file saved in /content/checkpoint-58/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 70
  Batch size = 8
Saving model checkpoint to /content/checkpoint-116
Configuration saved in /content/checkpoint-116/config.json
Model weights saved in /content/checkpoint-116/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-116/tokenizer_config.json
Special tokens file saved in /content/checkpoint-116/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 70
  Batch size = 8
Saving model checkpoint to /content/checkpoint-174
Configuration saved in /content/checkpoint-174/c

TrainOutput(global_step=290, training_loss=0.10294030945876549, metrics={'train_runtime': 870.9057, 'train_samples_per_second': 2.652, 'train_steps_per_second': 0.333, 'total_flos': 607786537881600.0, 'train_loss': 0.10294030945876549, 'epoch': 5.0})

## Model evaluation on validation and test sets

In [None]:
test_pred = trainer.predict(test_dataset)

# per-class report followed by the confusion matrix (rows = true label, columns = prediction)
print(
    classification_report(test_pred.label_ids, np.argmax(test_pred.predictions, axis=1)),
    confusion_matrix(test_pred.label_ids, np.argmax(test_pred.predictions, axis=1)),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        73
           1       0.88      1.00      0.94        15

    accuracy                           0.98        88
   macro avg       0.94      0.99      0.96        88
weighted avg       0.98      0.98      0.98        88

[[71  2]
 [ 0 15]]


In [None]:
val_pred = trainer.predict(val_dataset)

# per-class report followed by the confusion matrix (rows = true label, columns = prediction)
print(
    classification_report(val_pred.label_ids, np.argmax(val_pred.predictions, axis=1)),
    confusion_matrix(val_pred.label_ids, np.argmax(val_pred.predictions, axis=1)),
    sep='\n',
)

***** Running Prediction *****
  Num examples = 70
  Batch size = 8


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        59
           1       1.00      1.00      1.00        11

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70

[[59  0]
 [ 0 11]]


Performance on validation and test sets is good. There is a small number of articles that are misclassified as being Topic A related. Generally for this particular case, we would like the model to have better recall at the cost of some precision. This depends on the applications and broader context of the problem you're working on.


In [None]:
# save model for future use
# NOTE(review): this is called on the model only, not on the tokenizer - presumably the
# tokenizer must be re-created from 'roberta-base' when the saved model is reloaded; confirm
model.save_pretrained('/content/model_s1')

# Stage 2

We have successfully fine-tuned a RoBERTa base model on our classification task of identifying whether a given news article describes Topic A or not. Our stage 2 task is to build another model that separates Topic A1 from Topic A2. This dramatically reduces the dataset size and is a harder problem, given the similarity of the writing in these two classes, which are both subtopics of the broader Topic A.

In [None]:
# reload in our raw dataset
# update to your path as necessary
df_raw, df_labels = (
    pd.read_csv(path).drop_duplicates(subset=['url'], keep='first')  # remove duplicate scrapes
    for path in ('/content/raw_news.csv', '/content/labels.csv')
)

# attach labels via the shared URL key and make sure they are integers
df_full = pd.merge(df_raw, df_labels, how='inner', left_on='url', right_on='url')
df_full = df_full.astype({'class': int})
print(df_full.shape)

df_full['class'].value_counts()

(436, 3)


0    364
1     47
2     25
Name: class, dtype: int64

In [None]:
# keep only the articles labelled 1 or 2 and relabel them for a binary task:
# map class 1 to 0 and 2 to 1
class_map = {1: 0, 2: 1}
df_stage2 = df_full.loc[df_full['class'].isin(list(class_map))].copy()
df_stage2['class'] = df_stage2['class'].map(class_map)
print(df_stage2.shape)
df_stage2['class'].value_counts()

(72, 3)


0    47
1    25
Name: class, dtype: int64

In [None]:
# strip source cues from every stage 2 article
df_stage2['text'] = df_stage2['text'].map(clean_text)

In [None]:
# split data into train, val and test sets before data augmentation
# both splits are stratified on the label; with so few stage 2 articles the metrics are very
# sensitive to which rows land in each split, so a fixed random_state keeps runs comparable
X_s2_full_train, X_s2_test, y_s2_full_train, y_s2_test = train_test_split(
    df_stage2[['text', 'url']], df_stage2['class'],
    stratify=df_stage2['class'], test_size=0.2, random_state=42)
X_s2_train, X_s2_val, y_s2_train, y_s2_val = train_test_split(
    X_s2_full_train, y_s2_full_train,
    stratify=y_s2_full_train, test_size=0.2, random_state=42)

In [None]:
# shapes of the feature and label splits, then the class balance of each label split
print(*(part.shape for part in (X_s2_train, X_s2_val, X_s2_test)))
print(*(part.shape for part in (y_s2_train, y_s2_val, y_s2_test)))
print(*(labels.value_counts() for labels in (y_s2_train, y_s2_val, y_s2_test)), sep='\n')

(45, 2) (12, 2) (15, 2)
(45,) (12,) (15,)
0    29
1    16
Name: class, dtype: int64
0    8
1    4
Name: class, dtype: int64
0    10
1     5
Name: class, dtype: int64


In [None]:
# augment every example of the stage 2 training split (both classes, classes=[0, 1]) to
# increase its size; the validation and test splits are not augmented
df_s2_train = pd.concat([X_s2_train, y_s2_train], axis=1).reset_index(drop=True)
df_s2_train_aug = augment_df(df_s2_train, classes=[0, 1])
df_s2_train_aug['class'].value_counts()

0    145
1     80
Name: class, dtype: int64

In [None]:
# the stage 2 training features / labels now refer to the augmented frame
X_s2_train = df_s2_train_aug[['text', 'url']]
y_s2_train = df_s2_train_aug['class']
print(X_s2_train.shape, y_s2_train.shape)

# export processed datasets for quick reloading
# files are written to the current working directory without the row index
X_s2_train.to_csv('X_s2_train.csv', index=False)
X_s2_val.to_csv('X_s2_val.csv', index=False)
X_s2_test.to_csv('X_s2_test.csv', index=False)
y_s2_train.to_csv('y_s2_train.csv', index=False)
y_s2_val.to_csv('y_s2_val.csv', index=False)
y_s2_test.to_csv('y_s2_test.csv', index=False)

(225, 2) (225,)


In [None]:
# read in exported datasets
# NOTE: this rebinds X_train, X_val, X_test, y_train, y_val and y_test, which held the
# stage 1 data until now; the labels come back as DataFrames with a 'class' column
X_train = pd.read_csv('X_s2_train.csv')
X_val = pd.read_csv('X_s2_val.csv')
X_test = pd.read_csv('X_s2_test.csv')

y_train = pd.read_csv('y_s2_train.csv')
y_val = pd.read_csv('y_s2_val.csv')
y_test = pd.read_csv('y_s2_test.csv')

In [None]:
# class balance of the reloaded stage 2 train, validation and test labels
print(*(labels['class'].value_counts() for labels in (y_train, y_val, y_test)), sep='\n')

0    145
1     80
Name: class, dtype: int64
0    8
1    4
Name: class, dtype: int64
0    10
1     5
Name: class, dtype: int64


In [None]:
# import pretrained RoBERTa tokenizer (same 'roberta-base' checkpoint as the model fine-tuned below)
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# tokenize all text inputs
# each split is padded to its own longest sequence and truncated to the tokenizer's maximum length
train_encoding, val_encoding, test_encoding = (
    tokenizer(split['text'].tolist(), padding=True, truncation=True)
    for split in (X_train, X_val, X_test)
)

# convert encodings, labels into pytorch Datasets
class newsmlDataset(torch.utils.data.Dataset):
    """Torch dataset that serves tokenizer encodings together with their class labels."""

    def __init__(self, encodings, labels):
        # encodings: field name -> per-example sequences; labels: one label per example
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors: every encoding field plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# wrap each split's encodings together with its labels (plain Python ints) in a torch Dataset
train_dataset, val_dataset, test_dataset = (
    newsmlDataset(encoding, labels['class'].tolist())
    for encoding, labels in (
        (train_encoding, y_train),
        (val_encoding, y_val),
        (test_encoding, y_test),
    )
)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

## Model fine-tuning

In [None]:
# initialize model and trainer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# used if random initialization across multiple runs is desired
def get_random_seed():
  """Return a fresh seed: an int built from 4 bytes of OS randomness (0 <= seed < 2**32)."""
  entropy = os.urandom(4)
  # the hex digits read left to right are the big-endian value of the bytes
  return int(entropy.hex(), 16)

def compute_metrics(pred):
  """Compute accuracy, F1, precision and recall for the Trainer's evaluation loop.

  `pred` unpacks into (logits, labels); the predicted class is the arg-max over the
  logits' second axis. Precision/recall/F1 use average='binary'.
  """
  logits, labels = pred
  predicted = np.argmax(logits, axis=1)
  # the fourth element (support) is not reported
  scores = precision_recall_fscore_support(labels, predicted, average='binary')
  metrics = {'accuracy': accuracy_score(labels, predicted)}
  metrics['f1'] = scores[2]
  metrics['precision'] = scores[0]
  metrics['recall'] = scores[1]
  return metrics

# keyword arguments forwarded to TrainingArguments
args_dict = dict(
    # evaluate and checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    lr_scheduler_type='linear',
    num_train_epochs=5,
    logging_first_step=True,
    save_total_limit=1,
    fp16=True,
    dataloader_num_workers=1,
    # after training, restore the checkpoint selected by eval_loss
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # seed=get_random_seed(),
)

# fresh roberta-base with a new binary classification head (not the stage 1 model)
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2, return_dict=True)
# checkpoints go to a stage-specific folder: with output_dir='/content/' both stages wrote
# identically named checkpoint-<step> folders (e.g. checkpoint-58) into the same place, where
# the save_total_limit rotation of one run could overwrite or delete the other run's checkpoints
training_args = TrainingArguments(output_dir='/content/stage2_checkpoints', **args_dict)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# fine-tune on the augmented stage 2 training set; the validation set is evaluated after every epoch
trainer.train()

***** Running training *****
  Num examples = 225
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 145


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7849,0.557576,0.666667,0.0,0.0,0.0
2,0.7849,0.686097,0.75,0.4,1.0,0.25
3,0.7849,0.540428,0.916667,0.857143,1.0,0.75
4,0.7849,0.845163,0.833333,0.75,0.75,0.75
5,0.7849,0.933707,0.833333,0.75,0.75,0.75


  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/checkpoint-29
Configuration saved in /content/checkpoint-29/config.json
Model weights saved in /content/checkpoint-29/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-29/tokenizer_config.json
Special tokens file saved in /content/checkpoint-29/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8
Saving model checkpoint to /content/checkpoint-58
Configuration saved in /content/checkpoint-58/config.json
Model weights saved in /content/checkpoint-58/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-58/tokenizer_config.json
Special tokens file saved in /content/checkpoint-58/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8
Saving model checkpoint to 

TrainOutput(global_step=145, training_loss=0.19688402373215247, metrics={'train_runtime': 451.0872, 'train_samples_per_second': 2.494, 'train_steps_per_second': 0.321, 'total_flos': 295999937280000.0, 'train_loss': 0.19688402373215247, 'epoch': 5.0})

## Model evaluation on validation and test sets

In [None]:
test_pred = trainer.predict(test_dataset)

# per-class report followed by the confusion matrix (rows = true label, columns = prediction)
print(
    classification_report(test_pred.label_ids, np.argmax(test_pred.predictions, axis=1)),
    confusion_matrix(test_pred.label_ids, np.argmax(test_pred.predictions, axis=1)),
    sep='\n',
)

***** Running Prediction *****
  Num examples = 15
  Batch size = 8


              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.80      0.89         5

    accuracy                           0.93        15
   macro avg       0.95      0.90      0.92        15
weighted avg       0.94      0.93      0.93        15

[[10  0]
 [ 1  4]]


In [None]:
val_pred = trainer.predict(val_dataset)

# per-class report followed by the confusion matrix (rows = true label, columns = prediction)
print(
    classification_report(val_pred.label_ids, np.argmax(val_pred.predictions, axis=1)),
    confusion_matrix(val_pred.label_ids, np.argmax(val_pred.predictions, axis=1)),
    sep='\n',
)

***** Running Prediction *****
  Num examples = 12
  Batch size = 8


              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88      0.90        12
weighted avg       0.93      0.92      0.91        12

[[8 0]
 [1 3]]


# Error analysis

We inspect the misclassified articles to see where the model's weaknesses lie and what sources of ambiguity could cause the model prediction to fail.

In [None]:
# inspect the news articles which the model misclassifies
# X_val is the frame val_dataset was tokenized from, so its row order matches val_pred;
# unlike X_s2_val it also exists when the datasets were reloaded from CSV in a fresh session
print(X_val.loc[val_pred.label_ids != val_pred.predictions.argmax(1), 'text'].values)

Depending on the examples you inspect, you may spot important details that hint at where your model is not working as expected or has trouble making the distinction between topics. 

We experiment with training the model for an additional 5 epochs to see if performance improves.   

It doesn't. The model has settled at an optimum and further training does not affect its performance. 

In [None]:
# Train for an additional 5 epochs
# NOTE(review): this calls train() a second time on the same Trainer object; the logged
# validation loss and metrics are identical at every epoch of this run - worth confirming
# that the weights are actually being updated before drawing conclusions
trainer.train()

***** Running training *****
  Num examples = 225
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 145


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0039,0.540432,0.916667,0.857143,1.0,0.75
2,0.0039,0.540432,0.916667,0.857143,1.0,0.75
3,0.0039,0.540432,0.916667,0.857143,1.0,0.75
4,0.0039,0.540432,0.916667,0.857143,1.0,0.75
5,0.0039,0.540432,0.916667,0.857143,1.0,0.75


***** Running Evaluation *****
  Num examples = 12
  Batch size = 8


Saving model checkpoint to /content/checkpoint-29
Configuration saved in /content/checkpoint-29/config.json
Model weights saved in /content/checkpoint-29/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-29/tokenizer_config.json
Special tokens file saved in /content/checkpoint-29/special_tokens_map.json
Deleting older checkpoint [/content/checkpoint-87] due to args.save_total_limit
Deleting older checkpoint [/content/checkpoint-145] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8
Saving model checkpoint to /content/checkpoint-58
Configuration saved in /content/checkpoint-58/config.json
Model weights saved in /content/checkpoint-58/pytorch_model.bin
tokenizer config file saved in /content/checkpoint-58/tokenizer_config.json
Special tokens file saved in /content/checkpoint-58/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 12
  Batch size = 8
Saving model checkpoint to /content/checkpoint-87
Confi

TrainOutput(global_step=145, training_loss=0.01939842865384858, metrics={'train_runtime': 451.7506, 'train_samples_per_second': 2.49, 'train_steps_per_second': 0.321, 'total_flos': 295999937280000.0, 'train_loss': 0.01939842865384858, 'epoch': 5.0})

In [38]:
# save the fine-tuned stage 2 model for future use
model.save_pretrained('/content/model_s2')

Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin


While this problem could be explored using simpler methods such as regex word detection or TF-IDF feature vectors, this was an exercise to observe how closely the model was able to pick up on the custom labelling methods / criteria used in the labelling process. Was it able to pick up on nuanced cases? Did you have to make a judgment call between Topics A1 and A2? How did the model fare on cases that were ambiguous to you as a human?