In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

# Pick the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Getting the Dataset

In [2]:
# The dataset file is JSON Lines (one record per line), hence lines=True.
df = pd.read_json(
    '../input/news-category-dataset/News_Category_Dataset_v2.json',
    lines=True,
)
# Number of records loaded.
print(len(df))
# Peek at a few random rows.
df.sample(5)

In [4]:
# Distinct categories in order of first appearance; a category's position
# in this list is used as its integer id.
cat_ids = df['category'].unique().tolist()

In [6]:
# Map each category name to its position in `cat_ids`.
# A dict lookup via Series.map is vectorised and O(rows), whereas the
# previous row-wise apply + list.index scanned the category list in Python
# for every single row.
category_to_id = {category: idx for idx, category in enumerate(cat_ids)}
df['label'] = df['category'].map(category_to_id)

df.sample(10)

In [7]:
# Pull the raw model inputs (headline text) and targets (integer labels)
# out of the frame as arrays.
headlines, labels = df['headline'].values, df['label'].values

In [11]:
# Inspect the element type of the headlines array before tokenising it.
headlines.dtype

In [9]:
from transformers import BertTokenizer

# Lower-case all text: the capitalisation style of our target inputs may
# differ from Huff Post's, so case should not influence the tokens.
tokenizer = BertTokenizer.from_pretrained(
    '/kaggle/input/bert-base-uncased',
    do_lower_case=True,
)

In [10]:
# Tokenise every headline in one batched call.
# (The former empty-list initialisers were dead code: both names are
# assigned straight from the tokenizer output below.)
encoded_dict = tokenizer(list(headlines),
                         add_special_tokens = True, #For classification markers
                         padding='longest', #Pad every sequence to the longest headline
                         return_attention_mask = True, #To indicate useful data
                         return_tensors = 'pt') #Pytorch
input_tokens = encoded_dict['input_ids'] #tokenized input
attention_masks = encoded_dict['attention_mask'] #indicates padded tokens
input_tokens, attention_masks

In [None]:
# Sanity check: show the shapes of the token ids, attention masks and the
# label tensor that are combined into one dataset in the next cell.
input_tokens.shape, attention_masks.shape, torch.tensor(labels).shape

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle inputs, masks and labels so they are indexed and shuffled together.
dataset = TensorDataset(input_tokens, attention_masks, torch.tensor(labels))

# Derive the sizes from the data itself rather than hard-coding the row
# count, so the split stays valid if the dataset is filtered or updated.
data_length = len(dataset)
train_length = int(0.8*data_length) # 80% train / 20% validation

# Seed the split so the same examples land in train/valid on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(dataset,
                                            [train_length, data_length-train_length],
                                            generator = split_generator)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Training batches are drawn in random order each epoch; shuffle=True makes
# the DataLoader build a RandomSampler over the dataset internally.
train_dl = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Validation order does not matter, so iterate sequentially
# (shuffle=False selects a SequentialSampler).
valid_dl = DataLoader(valid_dataset, batch_size=64, shuffle=False)

# Training the model

In [None]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a classification head sized to the number of
# distinct categories.
# NOTE(review): the tokenizer is loaded from a local Kaggle input directory
# while the model is fetched by hub name; confirm internet access is
# available or point both at the same source.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels = len(cat_ids),
                                                      output_attentions = False, # Whether the model returns attentions weights.
                                                      output_hidden_states = False)

# Move the model to the device selected earlier (GPU if available, else CPU).
# An unconditional model.cuda() raises on CPU-only machines and disagrees
# with the training loop, which sends every batch to `device`.
model.to(device)

In [None]:
# Use PyTorch's AdamW: the transformers.AdamW implementation is deprecated
# and has been removed from recent transformers releases.
from torch.optim import AdamW

# weight_decay=0.0 reproduces the default of the old transformers.AdamW
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8,
                  weight_decay = 0.0)

In [None]:
from transformers import get_linear_schedule_with_warmup
#Linearly reduces learning rate from set value to 0 over training

# Number of full passes over the training set.
epochs = 2

# One scheduler step is taken per training batch, so the schedule spans
# (epochs x batches-per-epoch) steps in total.
total_steps = epochs * len(train_dl)

# No warm-up phase: the learning rate starts decaying from the first step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of integer class ids; flattened before comparison.

    Returns:
        Accuracy as a float in [0, 1].
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = predicted_classes == true_classes
    return hits.sum() / len(true_classes)

In [None]:
# Fine-tuning loop: each epoch runs one pass over the training set followed
# by a full evaluation on the validation set.
for epoch in range(epochs):
    print("\nEpoch num {:}".format(epoch+1))

    ##TRAINING

    model.train() #Training mode (enables dropout)

    total_train_loss = 0

    for batch in train_dl:
        # Each batch is (input ids, attention mask, labels); move to the device
        batch_input_tokens = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        model.zero_grad() #Clear grads left over from the previous step

        # Passing labels makes the model compute the loss itself
        result = model(batch_input_tokens,
                       token_type_ids=None,
                       attention_mask=batch_input_mask,
                       labels=batch_labels,
                       return_dict=True)

        loss = result.loss

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(),1.) #Prevent exploding gradients

        optimizer.step() #Changes params

        scheduler.step() #Changes learning rate (to smaller)

    # Running average of the per-step training loss
    avg_train_loss = total_train_loss/len(train_dl)

    print("Training loss: {0:.2f}".format(avg_train_loss))

    ##VALIDATION

    model.eval() #Evaluation mode (disables dropout)

    # Accumulate sample-weighted sums: the last validation batch is usually
    # smaller, so averaging per-batch means would over-weight its examples.
    total_val_correct = 0
    total_val_loss = 0
    total_val_examples = 0

    for batch in valid_dl:
        batch_input_tokens = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # No gradients needed for evaluation
        with torch.no_grad():
            result = model(batch_input_tokens,
                           token_type_ids=None,
                           attention_mask=batch_input_mask,
                           labels=batch_labels,
                           return_dict=True)

        n_examples = batch_labels.size(0)

        # result.loss is a per-batch mean; scale back to a per-batch sum
        total_val_loss += result.loss.item() * n_examples

        #Back to cpu to calculate accuracy
        logits = result.logits.detach().cpu().numpy()
        b_labels = batch_labels.to('cpu').numpy()
        total_val_correct += flat_accuracy(logits,b_labels) * n_examples

        total_val_examples += n_examples

    avg_val_acc = total_val_correct/total_val_examples
    avg_val_loss = total_val_loss/total_val_examples
    print("Accuracy: {0:.2f}".format(avg_val_acc))
    print("Valid loss: {0:.2f}".format(avg_val_loss))


In [None]:
# Persist only the model's state dict (its parameter tensors), not the
# whole model object, to the working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, 'headline_classifier.pt')