In [23]:
import pandas as pd
import numpy as np
import io
import os
import re
from google.colab import drive
!pip install transformers
!pip install SentencePiece
import torch
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import defaultdict
from textwrap import wrap
from pylab import rcParams

from torch import nn, optim
from tqdm.notebook import tqdm
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import (AutoConfig, 
                          AutoModelForSequenceClassification, 
                          AutoTokenizer, AdamW, 
                          get_linear_schedule_with_warmup,
                          set_seed,
                          )
from keras.preprocessing.sequence import pad_sequences
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Reference
https://gmihaila.medium.com/fine-tune-transformers-in-pytorch-using-transformers-57b40450635

https://medium.com/swlh/using-xlnet-for-sentiment-classification-cfa948e65e85


In [45]:
set_seed(123)
# Number of training epochs (authors recommend between 2 and 4)
epochs = 2
# Number of batches - depending on the max sequence length and GPU memory.
# For 512 sequence length batch of 10 works without cuda memory issues.
# For small sequence length can try batch of 32 or higher.
batches = 10
# Pad or truncate text sequences to a specific length
# if `None` it will use maximum sequence of word piece tokens allowed by model.
max_len = 300
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Name of transformers model - will use already pretrained model.
# Path of transformer model - will load your own model from local disk.
model_name = 'bert-base-cased'
# Dicitonary of labels and their id - this will be used to convert.
# String labels to number ids.
# labels_ids = {'neg': 0, 'pos': 1}
# How many labels are we using in training.
# This is used to decide size of classification head.
n_labels = 2
class_names = ['negative', 'positive']

In [25]:
path = '/content/drive/MyDrive/colab_data'
def de_emojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text
def text_proc(df, text_col='text'):
    df['orig_text'] = df[text_col]
    # Remove twitter handles
    df[text_col] = df[text_col].apply(lambda x: clean_text(x))
    # Remove URLs
    df[text_col] = df[text_col].apply(lambda x:x.replace('<br />', ' '))
    return df[df[text_col]!='']

In [26]:
data = pd.read_csv(os.path.join(path, "covid-19_articles_data.csv"))
data = text_proc(data,'text').dropna(subset=['sentiment']).sample(2000, random_state = 10).reset_index(drop=True)
data['len'] = data.text.apply(lambda x: len(x.split(' ')))
print(len(data))
data.head(3)

2000


Unnamed: 0.1,Unnamed: 0,text,sentiment,orig_text,len
0,30649,Trump throws up his hands as an ad from Biden'...,1,Trump throws up his hands as an ad from Biden'...,150
1,14382,The fallout from the outbreak aboard the Roose...,1,The fallout from the outbreak aboard the Roose...,274
2,26250,Test result throws US into fresh upheaval On T...,1,Test result throws US into fresh upheaval On T...,257


In [27]:
df_train, df_val = train_test_split(data, test_size=0.33, random_state=42)

In [None]:
# max_len = int(df_train.len.quantile(0.95))

In [28]:
class Transformer_Dataset(Dataset):

    def __init__(self, text, target, tokenizer, max_len):
        self.text = text
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item):
        text = str(self.text[item])
        target = self.target[item]

        encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=False,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
        )

        input_ids = pad_sequences(encoding['input_ids'], maxlen=max_len, dtype=torch.Tensor ,truncating="post",padding="post")
        input_ids = input_ids.astype(dtype = 'int64')
        input_ids = torch.tensor(input_ids) 

        attention_mask = pad_sequences(encoding['attention_mask'], maxlen=max_len, dtype=torch.Tensor ,truncating="post",padding="post")
        attention_mask = attention_mask.astype(dtype = 'int64')
        attention_mask = torch.tensor(attention_mask)       

        return {
        'text': text,
        'input_ids': input_ids,
        'attention_mask': attention_mask.flatten(),
        'target': torch.tensor(target, dtype=torch.long)
        }

In [29]:
def create_data_loader(df, target, tokenizer, max_len, batch_size):
  ds = Transformer_Dataset(
    text=df.text.to_numpy(),
    target=df[target].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2,
    drop_last = True
  )

In [30]:
# Get model configuration.
print('Loading configuraiton...')
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, 
                                          num_labels=n_labels)

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Get the actual model.
print('Loading model...')
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, 
                                                           config=model_config)

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`'%device)

Loading configuraiton...
Loading tokenizer...
Loading model...


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Model loaded to `cuda`


In [31]:
train_data_loader = create_data_loader(df_train, 'sentiment', tokenizer, max_len, batches)
val_data_loader = create_data_loader(df_val, 'sentiment', tokenizer, max_len, batches)

In [32]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
                                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

total_steps = len(train_data_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [33]:
from sklearn import metrics
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples, batch_size, maxlen):
    model = model.train()
    losses = []
    acc = 0
    counter = 0
  
    for d in data_loader:
        input_ids = d["input_ids"].reshape(batch_size,maxlen).to(device)
        attention_mask = d["attention_mask"].to(device)
        target = d['target'].to(device)
        
        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = target)
        loss = outputs[0]
        logits = outputs[1]

        # preds = preds.cpu().detach().numpy()
        _, prediction = torch.max(outputs[1], dim=1)
        target = target.cpu().detach().numpy()
        prediction = prediction.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(target, prediction)

        acc += accuracy
        losses.append(loss.item())
        
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        counter = counter + 1

    return acc / counter, np.mean(losses)

In [34]:
def eval_model(model, data_loader, device, n_examples,batch_size,maxlen):
    model = model.eval()
    losses = []
    acc = 0
    counter = 0
  
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].reshape(batch_size,maxlen).to(device)
            attention_mask = d["attention_mask"].to(device)
            target = d['target'].to(device)
            
            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels = target)
            loss = outputs[0]
            logits = outputs[1]

            _, prediction = torch.max(outputs[1], dim=1)
            target = target.cpu().detach().numpy()
            prediction = prediction.cpu().detach().numpy()
            accuracy = metrics.accuracy_score(target, prediction)

            acc += accuracy
            losses.append(loss.item())
            counter += 1

    return acc / counter, np.mean(losses)

In [35]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(epochs)):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,     
        optimizer, 
        device, 
        scheduler, 
        len(df_train),
        batches,
        max_len
    )

    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader, 
        device, 
        len(df_val),
        batches,
        max_len
    )

    print(f'Val loss {val_loss} Val accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), os.path.join(path, 'xlnet_model.bin'))
        best_accuracy = val_acc

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Epoch 1/2
----------
Train loss 0.6660105775096523 Train accuracy 0.6029850746268658
Val loss 0.5920500479864351 Val accuracy 0.6575757575757577

Epoch 2/2
----------
Train loss 0.4486813924984256 Train accuracy 0.7962686567164181
Val loss 0.4407845710714658 Val accuracy 0.8045454545454545


CPU times: user 2min 43s, sys: 13 s, total: 2min 56s
Wall time: 2min 59s


In [47]:
def predict_sentiment(text):
    text = text

    encoded_review = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=max_len,
    return_token_type_ids=False,
    pad_to_max_length=False,
    return_attention_mask=True,
    return_tensors='pt',
    truncation=True
    )

    input_ids = pad_sequences(encoded_review['input_ids'], maxlen=max_len, dtype=torch.Tensor ,truncating="post",padding="post")
    input_ids = input_ids.astype(dtype = 'int64')
    input_ids = torch.tensor(input_ids) 

    attention_mask = pad_sequences(encoded_review['attention_mask'], maxlen=max_len, dtype=torch.Tensor ,truncating="post",padding="post")
    attention_mask = attention_mask.astype(dtype = 'int64')
    attention_mask = torch.tensor(attention_mask) 

    input_ids = input_ids.reshape(1,max_len).to(device)
    attention_mask = attention_mask.to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    outputs = outputs[0][0].cpu().detach()

    probs = F.softmax(outputs, dim=-1).cpu().detach().numpy().tolist()
    _, prediction = torch.max(outputs, dim =-1)

    print("Positive score:", probs[1])
    print("Negative score:", probs[0])
    print(f'text: {text}')
    print(f'target  : {class_names[prediction]}')

In [48]:
df_val.head(3)

Unnamed: 0.1,Unnamed: 0,text,sentiment,orig_text,len
1860,17440,They wanted to use the records to investigate ...,0,They wanted to use the records to investigate ...,175
353,27433,In states that allow the counting of ballots r...,0,In states that allow the counting of ballots r...,159
1333,17478,Anderson is one of nearly 60 pets ACC has take...,1,Anderson is one of nearly 60 pets ACC has take...,263


In [46]:
predict_sentiment(df_val.text[1860])

Positive score: 0.1023903414607048
Negative score: 0.8976095914840698
Review text: They wanted to use the records to investigate allegations of misconduct before Trump became president but Trump intervened to try to block the subpoenas. He lost in lower courts and thus here we are. The outcome of these cases could draw the line on how much immunity a sitting president has against scrutiny from government. And yes today's proceedings will happen over the phone. 5. Mexico Mexico's president has ordered the military back on the streets to tackle a rising tide of violence in the country. President Andres Manuel Lopez Obrador signed a decree that would install the National Guard on Mexican streets for the next five years a time frame designed to help the force improve its capabilities. Lopez Obrador created the National Guard shortly after he took office in 2018 to combat Mexico's historic levels of violence pulling members from units of the armed forces. Its goal is to reduce violence by p