In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the data and
# output folders configured below all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# SOURCE: https://towardsdatascience.com/bert-text-classification-using-pytorch-723dfb8b6b5b

# Parameters


In [None]:
# Folder the preprocessed CSV splits are loaded from when the torchtext
# TabularDataset objects are built later in the notebook.
source_folder = '/content/drive/My Drive/'
# Folder that receives the preprocessed CSVs, model checkpoints and metrics.
# NOTE(review): this name is assigned again (as a relative path) in the next cell.
destination_folder = '/content/drive/My Drive/'

In [None]:
# Raw fake-news CSV; the path is relative to the current working directory.
raw_data_path = 'drive/My Drive/news.csv'
# Filename prefix of the raw COVID tweet CSVs; the split name ('train' / 'test')
# and '.csv' are appended when the files are read.
raw_data_path_covid = 'drive/My Drive/Corona_NLP_'
# NOTE(review): replaces the absolute destination_folder assigned in the previous
# cell with a relative path — confirm both resolve to the same Drive folder.
destination_folder = 'drive/My Drive/'

# Number of leading whitespace-separated words that trim_string keeps.
first_n_words = 200

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing news dataset




In [None]:
def trim_string(x, n_words=None):
    """Keep only the first ``n_words`` whitespace-separated words of ``x``.

    Args:
        x: Text to shorten. Non-string values (for example the NaN that pandas
            uses for a missing cell) are treated as empty text.
        n_words: Maximum number of words to keep. When omitted, the
            notebook-level ``first_n_words`` setting is used.

    Returns:
        The kept words joined by single spaces; runs of whitespace in the
        original text are therefore collapsed.
    """
    if n_words is None:
        n_words = first_n_words

    # Missing cells reach .apply() as float NaN / None; calling .split() on
    # them would raise, so map them to an empty string instead.
    if not isinstance(x, str):
        return ''

    # maxsplit bounds the work on very long documents: at most n_words real
    # words are produced plus one unsplit remainder, which the slice discards.
    return ' '.join(x.split(maxsplit=n_words)[:n_words])


In [None]:
# Load the raw fake-news CSV
df_raw = pd.read_csv(raw_data_path)

# Binary target (1 for 'FAKE', 0 otherwise) plus a combined title + body column
df_raw['label'] = df_raw['label'].eq('FAKE').astype('int')
df_raw['titletext'] = df_raw['title'] + ". " + df_raw['text']
df_raw = df_raw.reindex(columns=['label', 'title', 'text', 'titletext'])

# Discard rows whose body text is shorter than 5 characters
too_short = df_raw['text'].str.len() < 5
df_raw = df_raw.loc[~too_short].copy()

# Shorten both text columns to their first first_n_words words
for column in ('text', 'titletext'):
    df_raw[column] = df_raw[column].apply(trim_string)

# Separate the two classes so that every split below is drawn per class
df_real = df_raw.loc[df_raw['label'].eq(0)]
df_fake = df_raw.loc[df_raw['label'].eq(1)]

# Per class: 10% goes to the (full) training portion, the remainder is the test set
df_real_full_train, df_real_test = train_test_split(
    df_real, train_size=0.1, random_state=1)
df_fake_full_train, df_fake_test = train_test_split(
    df_fake, train_size=0.1, random_state=1)

# Per class: 80% of the training portion is used for training, 20% for validation
df_real_train, df_real_valid = train_test_split(
    df_real_full_train, train_size=0.8, random_state=1)
df_fake_train, df_fake_valid = train_test_split(
    df_fake_full_train, train_size=0.8, random_state=1)

# Recombine the per-class pieces into the final splits
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Persist each split as <destination_folder>/<split>.csv
for split_name, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    split_frame.to_csv(destination_folder + '/' + split_name + '.csv', index=False)


# Preprocessing COVID tweets dataset


In [None]:
import string
from nltk.tokenize import TweetTokenizer

# NEW CODE ---------------------------------------------------------------------

def clean_string(x):
  """Strip punctuation from a tweet and drop URL and non-alphanumeric tokens.

  Args:
    x: Raw tweet text. Non-string values (e.g. NaN for a missing cell) are
      treated as empty text.

  Returns:
    The remaining tokens joined by single spaces.
  """
  if not isinstance(x, str):
    return ''

  # Remove every ASCII punctuation character from each whitespace-separated word.
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in x.split()]

  # Apply BOTH filters to the same list. Previously the URL filter was computed
  # and then overwritten by the alphanumeric filter, so links such as
  # "https://t.co/abc" (-> "httpstcoabc" once punctuation is gone) were kept.
  # Tokens emptied by the punctuation removal fail isalnum() and are dropped.
  kept = [w for w in stripped if 'https' not in w and w.isalnum()]

  return ' '.join(kept)

# NEW CODE ---------------------------------------------------------------------


In [None]:
# NEW CODE ---------------------------------------------------------------------

# Read raw data

for each in ['train', 'test']:
  filename = raw_data_path_covid + each + '.csv'
  print(filename)
  # The raw files are decoded as ISO-8859-1.
  df = pd.read_csv(filename, encoding= "ISO-8859-1")

  # Encode sentiment column to have numerical values.
  # NOTE(review): the category -> code mapping is derived separately for each
  # file; it only agrees between train and test if both files contain the same
  # set of Sentiment values — confirm.
  df['Sentiment'] = pd.Categorical(df['Sentiment'])
  df['label'] = df['Sentiment'].cat.codes

  # Drop other columns, not going to be used
  df = df.drop(columns = ['UserName', 'ScreenName', 'TweetAt', 'Location'])

  # Drop rows with (nearly) empty text
  df.drop( df[df.OriginalTweet.str.len() < 5].index, inplace=True)

  # Trim to first_n_words, then strip punctuation / unwanted tokens
  df['OriginalTweet'] = df['OriginalTweet'].apply(trim_string)
  df['OriginalTweet'] = df['OriginalTweet'].apply(clean_string)

  df = df [['label', 'OriginalTweet']]
  # Write preprocessed data
  df.to_csv(destination_folder + '/covid_' + each + '.csv', index=False)
  print(df.shape)

# Create a validation set out of the training set: the dataset only ships
# train and test files, so 10% of train is held out, giving roughly an
# 80/10/10 train/valid/test split.

# The files above were written by to_csv with its default UTF-8 encoding, so
# they are read back with the default encoding as well (re-reading them as
# ISO-8859-1 would garble any non-ASCII character).
df_train = pd.read_csv(destination_folder + '/covid_train.csv')
df_test = pd.read_csv(destination_folder + '/covid_test.csv')

df_train, df_valid = train_test_split(df_train, train_size = 0.9, random_state = 1)

# Persist BOTH halves of the split. Without rewriting covid_train.csv the
# held-out validation rows would still be part of the training file that is
# loaded later, i.e. the model would be validated on data it was trained on.
df_train.to_csv(destination_folder + '/covid_train.csv', index=False)
df_valid.to_csv(destination_folder + '/covid_valid.csv', index=False)

print('train set size', df_train.shape)
print('test set size', df_test.shape)
print('validation set size', df_valid.shape)

print(df_valid)
# NEW CODE ---------------------------------------------------------------------


drive/My Drive/Corona_NLP_train.csv
(41157, 2)
drive/My Drive/Corona_NLP_test.csv
(3798, 2)
train set size (37041, 2)
test set size (3798, 2)
validation set size (4116, 2)
       label                                      OriginalTweet
32828      0  My mum used to say come with us to the superma...
766        2  The breakfast program at my kids school is sus...
35742      4  Russia to develop additional business support ...
39955      2  Top oilproducing countries agreed on historic ...
36059      3  Busy legostreet at first glance but actually i...
...      ...                                                ...
4325       2  How to combat stockpiling BackToBasics tough a...
11862      2  Our Director Niki is taking a look at Global O...
25920      3  Some local farms have seen an influx of busine...
17588      2  COVID2019 Coronavirus Corona Danish supermarke...
37083      3  SocialDistancing is pointing because people st...

[4116 rows x 2 columns]


# Libraries

In [None]:
!pip install transformers



In [None]:
# Libraries

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


# Preliminaries

In [None]:
# NEW CODE ---------------------------------------------------------------------

""" choice of model and dataset
    news is binary classification and covid is multiclass classification
"""

# Architecture to fine-tune; later cells branch on this value to pick the
# tokenizer and the model class ('BERT' or 'XL_net').
MODEL = 'BERT'
# MODEL = 'XL_net'


# Dataset to use; later cells branch on this value to pick the CSV files,
# the field layout and the loss ('NEWS' or 'COVID').
#DATASET = 'NEWS'
DATASET = 'COVID'

# NEW CODE ---------------------------------------------------------------------


In [None]:
# NEW CODE ---------------------------------------------------------------------

"""tokenizers
"""

# Load the pretrained tokenizer that matches the selected MODEL.
if MODEL == 'BERT':
  # NOTE(review): max_len=1024 is larger than the MAX_SEQ_LEN of 512 that the
  # torchtext fields are fixed to in the next cell — confirm the value is intended.
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_len=1024)

elif MODEL == 'XL_net':
  # The XLNet classes come from the separate pytorch_transformers package,
  # which is installed on demand only when this branch is taken.
  !pip install pytorch_transformers
  from pytorch_transformers import XLNetConfig, XLNetModel, XLNetTokenizer
  tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# NEW CODE ---------------------------------------------------------------------


In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Model parameters
MAX_SEQ_LEN = 512
BATCH_SIZE = 8
# Ids the tokenizer assigns to its padding / unknown tokens; torchtext uses
# them directly because the fields below bypass its own vocabulary.
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields: labels are plain integers, text is converted to token ids by
# tokenizer.encode and padded / cut to MAX_SEQ_LEN.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long)
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# Per-dataset settings: CSV column layout, split file names, and the column
# whose length is used to bucket examples.
if DATASET == 'NEWS':
  fields = [('label', label_field), ('title', text_field), ('text', text_field), ('titletext', text_field)]
  split_files = dict(train='train.csv', validation='valid.csv', test='test.csv')
  sort_column = 'text'

elif DATASET == 'COVID':
  fields = [('label', label_field), ('OriginalTweet', text_field)]
  split_files = dict(train='covid_train.csv', validation='covid_valid.csv', test='covid_test.csv')
  sort_column = 'OriginalTweet'

# TabularDataset + iterators (built only for a recognised DATASET)
if DATASET in ('NEWS', 'COVID'):
  train, valid, test = TabularDataset.splits(path=source_folder, format='CSV', fields=fields,
                                             skip_header=True, **split_files)

  def _example_length(example):
    """Length of the bucketing column of one example."""
    return len(getattr(example, sort_column))

  # Train and validation batches are length-bucketed; the test iterator keeps file order.
  bucket_options = dict(batch_size=BATCH_SIZE, sort_key=_example_length, device=device,
                        train=True, sort=True, sort_within_batch=True)
  train_iter = BucketIterator(train, **bucket_options)
  valid_iter = BucketIterator(valid, **bucket_options)
  test_iter = Iterator(test, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)



cuda:0


In [None]:
# Sanity check: pull the first batch from the test iterator and print its summary.
myit = iter(test_iter)

print(next(myit))



[torchtext.data.batch.Batch of size 8]
	[.label]:[torch.cuda.LongTensor of size 8 (GPU 0)]
	[.OriginalTweet]:[torch.cuda.LongTensor of size 8x512 (GPU 0)]


In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Save the model weights together with the validation loss they achieved.

    Args:
        save_path: Destination file. When ``None`` nothing is written.
        model: Object exposing ``state_dict()`` (a ``torch.nn.Module``).
        valid_loss: Validation loss stored next to the weights.

    Returns:
        None. The file holds a dict with the keys ``'model_state_dict'`` and
        ``'valid_loss'``.
    """
    # Identity test: `== None` would invoke a custom __eq__ on path-like objects.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Restore model weights from a file written by ``save_checkpoint``.

    Args:
        load_path: Checkpoint file. When ``None`` nothing is loaded.
        model: Module whose ``load_state_dict`` receives the stored weights.

    Returns:
        The ``'valid_loss'`` value stored in the checkpoint, or ``None`` when
        ``load_path`` is ``None``.
    """
    if load_path is None:
        return

    # map_location places the stored tensors on the notebook-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Save the recorded loss curves so they can be plotted later.

    Args:
        save_path: Destination file. When ``None`` nothing is written.
        train_loss_list: Average training loss at each evaluation point.
        valid_loss_list: Average validation loss at each evaluation point.
        global_steps_list: Global step at which each pair was recorded.

    Returns:
        None. The file holds a dict keyed by the three argument names.
    """
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # This file contains metrics, not model weights; say so in the log line.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Load the loss curves written by ``save_metrics``.

    Args:
        load_path: Metrics file. When ``None`` nothing is loaded.

    Returns:
        A ``(train_loss_list, valid_loss_list, global_steps_list)`` tuple, or
        ``None`` when ``load_path`` is ``None`` (callers that unpack the result
        must therefore pass a real path).
    """
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    # This file contains metrics, not model weights; say so in the log line.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
# Loss object matching the task type of the selected dataset.
# It becomes the default `criterion` argument of train(); the training loop
# itself optimizes the loss returned by the model's forward pass.
if DATASET == 'COVID':
    # multiclass sentiment labels
    criterion = nn.CrossEntropyLoss()
elif DATASET == 'NEWS':
    # binary FAKE / REAL labels
    criterion = nn.BCELoss()


In [None]:
# Training Function

def _unpack_batch(batch):
    """Return ``(labels, input_ids)`` of one torchtext batch as long tensors on ``device``."""
    # Iterating a batch yields the tuple of field tensors first, then an unused slot.
    inputs, _ = batch
    if DATASET == 'NEWS':
        # Column order of the NEWS CSVs: label, title, text, titletext.
        labels, _title, _text, input_ids = inputs
    elif DATASET == 'COVID':
        # Column order of the COVID CSVs: label, OriginalTweet.
        labels, input_ids = inputs
    else:
        raise ValueError(f'Unsupported DATASET: {DATASET!r}')

    labels = labels.type(torch.LongTensor).to(device)
    input_ids = input_ids.type(torch.LongTensor).to(device)
    return labels, input_ids


def train(model,
          optimizer,
          criterion = criterion,
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Every batch performs one optimizer step. Every ``eval_every`` steps the
    model is evaluated on the whole validation loader, the average train and
    validation losses are recorded, and — if the validation loss is the best
    seen so far — the weights and metrics are written to ``file_path``.

    Args:
        model: Sequence classifier; called as ``model(input_ids=..., labels=...)``
            and expected to return the loss as the first element of its output.
        optimizer: Optimizer over the model parameters.
        criterion: Accepted for interface compatibility; unused, because the
            loss comes from the model's own forward pass.
        train_loader: Iterator over training batches.
        valid_loader: Iterator over validation batches.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Number of optimizer steps between validation runs
            (values below 1 are treated as 1).
        file_path: Folder receiving ``model_xl.pt`` and ``metrics_xl.pt``.
        best_valid_loss: Validation loss that must be beaten before the first
            checkpoint is written.

    Returns:
        None.
    """
    # A very small train loader makes the default `len(train_iter) // 2` zero,
    # which would raise ZeroDivisionError in the modulo below.
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            labels, input_ids = _unpack_batch(batch)
            loss = model(input_ids=input_ids, labels=labels)[0]

            # One parameter update per batch. (Previously this ran once per
            # epoch, using only the loss of the last batch.)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    # validation loop: a single pass over the validation data only
                    for valid_batch in valid_loader:
                        valid_labels, valid_ids = _unpack_batch(valid_batch)
                        valid_loss = model(input_ids=valid_ids, labels=valid_labels)[0]
                        valid_running_loss += valid_loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))

                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model_xl.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics_xl.pt', train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(file_path + '/' + 'metrics_xl.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [None]:
import torch

# Build the pretrained sequence classifier that matches the selected MODEL.
if MODEL == 'BERT':
  # NOTE(review): num_labels is fixed at 5 regardless of DATASET — confirm this
  # is intended when DATASET is the binary 'NEWS' task.
  model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=5,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

elif MODEL == 'XL_net':
  model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'xlnet-base-cased', output_attentions=True)  # Update configuration during loading
  # NOTE(review): this sets `num_choices` after loading, whereas the BERT branch
  # passes `num_labels` at load time — verify which setting sizes the classifier head.
  model.config.num_choices = 5
  print(model.config)

# Adam over all model parameters; the model is moved to `device` before the
# training loop (defined above) is started with its default loaders.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
model = model.to(device)
train(model=model, optimizer=optimizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

RuntimeError: ignored

In [None]:
# Reload the loss curves written during training and plot both against the global step.
train_loss_list, valid_loss_list, global_steps_list = load_metrics(destination_folder + '/metrics_xl.pt')

for curve_label, curve_values in (('Train', train_loss_list), ('Valid', valid_loss_list)):
    plt.plot(global_steps_list, curve_values, label=curve_label)

plt.xlabel('Global Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Evaluation Function

def evaluate(model, model_name, test_loader):
    """Run ``model`` over ``test_loader`` and report classification quality.

    Prints a classification report and draws a confusion-matrix heatmap. The
    batch layout and the class labels follow the notebook-level ``DATASET``
    setting, so the function works for both the binary NEWS task and the
    multiclass COVID task.

    Args:
        model: Sequence classifier; called as ``model(input_ids=..., labels=...)``
            and expected to return the logits as the second element of its output.
        model_name: Accepted for interface compatibility; currently unused.
        test_loader: Iterator over test batches.

    Returns:
        None.
    """
    y_pred = []
    y_true = []

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    model.eval()
    with torch.no_grad():
        # Iterating a batch yields the tuple of field tensors first, then an unused slot.
        for inputs, _ in test_loader:
            if DATASET == 'NEWS':
                # Column order of the NEWS CSVs: label, title, text, titletext.
                labels, _title, _text, input_ids = inputs
            elif DATASET == 'COVID':
                # Column order of the COVID CSVs: label, OriginalTweet.
                labels, input_ids = inputs
            else:
                raise ValueError(f'Unsupported DATASET: {DATASET!r}')

            labels = labels.type(torch.LongTensor).to(device)
            input_ids = input_ids.type(torch.LongTensor).to(device)
            logits = model(input_ids=input_ids, labels=labels)[1]

            y_pred.extend(torch.argmax(logits, 1).tolist())
            y_true.extend(labels.tolist())

    if DATASET == 'NEWS':
        # Label 1 was assigned to 'FAKE' rows during preprocessing; 0 is the rest.
        class_ids = [1, 0]
        class_names = ['FAKE', 'REAL']
    else:
        # Multiclass: report every class id that occurs, labelled by its integer code.
        class_ids = sorted(set(y_true) | set(y_pred))
        class_names = [str(class_id) for class_id in class_ids]

    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=class_ids, digits=4))

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [None]:
!pip install pytorch_pretrained_bert 

import torch

# BERT
# model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
# model_name = 'bert'

# XL_NET
model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'xlnet-base-cased', output_attentions=True)  # Update configuration during loading
# evaluate() takes a model_name argument, but the only assignment was in the
# commented-out BERT lines above, so this cell raised NameError on a fresh kernel.
model_name = 'xlnet'

# NOTE(review): the checkpoint is loaded into an XLNet model regardless of the
# MODEL setting used for training — confirm model_xl.pt was produced by the
# same architecture and label count before relying on these results.
load_checkpoint(destination_folder + '/model_xl.pt', model)

assert model.config.output_attentions == True
model = model.to(device)
evaluate(model, model_name, test_iter)
