# Transformers

In [1]:
# warnings
from warnings import simplefilter
simplefilter("ignore", category=FutureWarning)
simplefilter("ignore", category=DeprecationWarning)

# setting path
import os
import sys
import time
import random
sys.path.append('../../../RedditNews/')

# custom built functions
from logs.get_logs import logger
from dataPrep.get_data_fold import data_module

In [2]:
# import libraries
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# set seed
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# tensorflow.random.set_seed(seed)

# KFold and Data

In [4]:
# Choose model
model_select = "XLNet" # Options: "SVM_MODEL", "LSTM_MODEL", "TRANSFORMER_MODEL"

# Choose K in KFold
KFold = 1 # Options: 1, 2, 3, 4, 5

# logger
task = "GridSearchCV"
root_dir = '/home/ravi/PROJECTS_DATA/DataModelsResults/'
log_dir_fname = root_dir + 'Results/Transformers/'+ model_select + task + "_KFold" + str(KFold)+'.log'
# print(log_dir_fname)
logger = logger(log_dir_fname)
logger.info("=============== New execution ====================")
execution_st = time.time()

# inputs
logger.info("Get inputs values")
data_file = 'lib_con_22554articles_n_label_shuffled.json'

## Load data. Get K-Fold data. Save 5 fold indices (80% train, 20% test)
# all_train_data, test_data = data_module(logger, data_file, KFold, root_dir)

### ULMFit
logger.info("=========== Data loading ===========")

2022-06-29 19:51:26,879 | 241557611.py: 17: <cell line: 17>() |  INFO: Get inputs values


In [5]:
path = root_dir+'/Results/Transformers/'
dfTrain = dfTrain = pd.read_csv('train.csv') #all_train_data.copy()
dfValidation = dfTrain.sample(frac=0.2, random_state=42)

In [6]:
dfTrain = dfTrain.drop(columns=['id'])
dfTrain = dfTrain[["label", "tweet"]]
dfTrain.head()

Unnamed: 0,label,tweet
0,real,The CDC currently reports 99031 deaths. In gen...
1,real,States reported 1121 deaths a small rise from ...
2,fake,Politically Correct Woman (Almost) Uses Pandem...
3,real,#IndiaFightsCorona: We have 1524 #COVID testin...
4,real,Populous states can generate large case counts...


In [7]:
dfValidation = dfValidation.drop(columns=['id'])
dfValidation = dfValidation[["label", "tweet"]]
dfValidation.head()

Unnamed: 0,label,tweet
324,fake,"Canada’s top BDSM doctor says wear a mask, lea..."
1340,real,There are 3 cases considered to have recovered...
6025,real,Heard about contact tracing but not sure what ...
2077,fake,India records its highest single-day increase ...
381,real,Researchers warned of a ticking coronavirus ti...


# Tokenizer

In [8]:
# the model we are going to Fine-Tune
# check text classification models here: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512

In [9]:
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [10]:
# target names ordered
target_names = sorted(np.unique(dfTrain["label"].tolist()))

In [11]:
# convert string labels to numeric but target >=0 and target < num_classes
dfTrain["label"] = dfTrain["label"].str.replace('fake','0')
dfTrain["label"] = dfTrain["label"].str.replace('real','1')
dfValidation["label"] = dfValidation["label"].str.replace('fake','0')
dfValidation["label"] = dfValidation["label"].str.replace('real','1')
dfTrain["label"] = pd.to_numeric(dfTrain["label"])
dfValidation["label"] = pd.to_numeric(dfValidation["label"])

In [12]:
# each is a list
train_texts=dfTrain["tweet"].tolist()
valid_texts=dfValidation["tweet"].tolist()
train_labels=dfTrain["label"].tolist()
valid_labels=dfValidation["label"].tolist()

In [13]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [14]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

In [15]:
len(target_names)

2

# Fine-tune the pretrained language model

In [16]:
# load the model and pass to CUDA
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
# metrics
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [18]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=10,  # batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [19]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [20]:
# train the model
trainer.train()

***** Running training *****
  Num examples = 6420
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1926


Step,Training Loss,Validation Loss,Accuracy
400,0.336,0.26425,0.915109
800,0.1601,0.101472,0.981308
1200,0.0956,0.04398,0.984424
1600,0.0266,0.005561,0.998442


***** Running Evaluation *****
  Num examples = 1284
  Batch size = 10
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1284
  Batch size = 10
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1284
  Batch size = 10
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1284
  Batch size = 10
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bi

TrainOutput(global_step=1926, training_loss=0.13293978878270798, metrics={'train_runtime': 488.8289, 'train_samples_per_second': 39.4, 'train_steps_per_second': 3.94, 'total_flos': 5067518926233600.0, 'train_loss': 0.13293978878270798, 'epoch': 3.0})

In [21]:
import torch

In [22]:
torch.version.cuda

'11.6'

In [23]:
torch.__version__

'1.12.0'

# EVal

In [25]:
# evaluate the current model after training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1284
  Batch size = 10


{'eval_loss': 0.0055609531700611115,
 'eval_accuracy': 0.9984423676012462,
 'eval_runtime': 2.5581,
 'eval_samples_per_second': 501.943,
 'eval_steps_per_second': 50.429,
 'epoch': 3.0}

In [26]:
# saving the fine tuned model & tokenizer
model_path = "fakenews-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in fakenews-bert-base-uncased/config.json
Model weights saved in fakenews-bert-base-uncased/pytorch_model.bin
tokenizer config file saved in fakenews-bert-base-uncased/tokenizer_config.json
Special tokens file saved in fakenews-bert-base-uncased/special_tokens_map.json


('fakenews-bert-base-uncased/tokenizer_config.json',
 'fakenews-bert-base-uncased/special_tokens_map.json',
 'fakenews-bert-base-uncased/vocab.txt',
 'fakenews-bert-base-uncased/added_tokens.json',
 'fakenews-bert-base-uncased/tokenizer.json')

# Performing Inference

In [27]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

In [38]:
fakeDF = dfValidation[dfValidation["label"]==0]
realDF = dfValidation[dfValidation["label"]==1]
print(fakeDF)
print(realDF)

      label                                              tweet
324       0  Canada’s top BDSM doctor says wear a mask, lea...
2077      0  India records its highest single-day increase ...
2778      0  There were more deaths on the roads of France ...
1971      0  Game Cigars have been discontinued due to a CO...
4482      0           Claim that "there is no" COVID-19 virus.
...     ...                                                ...
2788      0  In January, Donald Trump claimed the coronavir...
4763      0  Author Chen Ming-Fang the daughter-in-law of r...
1186      0  Americans Attempting to Speak Truth to Power H...
4866      0  18 nations including USA and UK wants \n@naren...
151       0  Scientists are expressing cautious optimism th...

[596 rows x 2 columns]
      label                                              tweet
1340      1  There are 3 cases considered to have recovered...
6025      1  Heard about contact tracing but not sure what ...
381       1  Researchers warned

In [39]:
print(get_prediction(fakeDF["tweet"][324]))
print(get_prediction(fakeDF["tweet"][1971]))
print(get_prediction(realDF["tweet"][1340]))
print(get_prediction(realDF["tweet"][1033]))

fake
fake
real
real
