## Install Libraries

In [1]:
!pip install transformers~=2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Connect to Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
os.chdir("/content/drive/MyDrive")

## Parameters

In [6]:
#data_path = './data/capstone/CLAWS/covidhate/annotated_tweets_w_text.csv'
data_path = './data/capstone/CLAWS/covidhate/part_labeled_tweet.csv'
#data_path = './data/part_labeled_tweet.csv'

In [7]:
trainedBertModel = 'bert'

In [8]:
batchsize = 8

In [9]:
gpu_num = 0

## Code

In [10]:
import torch
from torch.utils.data import TensorDataset, Subset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch import nn
from torch.nn import functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from pytorch_pretrained_bert import BertModel
import pandas as pd
import numpy as np
import time, datetime, random, glob, os, sys, joblib, argparse, json
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm import tqdm 

In [11]:
USING_GPU = False
DEVICE = None

In [12]:
# Pretrained models dictionary
# Albert
pretrained_models = {'bert': 'bert-base-uncased', 'roberta': 'roberta-base'}

In [22]:
def format_time(seconds):
    seconds_round = int(round((seconds)))
    return str(datetime.timedelta(seconds=seconds_round)) # hh:mm:ss

In [13]:
def prepare_dataset(sentences, labels, tokenizer, max_length=100):
    input_ids = []
    attention_masks = []
    for sent in sentences:
        # print(sent)
        try:
            encoded_dict = tokenizer.encode_plus(
                                sent,
                                add_special_tokens = True,
                                max_length = max_length,
                                truncation=True,
                                pad_to_max_length = True,
                                return_attention_mask = True,
                                return_tensors = 'pt'
                           )
        except:
            print("some tweet sent is not correct")
            print(sent)
            exit(0)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

In [29]:
def train(fold, model, device, train_loader, optimizer, scheduler, epoch):
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_loader):
        print('step: '+str(step))
        model.zero_grad()
        
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))
        
        b_input_ids = batch[0].to(DEVICE)
        b_input_mask = batch[1].to(DEVICE)
        b_labels = batch[2].unsqueeze(0).to(DEVICE)
        #print(b_input_ids.shape)
        #print(b_input_mask.shape)
        #print(b_labels.shape)
        # https://stackoverflow.com/questions/70548318/bertforsequenceclassification-target-size-torch-size1-16-must-be-the-same
        #b_labels = torch.nn.functional.one_hot(b_labels.to(torch.int64), 3)
        #print(type(b_labels))
        #print(b_labels.shape)
        #loss, logits, hidden_states = model(b_input_ids,
        loss, logits, hidden_states = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        #print('run more')
        total_train_loss += loss.item()
        #loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        del loss, logits

    avg_train_loss = total_train_loss / len(train_loader)

    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

In [25]:
def test(fold, model, device, test_loader, test_data_len):
    print("Running Validation...")

    model.eval()
    predictions, true_labels = [], []
    for batch in tqdm(test_loader, total=test_data_len):


        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(batch[0].to(DEVICE), token_type_ids=None,
                            attention_mask=batch[1].to(DEVICE))
            b_proba = outputs[0]

            proba = b_proba.detach().cpu().numpy()
            label_ids = batch[2].numpy()

            predictions.append(proba)
            true_labels.append(label_ids)
    print(predictions)
    print(true_labels)
    print(b_labels)

In [26]:
def train_bert_model(model, dataset, Y, batch_size, epochs=3, learning_rate=1e-5, epsilon=1e-8, save_fn=None):

    if USING_GPU:
        print("Using GPU", DEVICE)
        model.cuda(DEVICE)

    # prepare cross validation
    n = 5
    kfold = KFold(n_splits=n, shuffle=True)
# for each fold
    for fold, (train_idx, test_idx) in enumerate(kfold.split(dataset)):
        print('------------fold no---------{}----------------------'.format(fold))
        print(train_idx)
        print(test_idx)
        train_tensor = Subset(dataset, train_idx)
        test_tensor = Subset(dataset, test_idx)
        test_data_len = len(test_idx)

        trainloader = DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=RandomSampler(train_tensor))
        testloader = DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=SequentialSampler(test_tensor))
        
        total_steps = len(trainloader) * epochs
        optimizer = AdamW(model.parameters(),
                          lr=learning_rate,
                          eps=epsilon
                          )
        scheduler = get_linear_schedule_with_warmup(optimizer,
                               num_warmup_steps=0,  # Default value in run_glue.py
                               num_training_steps=total_steps)
        
        for epoch_i in range(0, epochs):
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
            train(fold, model, DEVICE, trainloader, optimizer, scheduler, epochs)
            test(fold, model, DEVICE, testloader, test_data_len)


In [17]:
# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.    
    DEVICE = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    USING_GPU = True
else:
    print('No GPU available, using the CPU instead.')
    DEVICE = torch.device("cpu")
    USING_GPU = False

No GPU available, using the CPU instead.


In [18]:
df = pd.read_csv(data_path)
df = df[df['Text'].notna()]
X = df.Text.values # x
Y = list(df['label']) # y_true

In [19]:
if trainedBertModel == 'bert':
    used_bert_model = pretrained_models[trainedBertModel]
    model = BertForSequenceClassification.from_pretrained(used_bert_model, num_labels = 3)
    tokenizer = BertTokenizer.from_pretrained(used_bert_model, do_lower_case=True)
elif trainedBertModel == 'roberta':
    used_bert_model = pretrained_models[trainedBertModel]
    model = RobertaForSequenceClassification.from_pretrained(used_bert_model, num_labels = 3)
    tokenizer = RobertaTokenizer.from_pretrained(used_bert_model, do_lower_case=True)

input_ids, attention_masks, labels = prepare_dataset(X, Y, tokenizer, max_length=400)
dataset = TensorDataset(input_ids, attention_masks, labels)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
train_bert_model(model, dataset, Y, batch_size=batchsize)

------------fold no---------0----------------------
[ 0  1  2  3  4  5  6  8  9 10 11 12 14 15 16 17 18 19 20 22 23 25 26 29]
[ 7 13 21 24 27 28]
Training...
step: 0
step: 1
step: 2

  Average training loss: 1.03
  Training epoch took: 0:04:59
Running Validation...


 17%|█▋        | 1/6 [00:21<01:49, 21.82s/it]


[array([[ 0.24932191, -0.22419916, -0.1799518 ],
       [ 0.28101942, -0.25948605, -0.3473724 ],
       [ 0.18687303, -0.11519731, -0.26217982],
       [ 0.18568721, -0.15173319, -0.31729963],
       [ 0.16187072, -0.22331466, -0.29775766],
       [ 0.23146866, -0.1855069 , -0.30011764]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
Training...
step: 0
step: 1
step: 2

  Average training loss: 0.94
  Training epoch took: 0:04:49
Running Validation...


 17%|█▋        | 1/6 [00:23<01:58, 23.61s/it]


[array([[ 0.35316673, -0.3250932 , -0.29812232],
       [ 0.25404483, -0.26290113, -0.44867265],
       [ 0.31620213, -0.2334359 , -0.39180434],
       [ 0.23802225, -0.1560243 , -0.35423285],
       [ 0.28266203, -0.29454434, -0.4021901 ],
       [ 0.31550384, -0.23342586, -0.3768292 ]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
Training...
step: 0
step: 1
step: 2

  Average training loss: 0.88
  Training epoch took: 0:04:50
Running Validation...


 17%|█▋        | 1/6 [00:23<01:56, 23.37s/it]


[array([[ 0.3852858 , -0.35108832, -0.32770646],
       [ 0.32486448, -0.29935837, -0.48231834],
       [ 0.35039887, -0.26235834, -0.42107794],
       [ 0.25127807, -0.15536322, -0.36011094],
       [ 0.3249499 , -0.31564382, -0.43574017],
       [ 0.33560053, -0.24281842, -0.39080942]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
------------fold no---------1----------------------
[ 2  3  4  5  6  7  8  9 10 11 13 14 17 18 19 20 21 22 23 24 25 26 27 28]
[ 0  1 12 15 16 29]
Training...
step: 0
step: 1
step: 2

  Average training loss: 0.88
  Training epoch took: 0:04:49
Running Validation...


 17%|█▋        | 1/6 [00:23<01:58, 23.73s/it]


[array([[ 0.42467186, -0.3543325 , -0.38770196],
       [ 0.41893983, -0.330865  , -0.53205776],
       [ 0.3858603 , -0.23285876, -0.45804575],
       [ 0.16953948,  0.02398451, -0.27658322],
       [ 0.3283614 , -0.30050692, -0.45029497],
       [ 0.29464012, -0.12222977, -0.3524098 ]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
Training...
step: 0
step: 1
step: 2

  Average training loss: 0.80
  Training epoch took: 0:04:53
Running Validation...


 17%|█▋        | 1/6 [00:23<01:58, 23.69s/it]


[array([[ 0.49941015, -0.40048888, -0.47624516],
       [ 0.5792825 , -0.3866793 , -0.5876505 ],
       [ 0.498614  , -0.3351696 , -0.57705456],
       [ 0.1946556 ,  0.02866145, -0.3078433 ],
       [ 0.4330299 , -0.3533911 , -0.54804283],
       [ 0.33571362, -0.15022977, -0.39573666]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
Training...
step: 0
step: 1
step: 2

  Average training loss: 0.80
  Training epoch took: 0:04:48
Running Validation...


 17%|█▋        | 1/6 [00:25<02:07, 25.51s/it]


[array([[ 0.5313344 , -0.42327765, -0.5085151 ],
       [ 0.61258703, -0.43113863, -0.63594276],
       [ 0.5378069 , -0.35952747, -0.62506413],
       [ 0.17779222,  0.06179165, -0.2991617 ],
       [ 0.46062014, -0.37585118, -0.59741616],
       [ 0.34677032, -0.15248446, -0.407816  ]], dtype=float32)]
[array([0, 0, 0, 1, 0, 1])]
------------fold no---------2----------------------
[ 0  1  2  3  4  5  6  7  9 10 11 12 13 15 16 17 18 19 21 24 25 27 28 29]
[ 8 14 20 22 23 26]
Training...
step: 0
step: 1


In [None]:
Y