In [None]:
! git clone https://github.com/VinAIResearch/COVID19Tweet.git
! pip install transformers

import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report


# The training file is parsed with its first row as the header, while the
# validation and unlabeled test files are given explicit column names.
train_df = pd.read_csv("COVID19Tweet/train.tsv", sep='\t')
val_df = pd.read_csv(
    "COVID19Tweet/valid.tsv",
    sep='\t',
    names=['Id', 'Text', 'Label'],
)
test_df = pd.read_csv(
    "COVID19Tweet/unlabeled_test_with_noise.tsv",
    sep='\t',
    names=['Id', 'Text'],
)

# Tweet texts and string labels as NumPy arrays.
train_sentences, train_labels = train_df.Text.values, train_df.Label.values
val_sentences, val_labels = val_df.Text.values, val_df.Label.values
# The test frame is read without a Label column, so only the texts exist.
test_sentences = test_df.Text.values

# Binary targets: 1 for the 'INFORMATIVE' label, 0 for any other value.
y_train = np.array([1 if label == 'INFORMATIVE' else 0 for label in train_labels])
y_val = np.array([1 if label == 'INFORMATIVE' else 0 for label in val_labels])



fatal: destination path 'COVID19Tweet' already exists and is not an empty directory.


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,  AdamW

# Tokenizer of the same COVID-Twitter-BERT checkpoint the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "digitalepidemiologylab/covid-twitter-bert",
)

In [None]:
# COVID-Twitter-BERT with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "digitalepidemiologylab/covid-twitter-bert",
    num_labels=2,                # two output labels (binary classification)
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return hidden states
)
# Move the model to the GPU; Module.cuda() returns the module itself.
model = model.cuda()


In [None]:
# Hyperparameters read by the optimizer, the tokenizer calls and the batching loops.
batch_size = 16       # number of sentences per batch
max_length = 128      # token length every sequence is padded/truncated to
learning_rate = 2e-5  # learning rate handed to AdamW


In [None]:
# AdamW over all parameters of the current `model` object.
# The learning rate is the notebook-level `learning_rate` (2e-5; the
# referenced script default was 5e-5), epsilon stays at the 1e-8 default.
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-8,
)

In [None]:
import logging
from tqdm import tqdm
import torch

# Only let error-level (and above) messages of the 'transformers' logger through.
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.ERROR)

# Training

In [None]:
# Number of full passes over the training set; the validation set is scored
# after every pass.
Epochs = 5

for epoch in range(Epochs):
  print("-------------------------------------------------------")
  print('Epoch ', epoch)

  # ---------------- training pass ----------------
  model.train()
  # Ceiling division gives the exact number of batches, so the progress bar
  # reaches 100% even when the dataset size is a multiple of batch_size.
  training_steps = (len(train_sentences) + batch_size - 1) // batch_size
  losses = []
  with tqdm(total=training_steps) as progress_bar:
    # `start` is the offset of the current batch; a separate name keeps the
    # epoch counter from being overwritten by the batch loops.
    for start in range(0, len(train_sentences), batch_size):
      batch_X = train_sentences[start:start+batch_size]
      batch_y = torch.LongTensor(y_train[start:start+batch_size]).cuda()

      # Every sequence is padded/truncated to exactly max_length tokens.
      encoding = tokenizer(list(batch_X), padding='max_length', truncation="longest_first", max_length=max_length, return_tensors='pt')
      input_ids = encoding['input_ids'].cuda()
      attention_mask = encoding['attention_mask'].cuda()
      token_type_ids = encoding['token_type_ids'].cuda()

      # With `labels` supplied, the first element of the output is the loss.
      # Indexing (as the evaluation code does) instead of tuple-unpacking
      # works for both tuple and output-object return types.
      outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=batch_y)
      loss = outputs[0]

      loss.backward()
      # Clip AFTER backward(): clipping operates on the gradients currently
      # stored on the parameters, so it must see this batch's gradients
      # before the optimizer consumes them.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      model.zero_grad()

      losses.append(loss.item())
      avg_loss = sum(losses)/len(losses)
      progress_bar.update(1)
      progress_bar.set_description("avg loss so far = {}".format(avg_loss))

  # ---------------- validation pass ----------------
  model.eval()
  eval_steps = (len(val_sentences) + batch_size - 1) // batch_size
  list_of_logits = []

  with tqdm(total=eval_steps) as progress_bar:
    # No gradients are needed for scoring; skipping autograd bookkeeping
    # saves memory and time without changing the logits.
    with torch.no_grad():
      for start in range(0, len(val_sentences), batch_size):
        batch_X = val_sentences[start:start+batch_size]

        encoding = tokenizer(list(batch_X), padding='max_length', truncation="longest_first", max_length=max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].cuda()
        attention_mask = encoding['attention_mask'].cuda()
        token_type_ids = encoding['token_type_ids'].cuda()

        # Without labels the first output element holds the logits.
        logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
        list_of_logits.extend(logits.tolist())

        progress_bar.update(1)

  # Predicted class = index of the larger of the two logits.
  preds_bert = np.argmax(list_of_logits,axis=1)
  print()
  print(classification_report(y_val, preds_bert,digits=6))

  print("-------------------------------------------------------")
  print("-------------------------------------------------------")



  0%|          | 0/434 [00:00<?, ?it/s]

-------------------------------------------------------
Epoch  0


avg loss so far = 0.16066371720855036: 100%|██████████| 434/434 [08:43<00:00,  1.21s/it]
100%|██████████| 63/63 [00:25<00:00,  2.49it/s]
  0%|          | 0/434 [00:00<?, ?it/s]


              precision    recall  f1-score   support

           0   0.935547  0.907197  0.921154       528
           1   0.899590  0.930085  0.914583       472

    accuracy                       0.918000      1000
   macro avg   0.917569  0.918641  0.917869      1000
weighted avg   0.918575  0.918000  0.918053      1000

-------------------------------------------------------
-------------------------------------------------------
-------------------------------------------------------
Epoch  1


avg loss so far = 0.0410802869838641: 100%|██████████| 434/434 [08:42<00:00,  1.20s/it]
100%|██████████| 63/63 [00:25<00:00,  2.49it/s]
  0%|          | 0/434 [00:00<?, ?it/s]


              precision    recall  f1-score   support

           0   0.893116  0.933712  0.912963       528
           1   0.921875  0.875000  0.897826       472

    accuracy                       0.906000      1000
   macro avg   0.907495  0.904356  0.905395      1000
weighted avg   0.906690  0.906000  0.905818      1000

-------------------------------------------------------
-------------------------------------------------------
-------------------------------------------------------
Epoch  2


avg loss so far = 0.02025820579876574: 100%|██████████| 434/434 [08:42<00:00,  1.20s/it]
100%|██████████| 63/63 [00:25<00:00,  2.48it/s]
  0%|          | 0/434 [00:00<?, ?it/s]


              precision    recall  f1-score   support

           0   0.914019  0.926136  0.920038       528
           1   0.916129  0.902542  0.909285       472

    accuracy                       0.915000      1000
   macro avg   0.915074  0.914339  0.914661      1000
weighted avg   0.915015  0.915000  0.914962      1000

-------------------------------------------------------
-------------------------------------------------------
-------------------------------------------------------
Epoch  3


avg loss so far = 0.014647327337807263: 100%|██████████| 434/434 [08:42<00:00,  1.20s/it]
100%|██████████| 63/63 [00:25<00:00,  2.48it/s]
  0%|          | 0/434 [00:00<?, ?it/s]


              precision    recall  f1-score   support

           0   0.927342  0.918561  0.922931       528
           1   0.909853  0.919492  0.914647       472

    accuracy                       0.919000      1000
   macro avg   0.918598  0.919026  0.918789      1000
weighted avg   0.919087  0.919000  0.919021      1000

-------------------------------------------------------
-------------------------------------------------------
-------------------------------------------------------
Epoch  4


avg loss so far = 0.02776819867611576: 100%|██████████| 434/434 [08:41<00:00,  1.20s/it]
100%|██████████| 63/63 [00:25<00:00,  2.49it/s]


              precision    recall  f1-score   support

           0   0.894928  0.935606  0.914815       528
           1   0.924107  0.877119  0.900000       472

    accuracy                       0.908000      1000
   macro avg   0.909517  0.906362  0.907407      1000
weighted avg   0.908700  0.908000  0.907822      1000

-------------------------------------------------------
-------------------------------------------------------





# Model saving / loading

In [None]:
!mkdir "drive/My Drive/BERT_FINE_TUNING_WNUT/"

In [None]:
# Serialize the complete model object (not only its weights) to `path`.
path = "drive/My Drive/BERT_FINE_TUNING_WNUT/CT_2e"
torch.save(obj=model, f=path)

In [None]:
# Replace `model` with a previously pickled model object read from `path`.
# NOTE(review): this reads "CT_1e" while the save cell writes "CT_2e" —
# confirm which checkpoint is intended.
# NOTE(review): torch.load unpickles the file; only load checkpoints you
# created yourself.
# NOTE(review): the existing `optimizer` was built from the parameters of
# the earlier `model` object; it likely needs to be rebuilt before further
# training of the loaded model — verify.
path = "drive/My Drive/BERT_FINE_TUNING_WNUT/CT_1e"
model = torch.load(f=path)

# Experiments

## Training for 1 epoch

In [None]:
# One pass over the training set with the current `model` and `optimizer`.
# NOTE(review): `optimizer` holds the parameters of the `model` object that
# existed when it was created; if `model` has since been rebound (e.g. by
# torch.load), rebuild the optimizer first or this loop will not update the
# loaded model — verify.
model.train()
# Ceiling division gives the exact number of batches, so the progress bar
# reaches 100% even when the dataset size is a multiple of batch_size.
training_steps = (len(train_sentences) + batch_size - 1) // batch_size
losses = []
with tqdm(total=training_steps) as progress_bar:
  for start in range(0, len(train_sentences), batch_size):
    batch_X = train_sentences[start:start+batch_size]
    batch_y = torch.LongTensor(y_train[start:start+batch_size]).cuda()

    # Every sequence is padded/truncated to exactly max_length tokens.
    encoding = tokenizer(list(batch_X), padding='max_length', truncation="longest_first", max_length=max_length, return_tensors='pt')
    input_ids = encoding['input_ids'].cuda()
    attention_mask = encoding['attention_mask'].cuda()
    token_type_ids = encoding['token_type_ids'].cuda()

    # With `labels` supplied, the first element of the output is the loss.
    # Indexing instead of tuple-unpacking works for both tuple and
    # output-object return types.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=batch_y)
    loss = outputs[0]

    loss.backward()
    # Clip AFTER backward(): clipping operates on the gradients currently
    # stored on the parameters, so it must see this batch's gradients
    # before the optimizer consumes them.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    model.zero_grad()

    losses.append(loss.item())
    avg_loss = sum(losses)/len(losses)
    progress_bar.update(1)
    progress_bar.set_description("avg loss so far = {}".format(avg_loss))

avg loss so far = 0.06615064456245073: 100%|█████████▉| 867/868 [10:20<00:00,  1.40it/s]


## Evaluation on the validation set

In [None]:
# Score the validation set batch by batch and print a classification report.
model.eval()
# Ceiling division gives the exact number of batches, so the progress bar
# reaches 100% even when the dataset size is a multiple of batch_size.
eval_steps = (len(val_sentences) + batch_size - 1) // batch_size
list_of_logits = []

with tqdm(total=eval_steps) as progress_bar:
  # No gradients are needed for scoring; skipping autograd bookkeeping
  # saves memory and time without changing the logits.
  with torch.no_grad():
    for start in range(0, len(val_sentences), batch_size):
      batch_X = val_sentences[start:start+batch_size]

      # Every sequence is padded/truncated to exactly max_length tokens.
      encoding = tokenizer(list(batch_X), padding='max_length', truncation="longest_first", max_length=max_length, return_tensors='pt')
      input_ids = encoding['input_ids'].cuda()
      attention_mask = encoding['attention_mask'].cuda()
      token_type_ids = encoding['token_type_ids'].cuda()

      # Without labels the first output element holds the logits.
      logits = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
      list_of_logits.extend(logits.tolist())

      progress_bar.update(1)

# Predicted class = index of the larger of the two logits.
preds_bert = np.argmax(list_of_logits,axis=1)
print(classification_report(y_val, preds_bert,digits=6))




 99%|█████████▉| 125/126 [00:27<00:00,  4.51it/s]


In [None]:
# Predicted class per validation example: index of the largest logit in each row.
preds_bert = np.asarray(list_of_logits).argmax(axis=1)

In [None]:

# Per-class precision/recall/F1 on the validation labels, six decimal places.
report = classification_report(y_val, preds_bert, digits=6)
print(report)

              precision    recall  f1-score   support

           0   0.958150  0.823864  0.885947       528
           1   0.829670  0.959746  0.889980       472

    accuracy                       0.888000      1000
   macro avg   0.893910  0.891805  0.887964      1000
weighted avg   0.897507  0.888000  0.887851      1000



In [None]:

# Same report as the previous cell, for whatever `preds_bert` currently holds.
report = classification_report(y_val, preds_bert, digits=6)
print(report)

              precision    recall  f1-score   support

           0   0.923225  0.910985  0.917064       528
           1   0.901879  0.915254  0.908517       472

    accuracy                       0.913000      1000
   macro avg   0.912552  0.913120  0.912791      1000
weighted avg   0.913149  0.913000  0.913030      1000



In [None]:
# Show only the first few rows; displaying the whole list floods the notebook output.
list_of_logits[:5]

[[3.8475430011749268, -3.6093201637268066],
 [-3.2689993381500244, 2.9922397136688232],
 [4.415075778961182, -4.092049598693848],
 [4.124192237854004, -4.193169116973877],
 [3.827329158782959, -3.527426242828369],
 [-1.4919925928115845, 0.9087761640548706],
......... [-4.141377925872803, 3.960819721221924]]