<a href="https://colab.research.google.com/github/koleshjr/COVID_19-TWEETS-CLASSIFICATION/blob/main/Covid_19tweets_classification_Optuna_WandB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## COVID-19 Tweet Classification Challenge
* Can you identify tweets about coronavirus without using keywords? 

In [1]:
# !pip install -q datasets
# !pip install transformers
# !pip install optuna
# !pip install sentencepiece
# !pip install evaluate


# !pip install mlflow wandb



In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pathlib import Path
path_files = '/content/drive/MyDrive/tweets_classification/'

In [4]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import seaborn as sns 
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import mlflow
import os
import seaborn as sns
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import (
    BertForSequenceClassification,
    BertModel,
    BertTokenizerFast,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)
import transformers 
import wandb

import os.path
from os import path

In [5]:
# !wandb login 
# wandb.init(project="classify_covid_tweets")

In [6]:
# %env WANDB_LOG_MODEL=true   #log every trained model

In [7]:

transformers.set_seed(1) 
df = pd.read_csv(path_files + 'Train.csv')
df

 





Unnamed: 0,ID,text,target
0,train_0,The bitcoin halving is cancelled due to,1
1,train_1,MercyOfAllah In good times wrapped in its gran...,0
2,train_2,266 Days No Digital India No Murder of e learn...,1
3,train_3,India is likely to run out of the remaining RN...,1
4,train_4,In these tough times the best way to grow is t...,0
...,...,...,...
5282,train_6856,The spread of the novel among asylum seekers o...,1
5283,train_6857,Hundreds of Jewish patients are being treated...,1
5284,train_6858,Beats me Honestly most of the people I follow ...,0
5285,train_6859,Help us reach more people by donating and shar...,0


In [8]:
df_test = pd.read_csv(path_files + 'Test.csv')
len(df_test)

1962

In [9]:
df_test.head()

Unnamed: 0,ID,text
0,test_2,Why is explained in the video take a look
1,test_3,Ed Davey fasting for Ramadan No contest
2,test_4,Is Doja Cat good or do you just miss Nicki Minaj
3,test_8,How Boris Johnson s cheery wounded in action p...
4,test_9,Man it s terrible Not even a reason to get on ...


In [10]:
df.target.value_counts()


0    2746
1    2541
Name: target, dtype: int64

### Training

In [11]:
df = df.rename(columns = {'target':'label'})

In [12]:
# In the first step we will split the data in training and remaining dataset
df_train, df_eval = train_test_split(df, train_size=0.8, stratify = df['label'])

df_train.reset_index(drop=True, inplace=True)
df_eval.reset_index(drop=True, inplace=True)

df_train.shape, df_eval.shape, df_test.shape


((4229, 3), (1058, 3), (1962, 2))

In [13]:


dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "valid": Dataset.from_pandas(df_eval),
    "test": Dataset.from_pandas(df_test)
    })



In [14]:
df_eval.head()

Unnamed: 0,ID,text,label
0,train_2079,Health experts are concerned that cases are si...,1
1,train_6223,New review discuss repurposing fibrinolytic ti...,1
2,train_1958,i dont like synthetic white noise or like indu...,0
3,train_513,Ramadan Mubarak I enjoyed this clip of call to...,0
4,train_1738,Michael Ryan chief executive director of the W...,1


Tokenizing function

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'label'],
        num_rows: 4229
    })
    valid: Dataset({
        features: ['ID', 'text', 'label'],
        num_rows: 1058
    })
    test: Dataset({
        features: ['ID', 'text'],
        num_rows: 1962
    })
})

In [16]:
def print_custom(text):
    print('\n')
    print(text)
    print('-'*100)

# Project constants
LR_MIN = 4e-5
LR_CEIL = 0.01
WD_MIN = 4e-5
WD_CEIL = 0.01
MIN_EPOCHS = 2
MAX_EPOCHS = 5
PER_DEVICE_EVAL_BATCH = 8
PER_DEVICE_TRAIN_BATCH = 8
NUM_TRIALS = 20
SAVE_DIR = '/content/drive/MyDrive/tweets_classification'
NAME_OF_MODEL = 'huggingoptunaface'
MAX_LENGTH = 350

In [17]:

# Load in Electra small
print_custom('Initializing Electra Small pretrained tokenizer')
model_name = "google/electra-small-discriminator"  
tokenizer = AutoTokenizer.from_pretrained(model_name)  
model = AutoModelForSequenceClassification.from_pretrained(model_name)   



Initializing Electra Small pretrained tokenizer
----------------------------------------------------------------------------------------------------


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [18]:
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

In [19]:

# Preprocessing functions
print_custom('Preprocessing text...')
def preprocess(examples):     
    return tokenizer(examples['text'], truncation=True, padding='max_length', 
            max_length=MAX_LENGTH)

dataset = dataset.map(preprocess, batched=True) 



Preprocessing text...
----------------------------------------------------------------------------------------------------


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
import optuna
#----------------------------------------------------------------------------------------------------
# Optuna setup
#----------------------------------------------------------------------------------------------------
print_custom('Setting up Optuna study')

def objective(trial: optuna.Trial):     
    model = AutoModelForSequenceClassification.from_pretrained(model_name)      
    training_args = TrainingArguments(         
        output_dir=SAVE_DIR, 
        learning_rate=trial.suggest_loguniform('learning_rate', low=LR_MIN, high=LR_CEIL),         
        weight_decay=trial.suggest_loguniform('weight_decay', WD_MIN, WD_CEIL),         
        num_train_epochs=trial.suggest_int('num_train_epochs', low = MIN_EPOCHS,high = MAX_EPOCHS),         
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,         
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,         
        disable_tqdm=True)     

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        compute_metrics=compute_metrics,
        )      
    
    result = trainer.train()     
    return result.training_loss

#----------------------------------------------------------------------------------------------------
#                    CREATE OPTUNA STUDY
#----------------------------------------------------------------------------------------------------

print_custom('Triggering Optuna study')
study = optuna.create_study(study_name='hp-search-electra', direction='minimize') 
study.optimize(func=objective, n_trials=NUM_TRIALS)  

[32m[I 2023-02-21 14:12:21,444][0m A new study created in memory with name: hp-search-electra[0m




Setting up Optuna study
----------------------------------------------------------------------------------------------------


Triggering Optuna study
----------------------------------------------------------------------------------------------------


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-500
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-500/config.json


{'loss': 0.7001, 'learning_rate': 0.001530323281458984, 'epoch': 0.95}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-1000
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/config.json


{'loss': 0.6957, 'learning_rate': 0.0010568321671461797, 'epoch': 1.89}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-1500
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-1500/config.json


{'loss': 0.6924, 'learning_rate': 0.0005833410528333752, 'epoch': 2.84}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-2000
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-2000/pytorch_model.bin


{'loss': 0.7124, 'learning_rate': 0.00010984993852057065, 'epoch': 3.78}




Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2023-02-21 14:16:52,305][0m Trial 0 finished with value: 0.6997375704615688 and parameters: {'learning_rate': 0.0020038143957717886, 'weight_decay': 0.0029135356479954777, 'num_train_epochs': 4}. Best is trial 0 with value: 0.6997375704615688.[0m


{'train_runtime': 262.6551, 'train_samples_per_second': 64.404, 'train_steps_per_second': 8.056, 'train_loss': 0.6997375704615688, 'epoch': 4.0}


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--electra-small-discriminator/snapshots/153f486d928bcfc213932f8fc91fc2e3c41af769/config.json
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weight

{'loss': 0.667, 'learning_rate': 0.00016798486225424634, 'epoch': 0.95}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-1000
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/config.json


{'loss': 0.6943, 'learning_rate': 1.7460792133953922e-05, 'epoch': 1.89}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


[32m[I 2023-02-21 14:19:07,469][0m Trial 1 finished with value: 0.6810315065437995 and parameters: {'learning_rate': 0.00031850893237453875, 'weight_decay': 0.00016522776038346474, 'num_train_epochs': 2}. Best is trial 1 with value: 0.6810315065437995.[0m
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--electra-small-discriminator/snapshots/153f486d928bcfc213932f8fc91fc2e3c41af769/config.json
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 10

{'train_runtime': 134.1365, 'train_samples_per_second': 63.055, 'train_steps_per_second': 7.887, 'train_loss': 0.6810315065437995, 'epoch': 2.0}


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

{'loss': 0.6996, 'learning_rate': 0.002078005508854025, 'epoch': 0.95}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-1000
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/config.json


{'loss': 0.6939, 'learning_rate': 0.0015936219403565835, 'epoch': 1.89}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/tweets_classification/checkpoint-1500
Configuration saved in /content/drive/MyDrive/tweets_classification/checkpoint-1500/config.json


{'loss': 0.6925, 'learning_rate': 0.0011092383718591416, 'epoch': 2.84}


Model weights saved in /content/drive/MyDrive/tweets_classification/checkpoint-1500/pytorch_model.bin


In [None]:


# This can be used to train the final model. Passed through using kwargs into the model
print_custom('Finding study best parameters')
best_lr = float(study.best_params['learning_rate'])
best_weight_decay = float(study.best_params['weight_decay'])
best_epoch = int(study.best_params['num_train_epochs'])

print_custom('Extract best study params')
print(f'The best learning rate is: {best_lr}')
print(f'The best weight decay is: {best_weight_decay}')
print(f'The best epoch is : {best_epoch}')

print_custom('Create dictionary of the best hyperparameters')
best_hp_dict = {
    'best_learning_rate' : best_lr,
    'best_weight_decay': best_weight_decay,
    'best_epoch': best_epoch
}



In [None]:

#----------------------------------------------------------------------------------------------------
#                   TRAIN BASED ON OPTUNAS SELECTED HP
#----------------------------------------------------------------------------------------------------
 
print_custom('Training the model on the custom parameters')

training_args = TrainingArguments(         
    output_dir=SAVE_DIR, 
    learning_rate=best_lr,         
    weight_decay=best_weight_decay,         
    num_train_epochs=best_epoch,         
    per_device_train_batch_size=8,         
    per_device_eval_batch_size=8,         
    disable_tqdm=True)     

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'])      
    
result = trainer.train() 
trainer.evaluate()

print_custom('Saving the best Optuna tuned model')
if not path.exists('model'):
    os.mkdir('model')

model_path = "model/{}".format(NAME_OF_MODEL)
model.save_pretrain
tokenizer.save_pretrained(model_path)

#### Loading and Using the model

In [None]:
import datasets 
import optuna 
from datasets import load_dataset 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import os.path
from os import path
import numpy as np
from numpy import argmax

# Specify where to load the model and tokenizer from
MODEL_NAME = 'huggingoptunaface'
MODEL_FOLDER = 'model'
MODEL_PATH = f'{MODEL_FOLDER}/{MODEL_NAME}'
MAX_LENGTH = 350

# Load our model and tokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [None]:
# Our example text to pass to our fine tuned model
text = 'Acute generalized exanthematous'

def get_result(text, message=True):
    encoded_input = loaded_tokenizer(text, truncation=True, padding='max_length',
                                     max_length=MAX_LENGTH, return_tensors='pt')
    output = loaded_model(**encoded_input)
    result = output[0].detach().numpy()
    probs = torch.sigmoid(output[0]).detach().numpy()
    class_label = argmax(result)
    
    if message:
        print(f'The predicted class is label: {str(class_label)} with a probability of {probs[0][0]}')
    
    return result, class_label, probs


# Run your result through the function
result, class_label, probs = get_result(text)

In [None]:
wandb.finish()