In [None]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla V100-SXM2-16GB


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 568 kB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 70.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 43.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import string

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

# Bert-Classfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, model_name, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """

        """

        BERT paper suggestions for the hyperparameters during fine-tuning:
        droupout: 0.1 ALWAYS
        Batch size: 16, 32
        Learning rate (Adam): 5e-5, 3e-5, 2e-5
        Number of epochs: 2, 3, 4

        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 8

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained(model_name)

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    def forward(self, input_ids, attention_mask):
          """
          Feed input to BERT and the classifier to compute logits.
          @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                        max_length)
          @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                        information with shape (batch_size, max_length)
          @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                        num_labels)
          """
          # Feed input to BERT
          outputs = self.bert(input_ids=input_ids,
                              attention_mask=attention_mask)
          
          # Extract the last hidden state of the token `[CLS]` for classification task
          last_hidden_state_cls = outputs[0][:, 0, :]

          # Feed input to classifier to compute logits
          logits = self.classifier(last_hidden_state_cls)

          return logits
        

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
def initialize_model(model_name,train_dataloader,epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(model_name,freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=2e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
import random
import time
from sklearn.metrics import f1_score

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
set_seed(42)    # Set seed for reproducibility

In [None]:
def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    # Tracking variables
    val_loss = []
    val_tp=[]
    val_fp=[]
    val_tn=[]
    val_fn=[]

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        val_tp.append((((preds+1)/(b_labels+1)) == b_labels).cpu().numpy().sum())
        val_fp.append((preds - b_labels == 1).cpu().numpy().sum())
        val_tn.append((((preds+1)/(b_labels+1)) == b_labels+1).cpu().numpy().sum())
        val_fn.append((preds - b_labels == -1).cpu().numpy().sum())

    # Compute the average accuracy and loss over the validation set.
    # predictions = np.concatenate(predictions, axis=0)
    # true_vals = np.concatenate(true_vals, axis=0)
    val_loss = np.mean(val_loss)
    val_tp=np.sum(val_tp)
    val_fp=np.sum(val_fp)
    val_tn=np.sum(val_tn)
    val_fn=np.sum(val_fn)
    pos= val_fn + val_tp
    neg = val_fp +val_tn
    total = pos + neg
    val_accuracy=(val_tp+val_tn)/(val_tp+val_tn+val_fp+val_fn)
    # val_neg_f1=(2*val_tn)/( val_tn + val_fn +neg)
    # val_pos_f1=(2*val_tp)/( val_tp + val_fp +pos)
    precison= val_tp/(val_tp+val_fp)
    recall= val_tp/(val_tp+val_fn)
    labels_flat = b_labels.flatten()
    F1= (2*precison*recall)/(precison+recall)
    # if(acc_per_class):
    #   accuracy_per_class(predictions, true_vals)
    
    # microF1 = (val_neg_f1 * neg + val_pos_f1 * pos)/ total
    return val_loss, val_accuracy ,precison, recall, F1

In [None]:
def get_predictions(model, test_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    # Tracking variables
    predictions=[]
    labels=[]

    # For each batch in our validation set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            
        # Get the true labels
        labels.append(b_labels.flatten().cpu())
        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        predictions.append(preds.cpu())
    return predictions, labels

In [None]:
#### bert preprocessing
from transformers import AutoTokenizer, AutoModel
# model_name='aubmindlab/bert-base-arabertv2'
# model_name='CAMeL-Lab/bert-base-arabic-camelbert-msa'
# model_name='UBC-NLP/MARBERTv2'
# model_name="UBC-NLP/ARBERT"
def preprocessing_for_bert(data,model_name):
  
    tokenizer = AutoTokenizer.from_pretrained(model_name)
  
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []
    MAX_LEN=350 # it can be assigned to average: 90 token, what do you think?
    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=sent,  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,
            truncation =True,         # Pad sentence to max length
            return_attention_mask=True 
                        )    
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [None]:
#Loading data
df = pd.read_csv('/content/drive/MyDrive/Saudi Patent Research Project 2022/colab/model_dataset.csv')

In [None]:
df

Unnamed: 0.1,Unnamed: 0,text,label,cleaned_text
0,0,طريقة وجهاز لكشف وحدة الشبكة البصرية ونظام الش...,H,طريقه وجهاز لكشف وحده الشبكه البصريه ونظام الش...
1,1,جهاز لتبريد الهواء الداخل لمحرك احتراق داخلي ث...,F,جهاز لتبريد الهواء الداخل لمحرك احتراق داخلي ث...
2,2,طريقة لمعالجة سطح صبغة ثاني أكسيد التيتانيوميت...,C,طريقه لمعالجه سطح صبغه ثاني اكسيد التيتانيوميت...
3,3,طريقة للمحاكاة الآمنة للكشف عن التلوث بمادة مش...,G,طريقه للمحاكاه الامنه للكشف عن التلوث بماده مش...
4,4,جهاز لتغيير عربة تحميليتعلق الاختراع الراهن بج...,F,جهاز لتغير عربه تحميليتعلق الاختراع الراهن بجه...
...,...,...,...,...
9760,9760,مركبات أمينوجوانيدينات، والتركيبات الصيديلية ا...,A,مركبات امينوجوانيدينات والتركيبات الصيديليه ال...
9761,9761,مثبطات HIV PROTEASE في تركيبات صيدلية لعلاج ا...,A,مثبطات HIV PROTEASE في تركيبات صيدليه لعلاج ا...
9762,9762,حفازات عالية الفعالية ذات بنية مسامية أوسطية ث...,C,حفازات عاليه الفعاليه ذات بنيه مساميه اوسطيه ث...
9763,9763,"hemihydrate of 4-(5,6,7,8-tetrahydroimidazo[1,...",A,hemihydrate of tetrahydroimidazoa pyridineN be...


In [None]:
df=df.drop('Unnamed: 0', axis=1)
df= df.dropna()

In [None]:
df

Unnamed: 0,text,label,cleaned_text
0,طريقة وجهاز لكشف وحدة الشبكة البصرية ونظام الش...,H,طريقه وجهاز لكشف وحده الشبكه البصريه ونظام الش...
1,جهاز لتبريد الهواء الداخل لمحرك احتراق داخلي ث...,F,جهاز لتبريد الهواء الداخل لمحرك احتراق داخلي ث...
2,طريقة لمعالجة سطح صبغة ثاني أكسيد التيتانيوميت...,C,طريقه لمعالجه سطح صبغه ثاني اكسيد التيتانيوميت...
3,طريقة للمحاكاة الآمنة للكشف عن التلوث بمادة مش...,G,طريقه للمحاكاه الامنه للكشف عن التلوث بماده مش...
4,جهاز لتغيير عربة تحميليتعلق الاختراع الراهن بج...,F,جهاز لتغير عربه تحميليتعلق الاختراع الراهن بجه...
...,...,...,...
9760,مركبات أمينوجوانيدينات، والتركيبات الصيديلية ا...,A,مركبات امينوجوانيدينات والتركيبات الصيديليه ال...
9761,مثبطات HIV PROTEASE في تركيبات صيدلية لعلاج ا...,A,مثبطات HIV PROTEASE في تركيبات صيدليه لعلاج ا...
9762,حفازات عالية الفعالية ذات بنية مسامية أوسطية ث...,C,حفازات عاليه الفعاليه ذات بنيه مساميه اوسطيه ث...
9763,"hemihydrate of 4-(5,6,7,8-tetrahydroimidazo[1,...",A,hemihydrate of tetrahydroimidazoa pyridineN be...


In [None]:
possible_labels = df.label.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'H': 0, 'F': 1, 'C': 2, 'G': 3, 'B': 4, 'A': 5, 'E': 6, 'D': 7}

In [None]:
df['label'] = df.label.replace(label_dict)
X = df.text.values
y = df.label.values
y=y.astype(int)
### converting labels to tensors and intilizing batches
labels = torch.tensor(y)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# model_name='aubmindlab/bert-base-arabertv2'
# model_name='CAMeL-Lab/bert-base-arabic-camelbert-mix'
# model_name='UBC-NLP/MARBERTv2'
# model_name="UBC-NLP/ARBERT"
# model_name="qarib/bert-base-qarib"
model_names=['CAMeL-Lab/bert-base-arabic-camelbert-msa',"CAMeL-Lab/bert-base-arabic-camelbert-mix", 'aubmindlab/bert-base-arabertv2']
# Stratified shuffle split the data into 80/10/10
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_valid_index in split.split(X, labels):
    train_set = X[train_index]
    train_labels=labels[train_index]
    test_valid_set = X[test_valid_index]
    test_valid_set_labels=labels[test_valid_index]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for test_index, valid_index in split2.split(test_valid_set, test_valid_set_labels):
    test_set = test_valid_set[test_index]
    val_set = test_valid_set[valid_index]
    test_labels=test_valid_set_labels[test_index]
    val_labels=test_valid_set_labels[valid_index]

print('Tokenizing data...')
test_inputs_CAMEL,test_masks_CAMEL=preprocessing_for_bert(test_set,model_names[0])
test_inputs_MIX,test_masks_MIX=preprocessing_for_bert(test_set,model_names[1])
test_inputs_ARABERT,test_masks_ARABERT=preprocessing_for_bert(test_set,model_names[2])

train_inputs_CAMEL,train_masks_CAMEL=preprocessing_for_bert(train_set,model_names[0])
train_inputs_MIX,train_masks_MIX=preprocessing_for_bert(train_set,model_names[1])
train_inputs_ARABERT,train_masks_ARABERT=preprocessing_for_bert(train_set,model_names[2])

Tokenizing data...




In [None]:
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data_CAMEL = TensorDataset(train_inputs_CAMEL, train_masks_CAMEL, train_labels)
train_sampler_CAMEL = RandomSampler(train_data_CAMEL)
train_dataloader_CAMEL = DataLoader(train_data_CAMEL, sampler=train_sampler_CAMEL, batch_size=batch_size)


train_data_MIX = TensorDataset(train_inputs_MIX, train_masks_MIX, train_labels)
train_sampler_MIX = RandomSampler(train_data_MIX)
train_dataloader_MIX = DataLoader(train_data_MIX, sampler=train_sampler_MIX, batch_size=batch_size)


train_data_ARABERT = TensorDataset(train_inputs_ARABERT, train_masks_ARABERT, train_labels)
train_sampler_ARABERT = RandomSampler(train_data_ARABERT)
train_dataloader_ARABERT = DataLoader(train_data_ARABERT, sampler=train_sampler_ARABERT, batch_size=batch_size)



# Create the DataLoader for our Testing set
test_data_CAMEL = TensorDataset(test_inputs_CAMEL, test_masks_CAMEL, test_labels)
test_sampler_CAMEL = SequentialSampler(test_data_CAMEL)
test_dataloader_CAMEL = DataLoader(test_data_CAMEL, sampler=test_sampler_CAMEL, batch_size=batch_size)

test_data_MIX = TensorDataset(test_inputs_MIX, test_masks_MIX, test_labels)
test_sampler_MIX = SequentialSampler(test_data_MIX)
test_dataloader_MIX = DataLoader(test_data_MIX, sampler=test_sampler_MIX, batch_size=batch_size)

test_data_ARABERT = TensorDataset(test_inputs_ARABERT, test_masks_ARABERT, test_labels)
test_sampler_ARABERT = SequentialSampler(test_data_ARABERT)
test_dataloader_ARABERT = DataLoader(test_data_ARABERT, sampler=test_sampler_ARABERT, batch_size=batch_size)



In [None]:
# initialize and load previously trained models
MIX, MIX_optimizer, MIX_scheduler = initialize_model(model_names[1],train_dataloader_MIX,epochs=3)
CAMEL, CAMEL_optimizer, CAMEL_scheduler = initialize_model(model_names[0],train_dataloader_CAMEL, epochs=3)
ARABERT, ARABERT_optimizer, ARABERT_scheduler = initialize_model(model_names[2],train_dataloader_ARABERT,epochs=3)
MIX.load_state_dict(torch.load('/content/drive/MyDrive/Saudi Patent Research Project 2022/saved_models/camel_mix_unprocessed'))
CAMEL.load_state_dict(torch.load('/content/drive/MyDrive/Saudi Patent Research Project 2022/saved_models/CAMEL_MSA'))
ARABERT.load_state_dict(torch.load('/content/drive/MyDrive/Saudi Patent Research Project 2022/saved_models/arabertv2_unprocessed'))


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-msa were not used when 

<All keys matched successfully>

In [None]:
MIX_preds, MIX_labels= get_predictions(MIX, test_dataloader_MIX)
CAMEL_preds, CAMEL_labels= get_predictions(CAMEL, test_dataloader_CAMEL)
ARABERT_preds, ARABERT_labels= get_predictions(ARABERT, test_dataloader_ARABERT)

In [None]:
preds= { 'MIX':MIX_preds, 'CAMEL':CAMEL_preds, 'ARABERT':ARABERT_preds }

In [None]:
print(len(preds['MIX']))
print(len(preds['CAMEL']))
print(len(preds['ARABERT']))

31
31
31


In [None]:
np_MIX_preds= [i.detach().numpy() for i in preds['MIX']]
np_CAMEL_preds= [i.detach().numpy() for i in preds['CAMEL']]
np_ARABERT_preds= [i.detach().numpy() for i in preds['ARABERT']]
true_labels=[i.detach().numpy() for i in CAMEL_labels]

In [None]:
def weighted_sum(elm1, elm2, elm3):
   elm1_=[0,0,0,0,0,0,0,0]
   elm2_=[0,0,0,0,0,0,0,0]
   elm3_=[0,0,0,0,0,0,0,0]
   elm1_[elm1]=1
   elm2_[elm2]=1
   elm3_[elm3]=1
   summed = np.tensordot([elm1_,elm2_,elm3_],[1,0.5,0.5], axes=((0),(0)))
   result = np.argmax(summed)
   return result


voting ensemble (weighted)

In [None]:
# LOOKING FOR A WAY TO IMPLEMENT IT
majority_vote=[]
for i in range(31):
  
  if(i==30):
    result=[]
    for j in range(16):
      mod=weighted_sum(np_CAMEL_preds[i][j],np_MIX_preds[i][j],np_ARABERT_preds[i][j])
      result = np.append(result, mod)
  else:
    result=[]
    for j in range(32):
      mod=weighted_sum(np_CAMEL_preds[i][j],np_MIX_preds[i][j],np_ARABERT_preds[i][j])
      result = np.append(result, mod)
  majority_vote.append(np.array(result, dtype=np.int8))

In [None]:
# Get the predictions
val_tp=[]
val_fp=[]
val_tn=[]
val_fn=[]
for i in range(31):
    val_tp.append((((majority_vote[i]+1)/(true_labels[i]+1)) == true_labels[i]).sum())
    val_fp.append((majority_vote[i] - true_labels[i] == 1).sum())
    val_tn.append((((majority_vote[i]+1)/(true_labels[i]+1)) == true_labels[i]+1).sum())
    val_fn.append((majority_vote[i] - true_labels[i] == -1).sum())
val_tp=np.sum(val_tp)
val_fp=np.sum(val_fp)
val_tn=np.sum(val_tn)
val_fn=np.sum(val_fn)
pos= val_fn + val_tp
neg = val_fp +val_tn
total = pos + neg
val_accuracy=(val_tp+val_tn)/(val_tp+val_tn+val_fp+val_fn)
precison= val_tp/(val_tp+val_fp)
recall= val_tp/(val_tp+val_fn)
F1= (2*precison*recall)/(precison+recall)

In [None]:
# weighted sum Ensemble results
print(f"  {' Acc':^8}  |{'  precison ':^8}|{' recall ':^8} |{' F1':^8} ") 
print(f"  {val_accuracy:^9f} | {precison:^9f} |{recall:^9f} |{F1:^9f} ")

     Acc    |  precison | recall  |   F1    
  0.770588  | 0.844037  |0.807018  |0.825112  


In [None]:
# Test model on test data:
val_loss, val_accuracy ,precison, recall, F1= evaluate(CAMEL, test_dataloader_CAMEL)
print(f" {' Loss':^10} | {' Acc':^8}  |{'  precison ':^8}|{' recall ':^8} |{' F1':^8} ") 
print(f" { val_loss:^10.6f} | {val_accuracy:^9f} | {precison:^9f} |{recall:^9f} |{F1:^9f} ")

    Loss    |    Acc    |  precison | recall  |   F1    
  0.895033  | 0.760234  | 0.827273  |0.805310  |0.816143  
