In [1]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
import torch, os
from torch.utils.data import Dataset
from ipywidgets import interact 
import seaborn as sns
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import tensorflow as tf
import numpy as np

In [2]:
torch.__version__

'2.1.0'

# Loading Data

In [37]:
dataset = load_dataset("trec")
train_dataset = dataset['train']
test_dataset = dataset['test']

Using the latest cached version of the module from C:\Users\liang\.cache\huggingface\modules\datasets_modules\datasets\trec\f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2 (last modified on Mon Aug 21 17:19:17 2023) since it couldn't be found locally at trec., or remotely on the Hugging Face Hub.


In [38]:
train_df = pd.DataFrame({'text' : train_dataset['text'], 'coarse_label' : train_dataset['coarse_label']})
test_df = pd.DataFrame({'text' : test_dataset['text'], 'coarse_label' : test_dataset['coarse_label']})

In [5]:
train_df.head()

Unnamed: 0,text,coarse_label
0,How did serfdom develop in and then leave Russ...,2
1,What films featured the character Popeye Doyle ?,1
2,How can I find a list of celebrities ' real na...,2
3,What fowl grabs the spotlight after the Chines...,1
4,What is the full form of .com ?,0


In [6]:
from sklearn.model_selection import train_test_split
initial_labeled_set, pool = train_test_split(train_df, test_size=0.95, stratify=train_df['coarse_label'])

In [7]:
initial_labeled_set

Unnamed: 0,text,coarse_label
141,What was the name of the `` Little Rascals '' ...,1
4222,What 2th-century American poet wrote a four-vo...,3
2870,Where is your corpus callosum ?,4
353,What actor first portrayed James Bond ?,3
945,What is the origin of the surname of Braun ?,2
...,...,...
137,What are equity securities ?,2
917,How many Gutenberg Bibles are there ?,5
2384,What is her husband 's name ?,3
4768,What good are mosquitoes ?,2


# Spliting Data into train_encodings, test_encodings

In [13]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=512)

In [17]:
train_encodings = tokenizer(initial_labeled_set['text'].to_list(), truncation=True, padding=True)
test_encodings = tokenizer(test_df['text'].to_list(), truncation=True, padding=True)
train_labels = initial_labeled_set.coarse_label.to_list()
test_labels = test_df.coarse_label.to_list()

In [18]:
num_labels = 6
id_to_label = {0 : 'ABBR' , 1 : 'ENTY', 2: 'DESC', 3 : 'HUM', 4 : 'LOC', 5 : 'NUM'}
label_to_id= { 'ABBR' : 0 , 'ENTY' : 1, 'DESC': 2, 'HUM' :3, 'LOC' : 4, 'NUM' : 5}

In [19]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6, id2label = id_to_label, label2id = label_to_id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### TrainingArguments, Dataloader, Metrics are needed to construct this Fine-tune the model

In [8]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./BERTModel2',
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory
    logging_dir='./multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
)

In [9]:
class DataLoader(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        """
          This construct a dict that is (index position) to encoding pairs.
          Where the Encoding becomes tensor(Encoding), which is an requirements
          for training the model
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        """
        Returns the number of data items in the dataset.

        """
        return len(self.labels)

In [10]:
def compute_metrics(pred):

    # Extract true labels from the input object
    labels = pred.label_ids

    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)

    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro',zero_division=1)

    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)

    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

# Calling our Dataloader

In [20]:
train_dataloader = DataLoader(train_encodings,train_labels)
test_dataloader = DataLoader(test_encodings,test_labels)

In [21]:
trainer = Trainer(
    #the pre-trained bert model that will be fine-tuned
    model=model,
    #training arguments that we defined above
    args=training_args,
    train_dataset= train_dataloader,
    eval_dataset = test_dataloader,
    compute_metrics= compute_metrics
)

In [32]:
#trainer.train() #It's able to skip this and still make good prediction

## Let see the accuracy for our prediction our dataset 

In [22]:
predictions = trainer.predict(test_dataloader)


  0%|          | 0/16 [00:00<?, ?it/s]

In [23]:
predicted_labels = predictions.predictions.argmax(axis=1)


In [24]:
accuracy = accuracy_score(test_df['coarse_label'].to_list(),predicted_labels)
print('accuracy {} '.format(accuracy))

accuracy 0.276 


## Active Learning Approach

In [37]:
initial_labeled_set.head() # so we fine_tune with the inital_labeled data

Unnamed: 0,text,coarse_label
1345,Tell me what city the Kentucky Horse Park is n...,4
3519,Who is the Incredible Hulk in reality ?,3
878,Why are electric cars less efficient in the no...,2
5163,What does `` E Pluribus Unum '' on the penny m...,2
1185,What is the Hub of London ?,2


In [38]:
initial_labeled_set.shape

(272, 2)

In [26]:
def entropy_for_each_row(class_probabilities):
    """ Calculate entropy for each row in the array """
    return -tf.reduce_sum(class_probabilities * tf.math.log(class_probabilities),axis=1)

In [40]:
data = test_df.copy()

In [41]:
data['coarse_label']

0      5
1      4
2      3
3      2
4      5
      ..
495    3
496    1
497    5
498    1
499    2
Name: coarse_label, Length: 500, dtype: int64

In [11]:
def predict_and_calculate_entropy(data):
    ''' Sample the Data '''
    data_encodings = tokenizer(data['text'].to_list(), truncation=True, padding=True)
    dataloader = DataLoader(data_encodings, data.coarse_label.to_list())

    ''' Make predictions with class_probabilities and calculate entropy (uncertainty) '''
    predictions = trainer.predict(dataloader)
    prediction_probabilities = tf.constant(predictions.predictions)

    ''' Predicted Labels '''
    predicted_labels = predictions.predictions.argmax(axis=1)

    

    ''' Prediction Probabilities '''
    # Prediction probabilities, returning the highest probability for each instance
    prediction_probabilities_max = np.amax(prediction_probabilities, axis=1)

    # Calculate entropy for each instance
    entropies = entropy_for_each_row(tf.nn.softmax(prediction_probabilities))

    entropy_df = pd.DataFrame(
        {'text' : data['text'].to_list(),
         'predicted_Label': predicted_labels,
         'predicted_Probability': prediction_probabilities_max,
         'Entropy': entropies},
        index=data.index
    )

    final_df = pd.concat([data['coarse_label'], entropy_df], axis=1)

    return final_df.sort_values(by=['Entropy'],ascending=False)


In [43]:
def finetune_and_train(data):

    data_encodings = tokenizer(data['text'].to_list(), truncation=True, padding=True)
    data_labels = data.coarse_label.to_list()

    test_encodings = tokenizer(test_df['text'].to_list(), truncation=True, padding=True)
    test_labels = test_df.coarse_label.to_list()

    #Create Dataloader
    train_dataloader = DataLoader(data_encodings,data_labels)
    test_dataloader = DataLoader(test_encodings,test_labels) 


    trainer = Trainer(
        #the pre-trained bert model that will be fine-tuned
        model=model,
        #training arguments that we defined above
        args=training_args,
        train_dataset= train_dataloader,
        eval_dataset = test_dataloader,
        compute_metrics= test_dataloader
    )

    trainer.train()



In [41]:
for iteration in range(10):
    sample_data = data.sample(n=10,random_state=42)
    data.drop(sample_data.index,inplace = True)
    entropy_prob_df = predict_and_calculate_entropy(sample_data)

    #Let now apply active learning
    text_encoding = tokenizer(entropy_prob_df['text'].to_list(),truncation=True,padding=True)
    sample_dataloader = DataLoader(test_encodings, entropy_prob_df.coarse_label.to_list())

    #test_df 
    test_encodings = tokenizer(test_df['text'].to_list(),truncation=True,padding=True)
    test_dataloader = DataLoader(test_encodings, test_df.coarse_label.to_list())

    trainer = Trainer(
        #the pre-trained bert model that will be fine-tuned
        model=model,
        #training arguments that we defined above
        args=training_args,
        train_dataset= train_dataloader,
        eval_dataset = test_dataloader,
        compute_metrics= compute_metrics
    )

    trainer.train()


    
    

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 1.624, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.3472628593444824, 'eval_Accuracy': 0.586, 'eval_F1': 0.44060796701707866, 'eval_Precision': 0.7384507580240803, 'eval_Recall': 0.4825863061261837, 'eval_runtime': 0.6968, 'eval_samples_per_second': 717.611, 'eval_steps_per_second': 22.964, 'epoch': 2.94}
{'train_runtime': 9.2792, 'train_samples_per_second': 87.939, 'train_steps_per_second': 5.496, 'train_loss': 1.622994060609855, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 1.117, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.970547616481781, 'eval_Accuracy': 0.722, 'eval_F1': 0.6236021829783662, 'eval_Precision': 0.8134894086692955, 'eval_Recall': 0.6213917061345651, 'eval_runtime': 0.7042, 'eval_samples_per_second': 710.028, 'eval_steps_per_second': 22.721, 'epoch': 2.94}
{'train_runtime': 8.9941, 'train_samples_per_second': 90.726, 'train_steps_per_second': 5.67, 'train_loss': 1.117545578994003, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.5896, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.5507956743240356, 'eval_Accuracy': 0.866, 'eval_F1': 0.7311907415809517, 'eval_Precision': 0.8982055965169394, 'eval_Recall': 0.7370056319154309, 'eval_runtime': 0.7093, 'eval_samples_per_second': 704.921, 'eval_steps_per_second': 22.557, 'epoch': 2.94}
{'train_runtime': 9.0408, 'train_samples_per_second': 90.258, 'train_steps_per_second': 5.641, 'train_loss': 0.590410590171814, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.1934, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.36938953399658203, 'eval_Accuracy': 0.896, 'eval_F1': 0.7509687572486553, 'eval_Precision': 0.9150933201698438, 'eval_Recall': 0.7564720009113061, 'eval_runtime': 0.7081, 'eval_samples_per_second': 706.159, 'eval_steps_per_second': 22.597, 'epoch': 2.94}
{'train_runtime': 9.0663, 'train_samples_per_second': 90.003, 'train_steps_per_second': 5.625, 'train_loss': 0.1964872818367154, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.062, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.28451189398765564, 'eval_Accuracy': 0.918, 'eval_F1': 0.8822363442084279, 'eval_Precision': 0.9266425299526354, 'eval_Recall': 0.8599007199433624, 'eval_runtime': 0.7115, 'eval_samples_per_second': 702.762, 'eval_steps_per_second': 22.488, 'epoch': 2.94}
{'train_runtime': 9.0861, 'train_samples_per_second': 89.808, 'train_steps_per_second': 5.613, 'train_loss': 0.06372915179121728, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.0206, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.3230632245540619, 'eval_Accuracy': 0.912, 'eval_F1': 0.8930663055614404, 'eval_Precision': 0.9277780801274638, 'eval_Recall': 0.8725050904056832, 'eval_runtime': 0.7172, 'eval_samples_per_second': 697.196, 'eval_steps_per_second': 22.31, 'epoch': 2.94}
{'train_runtime': 9.0957, 'train_samples_per_second': 89.713, 'train_steps_per_second': 5.607, 'train_loss': 0.021438854582169476, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.0076, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.3089001178741455, 'eval_Accuracy': 0.932, 'eval_F1': 0.9090653276523106, 'eval_Precision': 0.944967240693574, 'eval_Recall': 0.8869236701883737, 'eval_runtime': 0.7147, 'eval_samples_per_second': 699.635, 'eval_steps_per_second': 22.388, 'epoch': 2.94}
{'train_runtime': 9.1419, 'train_samples_per_second': 89.26, 'train_steps_per_second': 5.579, 'train_loss': 0.007595811934009486, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.0029, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.4120580554008484, 'eval_Accuracy': 0.92, 'eval_F1': 0.9008051511251756, 'eval_Precision': 0.9350775394296448, 'eval_Recall': 0.8813182299956174, 'eval_runtime': 0.71, 'eval_samples_per_second': 704.255, 'eval_steps_per_second': 22.536, 'epoch': 2.94}
{'train_runtime': 9.1212, 'train_samples_per_second': 89.462, 'train_steps_per_second': 5.591, 'train_loss': 0.0030147394567143684, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.0012, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.3976462483406067, 'eval_Accuracy': 0.922, 'eval_F1': 0.8862912379295542, 'eval_Precision': 0.8807627156069869, 'eval_Recall': 0.8956258703598787, 'eval_runtime': 0.7111, 'eval_samples_per_second': 703.131, 'eval_steps_per_second': 22.5, 'epoch': 2.94}
{'train_runtime': 9.1393, 'train_samples_per_second': 89.284, 'train_steps_per_second': 5.58, 'train_loss': 0.0012375229156996106, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'loss': 0.0005, 'learning_rate': 2.5e-05, 'epoch': 2.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.4328893721103668, 'eval_Accuracy': 0.924, 'eval_F1': 0.9026753558158057, 'eval_Precision': 0.9362555245260098, 'eval_Recall': 0.8815033348322657, 'eval_runtime': 0.7171, 'eval_samples_per_second': 697.258, 'eval_steps_per_second': 22.312, 'epoch': 2.94}
{'train_runtime': 9.1373, 'train_samples_per_second': 89.304, 'train_steps_per_second': 5.581, 'train_loss': 0.0005222474471392, 'epoch': 3.0}


In [45]:
predictions = trainer.predict(test_dataloader)
predicted_labels = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_df['coarse_label'].to_list(),predicted_labels)
print('accuracy {}% '.format(accuracy * 100))


  0%|          | 0/16 [00:00<?, ?it/s]

accuracy 92.2% 
