# Bert Model V2
### Operating with GPU support on Mac M2 Pro

In VS Code, if the kernel doesn't show up automatically, go to VS Code Settings -> User -> Extensions -> Python -> Env File and enter `${workspaceFolder}/venv/.env` as the path.

Go back to this notebook and select the kernel again. It should be listed under the "Jupyter Kernel..." choice, and it is called `venv`.

Make sure to activate the virtual environment in a terminal opened in the `bertModel_v2` folder, using the command `source venv/bin/activate`.

<h2>Imports</h2>

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import pandas as pd
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


<h2>Confirm GPU support</h2>

In [2]:
global device  # has no effect at module scope; kept from the original cell


def check_device():
    """Probe for Apple's Metal Performance Shaders (MPS) backend.

    Returns:
        A one-element tensor of ones allocated on the MPS device when the
        backend is available, otherwise the string "MPS device not found.".
    """
    # Guard clause: bail out early when there is no MPS backend.
    if not torch.backends.mps.is_available():
        return "MPS device not found."

    # Allocating a tensor on the device proves the backend actually works.
    probe = torch.ones(1, device=torch.device("mps"))
    return probe


print(check_device())

tensor([1.], device='mps:0')


## BERT Model

In [3]:
class bertModel(Dataset):
    """Fine-tune ``bert-base-uncased`` as a 6-class sequence classifier.

    The constructor loads a CSV with ``text`` and ``label`` columns, picks a
    compute device (MPS when available, CPU otherwise) and downloads the
    pretrained tokenizer and model. Call ``initializeModel`` to split the
    data and build the loaders/optimizer, then ``runModel`` to train and
    validate.
    """

    def __init__(self, dataPath='../../sample.csv', testSize=0.2, randomState=42):
        """Load the dataset and the pretrained tokenizer/model.

        Args:
            dataPath: path of a CSV file with ``text`` and ``label`` columns.
            testSize: fraction of the rows held out for validation.
            randomState: seed passed to ``train_test_split``.
        """
        #data
        self.datasetPath = dataPath
        self.dataFrame = pd.read_csv(dataPath)
        self.texts = self.dataFrame['text'].tolist()
        self.labels = self.dataFrame['label'].tolist()

        #parameters
        self.batch_size = 8
        self.max_len = 16
        self.epochs = 4

        #hyperparameters
        self.test_size = testSize
        self.random_state = randomState
        self.learn_rate = 2e-5

        # datasets
        self.train_texts, self.val_texts, self.train_labels, self.val_labels = None, None, None, None

        #device: use MPS when present, otherwise fall back to the CPU
        self.device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

        #objects
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

        # Move the model to the selected device instead of a hard-coded
        # "mps", which raised on machines without MPS support.
        self.model.to(self.device)

        # Populated by initializeModel().
        self.train_dataset = None
        self.train_loader = None
        self.val_dataset = None
        self.val_loader = None
        self.optimizer = None
        self.criterion = None

    # Define a custom dataset class
    class CustomDataset(Dataset):
        """Tokenizes one text per item and returns tensors on ``device``."""

        def __init__(self, texts, labels, tokenizer, max_len, device=None):
            """Store the samples and tokenization settings.

            Args:
                texts: sequence of input texts.
                labels: sequence of integer class labels, aligned with texts.
                tokenizer: object exposing ``encode_plus`` (e.g. BertTokenizer).
                max_len: length every encoded sequence is padded/truncated to.
                device: device for the returned tensors; when omitted, MPS is
                    used if available and the CPU otherwise.
            """
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len
            if device is None:
                device = "mps" if torch.backends.mps.is_available() else "cpu"
            self.device = torch.device(device)

        def __len__(self):
            """Return the number of samples."""
            return len(self.texts)

        def __getitem__(self, idx):
            """Return the encoded sample at ``idx``.

            Returns:
                dict with keys ``text`` (str), ``input_ids``,
                ``attention_mask`` (1-D tensors of length ``max_len``) and
                ``label`` (scalar long tensor), all tensors on ``self.device``.
            """
            text = str(self.texts[idx])
            label = self.labels[idx]

            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            # encode_plus returns a leading batch dimension of 1; drop it.
            input_ids = encoding['input_ids'].flatten().to(self.device)
            attention_mask = encoding['attention_mask'].flatten().to(self.device)

            return {
                'text': text,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                # Same device as the inputs; the original hard-coded "mps"
                # here and failed on CPU-only machines.
                'label': torch.tensor(label, dtype=torch.long, device=self.device)
            }

    #metric stuff
    def calc_metrics(self, data, prevAvg, prevStdDev):
        """Return ``(average, standard deviation)`` of ``data``.

        ``prevAvg`` and ``prevStdDev`` are accepted but not used.
        """
        data_array = np.array(data)

        avg = np.average(data_array)
        std_dev = np.std(data_array)

        return avg, std_dev

    #set parameters, returns bool == success of func
    def setParameters(self, batchSize, maxLen, n_epochs):
        """Set batch size, max sequence length and epoch count; returns True."""
        self.batch_size = batchSize
        self.max_len = maxLen
        self.epochs = n_epochs

        print(f"\nParameters Set!\n------------------\nbatch_size:{self.batch_size}\nmax_len:{self.max_len}\nepochs:{self.epochs}")
        return True

    #set hyperparameters, returns bool == success of func
    def __setHyperParameters(self, testSize, randomState=42, learnRate=2e-5):
        """Set validation split, split seed and learning rate; returns True."""
        self.test_size = testSize
        self.random_state = randomState
        self.learn_rate = learnRate

        print(f"\nHyperparameters Set!\n------------------\ntest_size:{self.test_size}\nrandom_state:{self.random_state}\nlearn_rate:{self.learn_rate}")
        return True

    #init model, returns bool == success of func
    def initializeModel(self, batchSize=8, maxLen=32, n_epochs=4, testSize=0.2, randomState=42, learnRate=2e-5):
        """Apply settings, split the data and build loaders, optimizer and loss.

        Returns:
            True on success, False if either setter reported failure.
        """
        param_status = self.setParameters(batchSize, maxLen, n_epochs)
        hyper_param_status = self.__setHyperParameters(testSize, randomState, learnRate)

        if not (param_status and hyper_param_status):
            print("Initialize Failure.")
            return False

        # Split data into training and validation sets
        self.train_texts, self.val_texts, self.train_labels, self.val_labels = train_test_split(
            self.texts, self.labels, test_size=self.test_size, random_state=self.random_state
        )

        self.val_dataset = self.CustomDataset(self.val_texts, self.val_labels, self.tokenizer, self.max_len, self.device)
        self.val_loader = DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False)

        #format sets and load dataloader
        self.train_dataset = self.CustomDataset(self.train_texts, self.train_labels, self.tokenizer, self.max_len, self.device)
        self.train_loader = DataLoader(self.train_dataset, self.batch_size, shuffle=True)

        #init objects
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learn_rate)
        self.criterion = torch.nn.CrossEntropyLoss()

        print("\nModel Initialized!\n")
        return True

    #no parameters will reset to default state
    def reInitializeModel(self, batchSize=8, maxLen=32, n_epochs=10, testSize=0.2, randomState=42):
        """Re-run ``initializeModel``; the learning rate falls back to its default.

        Note that this rebuilds the data split, loaders and optimizer but does
        not reload the model weights.
        """
        return self.initializeModel(batchSize, maxLen, n_epochs, testSize, randomState)

    def runModel(self):
        """Train for ``self.epochs`` epochs, validating after each one.

        Prints one table row per epoch (train loss, validation accuracy and
        weighted precision/recall/F1).

        Returns:
            ``(model, [parameters, epoch_metrics])`` where ``epoch_metrics`` is
            a list of per-epoch dicts with keys ``epoch`` (0-based),
            ``train_loss``, ``val_accuracy``, ``precision``, ``recall``, ``f1``.

        Raises:
            RuntimeError: if ``initializeModel`` has not been called.
        """
        if self.train_loader is None or self.val_loader is None:
            raise RuntimeError("runModel() called before initializeModel().")

        #for formating so # of epochs displays properly
        temp = (self.epochs + 1) // 10
        count = 0
        while temp // 10 > 0:
            temp //= 10
            count += 1
        headerEpochFormatStr = '-----' + '-' * count
        #Table header for result data
        print('\nEpoch||Train Loss||Val Accuracy||Precision||Recall||F1 Score|')
        print(f'{headerEpochFormatStr}||----------||------------||---------||------||--------|')

        #for storing epoch metrics
        epoch_metrics = []

        for epoch in range(self.epochs):
            # Re-enter training mode every epoch: validation below switches
            # the model to eval mode, and the original only called train()
            # once, so later epochs trained with dropout disabled.
            self.model.train()
            total_loss = 0
            for batch in self.train_loader:
                # .to() is a no-op when the dataset already placed the
                # tensors on self.device.
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                self.optimizer.zero_grad()
                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                self.optimizer.step()

            avg_train_loss = total_loss / len(self.train_loader)

            # Validation
            val_predictions = []
            val_true_labels = []
            val_accuracy = 0
            total_val_samples = 0
            self.model.eval()

            with torch.no_grad():
                for batch in self.val_loader:
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)

                    outputs = self.model(input_ids, attention_mask=attention_mask)
                    predicted = outputs.logits.argmax(dim=1)

                    val_accuracy += (predicted == labels).sum().item()
                    total_val_samples += labels.size(0)
                    val_predictions.extend(predicted.cpu().numpy())
                    val_true_labels.extend(labels.cpu().numpy())

            # Convert lists to numpy arrays for sklearn metrics
            val_predictions = np.array(val_predictions)
            val_true_labels = np.array(val_true_labels)

            # Calculate precision, recall, and F1 scores
            precision = precision_score(val_true_labels, val_predictions, average='weighted', zero_division=1)
            recall = recall_score(val_true_labels, val_predictions, average='weighted')
            f1 = f1_score(val_true_labels, val_predictions, average='weighted')
            val_accuracy /= total_val_samples

            #print eval metrics
            print(f'{epoch+1}    ||  {avg_train_loss:.4f}  ||   {val_accuracy * 100:.2f}%   || {precision:.4f}  ||{recall:.4f}|| {f1:.4f} |')

            #save metrics
            epoch_metrics.append({'epoch': epoch, 'train_loss': avg_train_loss, 'val_accuracy': val_accuracy, 'precision': precision, 'recall': recall, 'f1': f1})

        #return metric results
        # NOTE(review): the second element is the instance itself, which looks
        # like a typo; it is kept so the list layout callers see is unchanged.
        parameters = [self.batch_size, self, self.max_len, self.epochs, self.test_size, self.random_state, self.learn_rate]
        return self.model, [parameters, epoch_metrics]
    #---------------------------------vvvvvvvvvvvvvvvv------------------------------------<<<

## Initial Basic Testing

In [4]:
# Train the model once with the settings below, then append the per-epoch
# results and an incremented version tag to ./bertModelResults.txt.
test_model = bertModel()

#parameters
t_batch_size = 64  # keep below 64
t_max_len = 128  # keep below 128
t_epochs = 3
#hyperparameters
t_test_size = 0.2
t_random_state = 42
t_learn_rate = 5e-5 #between 1e-5 and 1e-4


#initialize
test_model.initializeModel(t_batch_size, t_max_len, t_epochs, t_test_size, t_random_state, t_learn_rate)

start_date = datetime.now()
print(f"\nStart Time: {start_date}\n-----------------------")
#run the model
trained_test_model, performance = test_model.runModel()

end_date = datetime.now()
t_runtime = (end_date-start_date)
print(f"\nEnd Time: {end_date}, Total Runtime: {t_runtime}")

#retrieve results
# performance[1] is the list of per-epoch metric dicts returned by runModel().
_metric_rows = "".join(
    f"\n{m['epoch']}    ||  {m['train_loss']:.4f}  ||   {m['val_accuracy'] * 100:.2f}%   || {m['precision']:.4f}  ||{m['recall']:.4f}|| {m['f1']:.4f} |"
    for m in performance[1]
)
results = (
    f"\n{end_date} || Runtime:{t_runtime}"
    f"\n-Parameters= batch_size:{t_batch_size}, max_len:{t_max_len}, epochs:{t_epochs}"
    f"\n-Hyperarameters= test_size:{t_test_size}, random_state:{t_random_state}, learn_rate:{t_learn_rate}"
    "\nEpoch||Train Loss||Val Accuracy||Precision||Recall||F1 Score|"
    # The separator row now has all six columns, matching the header.
    "\n-----||----------||------------||---------||------||--------|"
    + _metric_rows
)

RESULTS_PATH = './bertModelResults.txt'
VERSION_PREFIX = 'Emotion Detection BERT Model v2.'
VERSION_SUFFIX = '.0'


def _next_results_version(path):
    """Return the minor version number to use for the next saved run.

    Scans the results file from the end for the most recent line of the form
    ``Emotion Detection BERT Model v2.<n>.0`` and returns ``n + 1``. Returns 0
    when the file does not exist or contains no such line, so the very first
    run can be saved (the original failed with ``int('')`` in that case).
    """
    try:
        with open(path, 'r') as previous:
            lines = previous.read().splitlines()
    except FileNotFoundError:
        return 0

    for line in reversed(lines):
        line = line.strip()
        if line.startswith(VERSION_PREFIX) and line.endswith(VERSION_SUFFIX):
            number = line[len(VERSION_PREFIX):-len(VERSION_SUFFIX)]
            if number.isdigit():
                return int(number) + 1
    return 0


#save results
try:
    version = _next_results_version(RESULTS_PATH)
    # 'a' creates the file when missing; the context manager guarantees the
    # handle is closed even if the write fails.
    with open(RESULTS_PATH, 'a') as file:
        file.write(f'\n{results}\nEmotion Detection BERT Model v2.{version}.0')
except OSError as e:
    # Saving is best-effort: report the problem but keep the notebook running.
    print("\nSave Fail: ")
    print(f"Error: {e}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Parameters Set!
------------------
batch_size:64
max_len:128
epochs:3

Hyperparameters Set!
------------------
test_size:0.2
random_state:42
learn_rate:5e-05

Model Initialized!


Start Time: 2024-04-19 17:52:31.384571
-----------------------

Epoch||Train Loss||Val Accuracy||Precision||Recall||F1 Score|
-----||----------||------------||---------||------||--------|
1    ||  1.6603  ||   43.00%   || 0.6810  ||0.4300|| 0.3418 |
2    ||  1.3020  ||   58.50%   || 0.7546  ||0.5850|| 0.4747 |
3    ||  0.8801  ||   70.00%   || 0.7269  ||0.7000|| 0.6647 |

End Time: 2024-04-19 17:53:31.434327, Total Runtime: 0:01:00.049756


## Current Results


Displays the results of all recorded runs.

In [5]:
# Print every recorded run. The context manager closes the file even if
# read() raises, which the manual open()/close() pair did not.
with open('./bertModelResults.txt', 'r') as total_data:
    total_data_str = total_data.read()
print(total_data_str)

2024-04-19 17:46:05.216188 || Runtime:0:01:04.555742
Parameters= batch_size:64, max_len:128, epochs:3
Hyperarameters= test_size:0.2, random_state:42, learn_rate:5e-05
Epoch||Train Loss||Val Accuracy||Precision||Recall||F1 Score|
-----||----------||------------||---------||------||
0    ||  1.5952  ||   44.50%   || 0.6488  ||0.4450|| 0.3586 |
1    ||  1.3514  ||   57.00%   || 0.7295  ||0.5700|| 0.4588 |
2    ||  0.9027  ||   67.00%   || 0.6582  ||0.6700|| 0.6175 |
Emotion Detection BERT Model v2.0.0

2024-04-19 17:53:31.434327 || Runtime:0:01:00.049756
-Parameters= batch_size:64, max_len:128, epochs:3
-Hyperarameters= test_size:0.2, random_state:42, learn_rate:5e-05
Epoch||Train Loss||Val Accuracy||Precision||Recall||F1 Score|
-----||----------||------------||---------||------||
0    ||  1.6603  ||   43.00%   || 0.6810  ||0.4300|| 0.3418 |
1    ||  1.3020  ||   58.50%   || 0.7546  ||0.5850|| 0.4747 |
2    ||  0.8801  ||   70.00%   || 0.7269  ||0.7000|| 0.6647 |
Emotion Detection BERT Mo

## Calculate Optimal Parameters
### Metrics:
- ????
