In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import os
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
torch.manual_seed(42)
np.random.seed(42)

In [3]:
from google.colab import drive

drive.mount('/content/drive')

# Specify the file path in your Google Drive
file_path = '/content/drive/StonyBrook/Conversational MBTI Classifier/Train.csv' 


ModuleNotFoundError: No module named 'google'

In [6]:
BATCH_SIZE = 16
EPOCHS = 3

TEST_PATH = "testdata.csv"
TRAIN_PATH= "traindata.csv"
EVAL_PATH = "evaldata.csv"
SAVE_PATH= "models/Big5"

In [7]:
def import_data(path):
    dataset = pd.read_csv(path)
    return dataset
train_data = import_data(TRAIN_PATH)
test_data = import_data(TEST_PATH)
eval_data = import_data(EVAL_PATH)

In [8]:
class Big5Classifier:
    def __init__(self, model_name='distilbert-base-uncased', num_classes=2):
        # Initialize models here
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_classes)

    def get_tokenizer_and_model(self):
        return self.model, self.tokenizer  

In [13]:
def get_dataset_split(data, big5category):
    text_column = f"text{big5category[0]}"
    category_column = f"c{big5category}"

    # Check if the required columns exist
    if text_column in data.columns and category_column in data.columns:
        return data[[text_column, category_column]].rename(
            columns={text_column: "essay", category_column: "decision"}
        )    
    else:
        raise ValueError(f"Columns for category '{big5category}' are not present in the dataset.")
    

In [12]:
class DatasetLoader(Dataset):
    #cateogry can be "OPN", "CON", "EXT", "AGR", "NEU"
    def __init__(self, data, big5category, tokenizer):
        self.data = get_dataset_split(data, big5category)
        self.tokenizer = tokenizer
        self.big5category = big5category
        self.questionMap =  {"OPN":"This is a question about openness: Describe a time when you tried something completely new—whether it was a different activity, way of thinking, or environment. What motivated you to try it, and how did you feel about the experience afterward? ",
                "CON":"This is a question about conscientiousness: Think of a goal you set for yourself that required sustained effort over time. How did you manage your time and resources to stay on track, and what strategies helped you stay committed, even when challenges came up? What did you find challenging or rewarding about the experience?",
                "EXT":"This is a question about extraversion: Recall a memorable social experience that either energized you or left you feeling drained. What do you think made the interaction fulfilling or draining? How did it shape your understanding of your social preferences or needs? ",
                "AGR":"This is a question about agreeableness: Describe a situation where you found yourself in disagreement with someone. How did you handle the situation, and what were your priorities in resolving or understanding the conflict? ",
                "NEU":"This is a question about neuroticism: Think of a time when you felt particularly stressed or anxious. How did you respond initially, and what steps did you take to manage your emotions and approach the situation constructively? "
               }
    
    def tokenize_data(self):
        print("Processing data")
        tokens = []
        labels = []
        label_dict = {'y': 1, 'n': 0}

        answers = self.data['essay'].to_list() 
        label_list = self.data['decision'].to_list()

        for (answer, decision) in tqdm(zip(answers, label_list), total = len(answers)):
            #concatenate the question asked before the answer.
            qa_concat = f"{self.questionMap[self.big5category]} [SEP] {answer} "
            encoded_text = self.tokenizer.encode(qa_concat, max_length = 400, truncation = True)
            tokens.append(torch.tensor(encoded_text))
            labels.append(label_dict[decision])
        
        tokens = pad_sequence(tokens, batch_first = True)
        labels = torch.tensor(labels)
        return TensorDataset(tokens, labels)
    
    def get_data_loaders(self, batch_size = 32, shuffle = True):
        processed_dataset = self.tokenize_data()

        data_loader = DataLoader(
            processed_dataset,
            shuffle=shuffle,
            batch_size=batch_size
        )

        return data_loader

In [30]:
class Trainer:
    def __init__(self, big5category, train_data, eval_data, batch_size, epochs, device, save_path, training_type):
        """
        Initializes the Trainer class with training and evaluation parameters.
        Args:
        - model: PyTorch model to be trained (e.g., DistilBERT).
        - train_data: Training dataset.
        - eval_data: Evaluation dataset.
        - batch_size: Batch size for DataLoader.
        - epochs: Number of training epochs.
        - optimizer: Optimizer (e.g., Adam).
        - criterion: Loss function (e.g., CrossEntropyLoss).
        - device: Device to run the model on (e.g., 'cuda' or 'cpu').
        - savepath: Path to save the model after training.
        - training_type: Type of training ("fully_frozen", "top_4_training", "bottom_4_training", "all_training").
        """
        transformer = Big5Classifier()
        self.model, self.tokenizer = transformer.get_tokenizer_and_model()
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = device
        self.save_path = save_path
        self.training_type = training_type

        # Create DataLoaders from datasets
        self.train_dataset = DatasetLoader(train_data, big5category, self.tokenizer)
        self.eval_dataset = DatasetLoader(eval_data, big5category, self.tokenizer)

        # Send model to device
        self.model.to(self.device)
        
        # Set the training parameters based on the training type
        self.set_training_parameters()

    def set_training_parameters(self):
        """
        Freezes or unfreezes model parameters based on the specified training type.
        """
        # Freeze all parameters by default
        for param in self.model.parameters():
            param.requires_grad = False

        if self.training_type == "fully_frozen":
            # Only the classifier head is trainable
            for param in self.model.classifier.parameters():
                param.requires_grad = True
        elif self.training_type == "top_4_training":
            # Unfreeze the top 4 layers of the transformer and the classifier
            for layer in range(2, 6):  # Loop through top 4 layers
                for param in self.model.distilbert.transformer.layer[layer].parameters():
                    param.requires_grad = True
            for param in self.model.classifier.parameters():
                param.requires_grad = True
        elif self.training_type == "bottom_4_training":
            # Unfreeze the bottom 4 layers of the transformer and the classifier
            for layer in range(0, 4):  # Loop through bottom 4 layers
                for param in self.model.distilbert.transformer.layer[layer].parameters():
                    param.requires_grad = True
            for param in self.model.classifier.parameters():
                param.requires_grad = True
        elif self.training_type == "all_training":
            # Unfreeze all layers
            for param in self.model.parameters():
                param.requires_grad = True

        print(f"Training type set to: {self.training_type}")

    def get_performance_metrics(self, preds, labels):
        """
        Calculate performance metrics: accuracy, precision, recall, and F1-score.
        Args:
        - preds: Predictions from the model.
        - labels: Ground truth labels.
        Returns:
        - metrics: Dictionary containing accuracy, precision, recall, and F1-score.
        """
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()

        accuracy = accuracy_score(labels_flat, pred_flat)
        precision = precision_score(labels_flat, pred_flat, zero_division=0, average="weighted")
        recall = recall_score(labels_flat, pred_flat, zero_division=0, average="weighted")
        f1 = f1_score(labels_flat, pred_flat, zero_division=0, average="weighted")
        
        return precision, recall, f1, accuracy

    def train(self, data_loader, optimizer):
        self.model.train()
        total_recall = 0
        total_precision = 0
        total_f1 = 0
        total_accuracy = 0
        total_loss = 0

        for batch_idx, (reviews, labels) in enumerate(tqdm(data_loader)):
            self.model.zero_grad()
            reviews, labels = reviews.to(self.device), labels.to(self.device)
            outputs = self.model(reviews, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            preds = outputs.logits.detach().cpu().numpy()
            labels_np = labels.cpu().numpy()
            precision, recall, f1, accuracy = self.get_performance_metrics(preds, labels_np)
            total_precision += precision
            total_recall += recall
            total_accuracy += accuracy
            total_f1 += f1

        precision = total_precision/len(data_loader)
        recall = total_recall/len(data_loader)
        f1 = total_f1/len(data_loader)
        accuracy = total_accuracy/len(data_loader)
        loss = total_loss/len(data_loader)

        return precision, recall, f1, accuracy, loss

    def evaluate(self, data_loader):
        """
        Evaluates the model on the validation dataset.
        """
        self.model.eval()  # Set the model to evaluation mode
        total_recall = 0
        total_precision = 0
        total_f1 = 0
        total_loss = 0
        total_accuracy = 0

        with torch.no_grad():
            for (answers, labels) in tqdm(data_loader):
                self.model.zero_grad()
                answers, labels = answers.to(self.device), labels.to(self.device)
                outputs = self.model(answers, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()

                preds = outputs.logits.detach().cpu().numpy()
                labels_np = labels.cpu().numpy()
                precision, recall, f1, accuracy = self.get_performance_metrics(preds, labels_np)
                total_precision += precision
                total_recall += recall
                total_accuracy += accuracy
                total_f1 += f1

            precision = total_precision/len(data_loader)
            recall = total_recall/len(data_loader)
            f1 = total_f1/len(data_loader)
            accuracy = total_accuracy/len(data_loader)
            loss = total_loss/len(data_loader)

        return precision, recall, f1, accuracy, loss
    
    def save_transformer(self):
        self.model.save_pretrained(self.save_path)
        self.tokenizer.save_pretrained(self.save_path)

    def execute(self):
        last_best = 0
        train_data_loader = self.train_dataset.get_data_loaders(self.batch_size)
        val_data_loader = self.eval_dataset.get_data_loaders(self.batch_size)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr = 3e-5, eps = 1e-8)
        self.set_training_parameters()
        for epoch_i in range(0, self.epochs):
            train_precision, train_recall, train_f1, train_accuracy, train_loss = self.train(train_data_loader, optimizer)
            print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_accuracy: {train_accuracy:.4f}train_f1: {train_f1:.4f}')
            val_precision, val_recall, val_f1, val_accuracy, val_loss = self.evaluate(val_data_loader)
            print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_accuracy: {val_accuracy:.4f} val_f1: {val_f1:.4f}')

            if val_f1 > last_best:
                print("Saving model..")
                self.save_transformer()
                last_best = val_f1
                print("Model saved.")

In [31]:
trainer = Trainer(
    big5category="OPN",
    batch_size=BATCH_SIZE,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    epochs=EPOCHS,
    save_path=SAVE_PATH + '_fully_frozen',
    training_type='fully_frozen',
    train_data=train_data,
    eval_data=eval_data
)
trainer.execute()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training type set to: fully_frozen
Processing data


100%|██████████| 1301/1301 [00:00<00:00, 5120.85it/s]


Processing data


100%|██████████| 300/300 [00:00<00:00, 4346.80it/s]


Training type set to: fully_frozen


100%|██████████| 82/82 [00:01<00:00, 51.27it/s]


Epoch 1: train_loss: 0.6945 train_precision: 0.5348 train_recall: 0.4965 train_accuracy: 0.4965train_f1: 0.4899


100%|██████████| 19/19 [00:00<00:00, 49.99it/s]


Epoch 1: val_loss: 0.6921 val_precision: 0.5804 val_recall: 0.5471 val_accuracy: 0.5471 val_f1: 0.5338
Saving model..
Model saved.


100%|██████████| 82/82 [00:01<00:00, 51.30it/s]


Epoch 2: train_loss: 0.6915 train_precision: 0.5630 train_recall: 0.5280 train_accuracy: 0.5280train_f1: 0.5265


100%|██████████| 19/19 [00:00<00:00, 50.49it/s]


Epoch 2: val_loss: 0.6910 val_precision: 0.6517 val_recall: 0.6140 val_accuracy: 0.6140 val_f1: 0.6064
Saving model..
Model saved.


100%|██████████| 82/82 [00:01<00:00, 52.05it/s]


Epoch 3: train_loss: 0.6901 train_precision: 0.5750 train_recall: 0.5323 train_accuracy: 0.5323train_f1: 0.5296


100%|██████████| 19/19 [00:00<00:00, 51.54it/s]


Epoch 3: val_loss: 0.6901 val_precision: 0.6623 val_recall: 0.6393 val_accuracy: 0.6393 val_f1: 0.6337
Saving model..
Model saved.


In [None]:
trainer = Trainer(
    big5category="OPN",
    batch_size=BATCH_SIZE,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    epochs=EPOCHS,
    save_path=SAVE_PATH + 'top_4_training',
    training_type='top_4_training',
    train_data=train_data,
    eval_data=eval_data
)
trainer.execute()


In [None]:
trainer = Trainer(
    big5category="OPN",
    batch_size=BATCH_SIZE,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    epochs=EPOCHS,
    save_path=SAVE_PATH + 'bottom_4_training',
    training_type='bottom_4_training',
    train_data=train_data,
    eval_data=eval_data
)
trainer.execute()


In [None]:
trainer = Trainer(
    big5category="OPN",
    batch_size=BATCH_SIZE,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    epochs=EPOCHS,
    save_path=SAVE_PATH + 'all_training',
    training_type='all_training',
    train_data=train_data,
    eval_data=eval_data
)
trainer.execute()
