In [167]:
#!pip install datasets
#!pip install transformers

In [168]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import torch
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import AutoTokenizer

In [169]:
# Load the simplified version of GoEmotions
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

# Access train, validation, and test splits
train_data = dataset["train"]
validation_data = dataset["validation"]
test_data = dataset["test"]

In [170]:
# Convert to pandas DataFrames
train_df = train_data.to_pandas()
validation_df = validation_data.to_pandas()
test_df = test_data.to_pandas()

# Inspect the DataFrame
print("Training Data Info:")
train_df.info()
print("\nSample Data:", train_df.head(), "\n\n\n")

print("Validation Data Info:")
validation_df.info()
print("\nSample Data:", validation_df.head(), "\n\n\n")

print("Testing Data Info:")
test_df.info()
print("\nSample Data:", test_df.head(), "\n\n\n")

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    43410 non-null  object
 1   labels  43410 non-null  object
 2   id      43410 non-null  object
dtypes: object(3)
memory usage: 1017.5+ KB

Sample Data:                                                 text labels       id
0  My favourite food is anything I didn't have to...   [27]  eebbqej
1  Now if he does off himself, everyone will thin...   [27]  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj
3                        To make her feel threatened   [14]  ed7ypvh
4                             Dirty Southern Wankers    [3]  ed0bdzj 



Validation Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5426

In [171]:
# Check for null values
print("Missing Values:")
print("\nTraining data frame: \n", train_df.isnull().sum())
print("\nValidation data frame: \n", train_df.isnull().sum())
print("\nTesting data frame: \n", train_df.isnull().sum())

Missing Values:

Training data frame: 
 text      0
labels    0
id        0
dtype: int64

Validation data frame: 
 text      0
labels    0
id        0
dtype: int64

Testing data frame: 
 text      0
labels    0
id        0
dtype: int64


In [172]:
# Delete multiple labels

# Define a helper function to extract the single label
def extract_label(label_list):
  return label_list[0]

# Filter rows with exactly one label and extract the single label
train_df = train_df[train_df['labels'].apply(len) == 1]
train_df.loc[:, 'labels'] = train_df['labels'].apply(extract_label)

validation_df = validation_df[validation_df['labels'].apply(len) == 1]
validation_df.loc[:, 'labels'] = validation_df['labels'].apply(extract_label)

test_df = test_df[test_df['labels'].apply(len) == 1]
test_df.loc[:, 'labels'] = test_df['labels'].apply(extract_label)

# Check shapes after filtering
print("Filtered Training Data Shape:", train_df.shape)
print("\nFiltered Validation Data Shape:", validation_df.shape)
print("\nFiltered Testing Data Shape:", test_df.shape)

# Verify the changes
print("\nSample from Training Data:\n", train_df.head())

print("\nSample from Validation Data:\n", validation_df.head())

print("\nSample from Test Data:\n", test_df.head())

Filtered Training Data Shape: (36308, 3)

Filtered Validation Data Shape: (4548, 3)

Filtered Testing Data Shape: (4590, 3)

Sample from Training Data:
                                                 text  labels       id
0  My favourite food is anything I didn't have to...      27  eebbqej
1  Now if he does off himself, everyone will thin...      27  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING       2  eezlygj
3                        To make her feel threatened      14  ed7ypvh
4                             Dirty Southern Wankers       3  ed0bdzj

Sample from Validation Data:
                                                 text  labels       id
0  Is this in New Orleans?? I really feel like th...      27  edgurhb
2               I've never been this sad in my life!      25  edcu99z
4  He could have easily taken a real camera from ...      20  eepig6r
5  Thank you for your vote of confidence, but we ...      15  eczm50f
6  Wah Mum other people call me on my bullshit

  train_df.loc[:, 'labels'] = train_df['labels'].apply(extract_label)
  validation_df.loc[:, 'labels'] = validation_df['labels'].apply(extract_label)
  test_df.loc[:, 'labels'] = test_df['labels'].apply(extract_label)


In [173]:
print(train_df['labels'].unique())  # Show all unique values in the column
print(train_df['labels'].dtype)

[27  2 14  3 26 15  0  6  5 12 17 25 10 20  4 13  1  9 24 18  7 22 11 23
 21 16  8 19]
int64


In [174]:
# Conversion of label objects to Integers
train_df['labels'] = train_df['labels'].astype(int)
validation_df['labels'] = validation_df['labels'].astype(int)
test_df['labels'] = test_df['labels'].astype(int)

# Confirm the data type
print(train_df['labels'].dtype)  # Should now show 'int64'

int32


In [175]:
# Final check
# Check shapes after filtering
print("Filtered Training Data Shape:", train_df.shape)
print("\nFiltered Validation Data Shape:", validation_df.shape)
print("\nFiltered Testing Data Shape:", test_df.shape)

# Verify the changes
print("\nSample from Training Data:\n", train_df.head())

print("\nSample from Validation Data:\n", validation_df.head())

print("\nSample from Test Data:\n", test_df.head())

Filtered Training Data Shape: (36308, 3)

Filtered Validation Data Shape: (4548, 3)

Filtered Testing Data Shape: (4590, 3)

Sample from Training Data:
                                                 text  labels       id
0  My favourite food is anything I didn't have to...      27  eebbqej
1  Now if he does off himself, everyone will thin...      27  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING       2  eezlygj
3                        To make her feel threatened      14  ed7ypvh
4                             Dirty Southern Wankers       3  ed0bdzj

Sample from Validation Data:
                                                 text  labels       id
0  Is this in New Orleans?? I really feel like th...      27  edgurhb
2               I've never been this sad in my life!      25  edcu99z
4  He could have easily taken a real camera from ...      20  eepig6r
5  Thank you for your vote of confidence, but we ...      15  eczm50f
6  Wah Mum other people call me on my bullshit

In [176]:
# Preprocessing for Baseline

# Initialize a Vectorizer
#vectorizer_baseline = CountVectorizer()
vectorizer_baseline = TfidfVectorizer()

# Fit the vectorizer on training data and transform all splits
X_train_baseline = vectorizer_baseline.fit_transform(train_df['text'])
X_validation_baseline = vectorizer_baseline.transform(validation_df['text'])
X_test_baseline = vectorizer_baseline.transform(test_df['text'])

# Labels (already preprocessed)
y_train_baseline = train_df['labels'].to_numpy()  # Convert to NumPy array
y_validation_baseline = validation_df['labels'].to_numpy()
y_test_baseline = test_df['labels'].to_numpy()

# Verify feature matrix shapes
print("Baseline Training Feature Matrix Shape:", X_train_baseline.shape)
print("Baseline Validation Feature Matrix Shape:", X_validation_baseline.shape)
print("Baseline Testing Feature Matrix Shape:", X_test_baseline.shape)
print("Baseline Training label Matrix Shape:", y_train_baseline.shape)
print("Baseline Validation label Matrix Shape:", y_validation_baseline.shape)
print("Baseline Testing label Matrix Shape:", y_test_baseline.shape)

Baseline Training Feature Matrix Shape: (36308, 24311)
Baseline Validation Feature Matrix Shape: (4548, 24311)
Baseline Testing Feature Matrix Shape: (4590, 24311)
Baseline Training label Matrix Shape: (36308,)
Baseline Validation label Matrix Shape: (4548,)
Baseline Testing label Matrix Shape: (4590,)


In [177]:
# Preprocessing for Bayesian

# Initialize a Vectorizer
vectorizer_bayesian = CountVectorizer()

# Fit the vectorizer on training data and transform all splits
X_train_bayesian = vectorizer_bayesian.fit_transform(train_df['text'])
X_validation_bayesian = vectorizer_bayesian.transform(validation_df['text'])
X_test_bayesian = vectorizer_bayesian.transform(test_df['text'])

# Labels (already preprocessed)
y_train_bayesian = train_df['labels'].to_numpy()  # Convert to NumPy array
y_validation_bayesian = validation_df['labels'].to_numpy()
y_test_bayesian = test_df['labels'].to_numpy()

# Verify feature matrix shapes
print("Naive Bayes Training Feature Matrix Shape:", X_train_bayesian.shape)
print("Naive Bayes Validation Feature Matrix Shape:", X_validation_bayesian.shape)
print("Naive Bayes Testing Feature Matrix Shape:", X_test_bayesian.shape)
print("Naive Bayes Training label Matrix Shape:", y_train_bayesian.shape)
print("Naive Bayes Validation label Matrix Shape:", y_validation_bayesian.shape)
print("Naive Bayes Testing label Matrix Shape:", y_test_bayesian.shape)

Naive Bayes Training Feature Matrix Shape: (36308, 24311)
Naive Bayes Validation Feature Matrix Shape: (4548, 24311)
Naive Bayes Testing Feature Matrix Shape: (4590, 24311)
Naive Bayes Training label Matrix Shape: (36308,)
Naive Bayes Validation label Matrix Shape: (4548,)
Naive Bayes Testing label Matrix Shape: (4590,)


In [178]:
# Class distributions

def class_distribtion(label_set):
  labels, counts = np.unique(label_set, return_counts=True)

  # Calculate frequencies as percentages
  frequencies = (counts / counts.sum()) * 100

  # Create a DataFrame to store counts and frequencies
  class_distribution_table = pd.DataFrame({
    "Class Label": labels,
    "Count": counts,
    "Frequency (%)": frequencies
  })

  # Sort the table by class label for better organization
  class_distribution_table = class_distribution_table.sort_values(by="Class Label").reset_index(drop=True)
  return class_distribution_table

print("Training Label Distribution: \n", class_distribtion(y_train_bayesian))
print("\nValidation Label Distribution:\n", class_distribtion(y_validation_bayesian))
print("\nTesting Label Distribution:\n", class_distribtion(y_test_bayesian))

Training Label Distribution: 
     Class Label  Count  Frequency (%)
0             0   2710       7.463920
1             1   1652       4.549961
2             2   1025       2.823069
3             3   1451       3.996364
4             4   1873       5.158643
5             5    649       1.787485
6             6    858       2.363116
7             7   1389       3.825603
8             8    389       1.071389
9             9    709       1.952738
10           10   1402       3.861408
11           11    498       1.371599
12           12    203       0.559105
13           13    510       1.404649
14           14    430       1.184312
15           15   1857       5.114575
16           16     39       0.107414
17           17    853       2.349344
18           18   1427       3.930263
19           19     85       0.234108
20           20    861       2.371378
21           21     51       0.140465
22           22    586       1.613969
23           23     88       0.242371
24           24    

In [179]:
# Preprocessing for LLM

# Initialize a tokenizer
tokenizer_llm = AutoTokenizer.from_pretrained("bert-base-uncased") # We will use bert-base-uncased

# Tokenize text data for all splits
train_encodings = tokenizer_llm(
    list(train_df['text']), truncation=True, padding='max_length', max_length=128
)
validation_encodings = tokenizer_llm(
    list(validation_df['text']), truncation=True, padding='max_length', max_length=128
)
test_encodings = tokenizer_llm(
    list(test_df['text']), truncation=True, padding='max_length', max_length=128
)

# Convert tokenized data into PyTorch tensors
X_train_llm = torch.tensor(train_encodings['input_ids'])
X_train_attention_masks = torch.tensor(train_encodings['attention_mask'])
y_train_llm = torch.tensor(train_df['labels'].to_numpy())

X_validation_llm = torch.tensor(validation_encodings['input_ids'])
X_validation_attention_masks = torch.tensor(validation_encodings['attention_mask'])
y_validation_llm = torch.tensor(validation_df['labels'].to_numpy())

X_test_llm = torch.tensor(test_encodings['input_ids'])
X_test_attention_masks = torch.tensor(test_encodings['attention_mask'])
y_test_llm = torch.tensor(test_df['labels'].to_numpy())

# Check shapes
print("Training Input IDs Shape:", X_train_llm.shape)
print("Training Attention Masks Shape:", X_train_attention_masks.shape)
print("Training Labels Shape:", y_train_llm.shape)

print("Validation Input IDs Shape:", X_validation_llm.shape)
print("Validation Attention Masks Shape:", X_validation_attention_masks.shape)
print("Validation Labels Shape:", y_validation_llm.shape)

print("Testing Input IDs Shape:", X_test_llm.shape)
print("Testing Attention Masks Shape:", X_test_attention_masks.shape)
print("Testing Labels Shape:", y_test_llm.shape)


Training Input IDs Shape: torch.Size([36308, 128])
Training Attention Masks Shape: torch.Size([36308, 128])
Training Labels Shape: torch.Size([36308])
Validation Input IDs Shape: torch.Size([4548, 128])
Validation Attention Masks Shape: torch.Size([4548, 128])
Validation Labels Shape: torch.Size([4548])
Testing Input IDs Shape: torch.Size([4590, 128])
Testing Attention Masks Shape: torch.Size([4590, 128])
Testing Labels Shape: torch.Size([4590])


In [180]:
# 2 - Implement Naive Bayes and Finetune an LLM

In [181]:
# 2.1 - Naive Bayes Model

In [182]:
def logsumexp(Z):                                                # dimension C x N
    Zmax = np.max(Z,axis=0)[None,:]                              # max over C
    log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=0))
    return log_sum_exp

class NaiveBayes: 

    def __init__(self):
        return
    
    def fit(self, x, y):
        N, D = x.shape
        C = np.max(y) + 1
        # one parameter for each feature conditioned on each class
        mu, sigma = np.zeros((C,D)), np.zeros((C,D))
        Nc = np.zeros(C) # number of instances in class c
        # for each class get the MLE for the mean and std
        for c in range(C):
            x_c = x[y == c]                           #slice all the elements from class c
            Nc[c] = x_c.shape[0]                      #get number of elements of class c
            mu[c,:] = np.mean(x_c,0)                  #mean of features of class c
            sigma[c,:] = np.std(x_c, 0)               #std of features of class c
            
        self.mu = mu                                  # C x D
        self.sigma = sigma                            # C x D
        self.pi = (Nc+1)/(N+C)                        #Laplace smoothing (using alpha_c=1 for all c) you can derive using Dirichlet's distribution
        return self

    def predict(self, xt):
        Nt, D = xt.shape
        # for numerical stability we work in the log domain
        # we add a dimension because this is added to the log-likelihood matrix 
        # that assigns a likelihood for each class (C) to each test point, and so it is C x N
        log_prior = np.log(self.pi)[:, None]
        # logarithm of the likelihood term for Gaussian 
        # the first two terms are the logarithm of the normalization term in the Gaussian and the final term is the exponent in the Gaussian. 
        # Notice that we are adding dimensions (using None) to model parameters and data to make this evaluation. 
        # The reason is that sigma and mu are C x D, while the data x is N x D. We operate on a C x N x D shape by increasing the number of dimensions when needed
        log_likelihood = -.5 * np.log(2*np.pi) - np.log(self.sigma[:,None,:]) -.5 * (((xt[None,:,:] - self.mu[:,None,:])/self.sigma[:,None,:])**2)
        # now we sum over the feature dimension to get a C x N matrix (this has the log-likelihood for each class-test point combination)
        log_likelihood = np.sum(log_likelihood, axis=2)
        # posterior calculation
        log_posterior = log_prior + log_likelihood
        posterior = np.exp(log_posterior - logsumexp(log_posterior))
        return posterior.T                                                  # dimension N x C
    
    def evaluate_acc(self, y, y_hat):
        return np.mean(y == y_hat)

In [183]:
import numpy as np

class NaiveBayesOptimized:
    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        n_samples, n_features = X.shape
        
        # Calculate prior probabilities
        for cls in self.classes:
            X_cls = X[y == cls]
            self.class_priors[cls] = X_cls.shape[0] / n_samples
            self.feature_likelihoods[cls] = {}
            
            # Calculate likelihoods
            for feature_index in range(n_features):
                feature_values = X_cls[:, feature_index]
                unique_values, counts = np.unique(feature_values, return_counts=True)
                likelihoods = counts / counts.sum()
                self.feature_likelihoods[cls][feature_index] = dict(zip(unique_values, likelihoods))

    def predict(self, X):
        n_samples, n_features = X.shape
        log_probs = np.zeros((n_samples, len(self.classes)))
        
        for idx, cls in enumerate(self.classes):
            log_prob = np.log(self.class_priors[cls])
            log_probs[:, idx] = log_prob
            
            for feature_index in range(n_features):
                feature_values = X[:, feature_index]
                likelihoods = np.array([self.feature_likelihoods[cls][feature_index].get(value, 1e-6) for value in feature_values])
                log_probs[:, idx] += np.log(likelihoods)
        
        return self.classes[np.argmax(log_probs, axis=1)]

    def evaluate_acc(self, y_true, y_pred):
        correct = np.sum(y_true == y_pred)
        return correct / len(y_true)


In [184]:
X_train_small = X_train_bayesian[:500]
y_train_small = y_train_bayesian[:500]

X_validation_small = X_validation_bayesian[:500]
y_validation_small = y_validation_bayesian[:500]

X_test_small = X_test_bayesian[:500]
y_test_small = y_test_bayesian[:500]

naive_bayes = NaiveBayesOptimized() 
naive_bayes.fit(X_train_bayesian.toarray(), y_train_bayesian)
y_hat = naive_bayes.predict(X_test_bayesian.toarray())

# Evaluate the model
accuracy = naive_bayes.evaluate_acc(y_test_bayesian, y_hat)
print(accuracy)


KeyboardInterrupt: 

In [None]:
# 2. Implement Softmax Regression from Sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize the model
softmax_regression = LogisticRegression(max_iter=1000)

# Fit the model
softmax_regression.fit(X_train_baseline, y_train_baseline)

# Predict on the validation, test set
y_pred_softmax = softmax_regression.predict(X_validation_baseline)
y_pred_test_softmax = softmax_regression.predict(X_test_baseline)

# Evaluate the model
accuracy_validation_softmax = accuracy_score(y_validation_baseline, y_pred_softmax)
accuracy_test_softmax = accuracy_score(y_test_baseline, y_pred_test_softmax)

print("Validation Accuracy (Softmax Regression):", accuracy_validation_softmax)
print("Test Accuracy (Softmax Regression):", accuracy_test_softmax)

Validation Accuracy (Softmax Regression): 0.5521108179419525
Test Accuracy (Softmax Regression): 0.5490196078431373


In [185]:
%%javascript
require.config({
    paths: {
        d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
        jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
    }
});


<IPython.core.display.Javascript object>

In [None]:
#LLM Model

from transformers import AutoModelForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from sklearn.metrics import accuracy_score


# this is the tokenizer using BERT tokenizer_llm

model_llm = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=27)

RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):


IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!

Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.

We have compiled some common reasons and troubleshooting tips at:

    https://numpy.org/devdocs/user/troubleshooting-importerror.html

Please note and check the following:

  * The Python version is: Python3.9 from "C:\Users\finnl_y\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe"
  * The NumPy version is: "1.24.1"

and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.

Original error was: DLL load failed while importing _multiarray_umath: The specified module could not be found.


In [24]:
#freeze all parameters except the last layer
for param in model_llm.bert.parameters():
    param.requires_grad = False

#the classification head (last layer) remains trainable
for param in model_llm.classifier.parameters():
    param.requires_grad = True

AttributeError: 'BertModel' object has no attribute 'bert'

In [None]:
#Create the TensorDataset and DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], y_train_llm)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

validation_dataset = TensorDataset(validation_encodings['input_ids'], validation_encodings['attention_mask'], y_validation_llm)
validation_loader = DataLoader(validation_dataset, batch_size=16)

#ensure optimizer only works on trainable layers
optimizer = AdamW(filter(lambda p: p.requires_grad, model_llm.parameters()), lr=5e-5)

In [None]:
#training loop

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_llm.to(device)

epochs = 3
for epoch in range(epochs):
    model_llm.train()
    total_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        
        optimizer.zero_grad()

        # Forward pass
        outputs = model_llm(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss  # CrossEntropyLoss is handled internally
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")


In [None]:
#evaluate on validation set

model_llm.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in validation_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        
        outputs = model_llm(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
