In [1]:
import os
import sys

# Get the absolute path to the src folder
src_path = os.path.abspath(os.path.join(os.getcwd(), '../src'))

# Add src to sys.path
sys.path.append(src_path)

print(f'Added {src_path} to sys.path')

Added /home/josecaloca/multiclass-text-classification/src to sys.path


In [2]:
from dotenv import load_dotenv

load_dotenv('./../settings.env')

True

In [3]:
from datasets import load_dataset

from config import config

tokenized_dataset_dict = load_dataset(config.hf_dataset_registry)

In [5]:
from datasets import DatasetDict

# Define fraction reduction mapping for each split
sample_fractions = {
    'train': config.frac_sample_reduction_training,
    'validation': config.frac_sample_reduction_training,
    'test': config.frac_sample_reduction_training,
}

# Apply subsampling to each dataset split
tokenized_dataset_dict = DatasetDict(
    {
        split: dataset.shuffle(seed=config.random_state).select(
            range(int(dataset.num_rows * sample_fractions[split]))
        )
        for split, dataset in tokenized_dataset_dict.items()
    }
)

# Print new dataset sizes
print({split: ds.num_rows for split, ds in tokenized_dataset_dict.items()})

{'train': 337, 'validation': 42, 'test': 42}


In [6]:
import numpy as np
import torch
from datasets import DatasetDict
from transformers import DistilBertModel

# Load DistilBERT model
model = DistilBertModel.from_pretrained('distilbert/distilbert-base-uncased')
model.eval()


def extract_embeddings(batch):
    # Convert input_ids and attention_mask to tensors (handling variable-length sequences)
    input_ids = [torch.tensor(seq) for seq in batch['input_ids']]
    attention_mask = [torch.tensor(seq) for seq in batch['attention_mask']]

    # Pad sequences to the longest length within the batch
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_ids, batch_first=True, padding_value=0
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        attention_mask, batch_first=True, padding_value=0
    )

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Mean Pooling over tokens
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()

    return {'embeddings': embeddings}


# Apply function to extract embeddings
tokenized_dataset_dict = tokenized_dataset_dict.map(
    extract_embeddings, batched=True, batch_size=32
)

# Convert to NumPy arrays for XGBoost training
X_train = np.array(tokenized_dataset_dict['train']['embeddings'])
y_train = np.array(tokenized_dataset_dict['train']['label'])

X_val = np.array(tokenized_dataset_dict['validation']['embeddings'])
y_val = np.array(tokenized_dataset_dict['validation']['label'])

X_test = np.array(tokenized_dataset_dict['test']['embeddings'])
y_test = np.array(tokenized_dataset_dict['test']['label'])

print('Training set shape:', X_train.shape, y_train.shape)
print('Validation set shape:', X_val.shape, y_val.shape)
print('Test set shape:', X_test.shape, y_test.shape)

2025-03-14 16:18:26.515366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-14 16:18:26.515450: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-14 16:18:26.549611: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-14 16:18:26.628651: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/337 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Training set shape: (337, 768) (337,)
Validation set shape: (42, 768) (42,)
Test set shape: (42, 768) (42,)


In [20]:
import pandas as pd

# Convert training data
df_train = pd.DataFrame(X_train)
df_train['label'] = y_train

# Convert validation data
df_val = pd.DataFrame(X_val)
df_val['label'] = y_val

# Convert test data
df_test = pd.DataFrame(X_test)
df_test['label'] = y_test

# Print shapes
print('Train DataFrame shape:', df_train.shape)
print('Validation DataFrame shape:', df_val.shape)
print('Test DataFrame shape:', df_test.shape)

Train DataFrame shape: (337, 769)
Validation DataFrame shape: (42, 769)
Test DataFrame shape: (42, 769)


In [26]:
import pandas as pd
import xgboost as xgb

# Convert embeddings to DataFrames
df_train = pd.DataFrame(X_train)
df_train['label'] = y_train

df_val = pd.DataFrame(X_val)
df_val['label'] = y_val

df_test = pd.DataFrame(X_test)
df_test['label'] = y_test

# Separate features and labels
X_train, y_train = df_train.drop(columns=['label']), df_train['label']
X_val, y_val = df_val.drop(columns=['label']), df_val['label']
X_test, y_test = df_test.drop(columns=['label']), df_test['label']

# Define XGBoost model for multiclass classification
model = xgb.XGBClassifier(
    objective='multi:softmax',  # Multiclass classification
    num_class=4,  # Number of classes
    eval_metric='mlogloss',  # Multiclass log loss
    eta=0.1,  # Learning rate
    max_depth=6,  # Tree depth
    use_label_encoder=False,
)

# Train the model
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Evaluate on the test set
test_accuracy = model.score(X_test, y_test)
print('Test Accuracy:', test_accuracy)

Parameters: { "use_label_encoder" } are not used.



[0]	validation_0-mlogloss:1.28713
[1]	validation_0-mlogloss:1.20621
[2]	validation_0-mlogloss:1.14320
[3]	validation_0-mlogloss:1.09335
[4]	validation_0-mlogloss:1.03713
[5]	validation_0-mlogloss:0.99101
[6]	validation_0-mlogloss:0.94520
[7]	validation_0-mlogloss:0.91010
[8]	validation_0-mlogloss:0.87892
[9]	validation_0-mlogloss:0.84943
[10]	validation_0-mlogloss:0.81986
[11]	validation_0-mlogloss:0.79927
[12]	validation_0-mlogloss:0.77653
[13]	validation_0-mlogloss:0.75074
[14]	validation_0-mlogloss:0.73523
[15]	validation_0-mlogloss:0.71833
[16]	validation_0-mlogloss:0.70347
[17]	validation_0-mlogloss:0.69272
[18]	validation_0-mlogloss:0.68468
[19]	validation_0-mlogloss:0.66858
[20]	validation_0-mlogloss:0.66532
[21]	validation_0-mlogloss:0.65765
[22]	validation_0-mlogloss:0.64344
[23]	validation_0-mlogloss:0.63722
[24]	validation_0-mlogloss:0.62654
[25]	validation_0-mlogloss:0.62274
[26]	validation_0-mlogloss:0.61239
[27]	validation_0-mlogloss:0.60781
[28]	validation_0-mlogloss:0.5

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer

# Load pretrained DistilBERT
model_name = config.pre_trained_bert_model
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
bert_model = DistilBertModel.from_pretrained(model_name)

# Ensure model is in evaluation mode
bert_model.eval()

In [None]:
config.pre_trained_bert_model

In [None]:
for id in range(10):
    print(len(tokenized_dataset_dict['train']['input_ids'][id]))

In [None]:
import numpy as np
from datasets import DatasetDict
from torch.nn.utils.rnn import pad_sequence


def extract_embeddings(dataset_dict: DatasetDict, model, batch_size=32):
    """Extract DistilBERT embeddings for input_ids in a DatasetDict, handling variable-length sequences."""
    model.eval()
    embeddings = {}

    with torch.no_grad():
        for split in dataset_dict.keys():
            all_embeddings = []
            all_labels = []
            dataset = dataset_dict[split]

            for i in range(0, len(dataset), batch_size):
                batch = dataset[i : i + batch_size]

                # Convert to torch tensors
                input_ids = [torch.tensor(ids) for ids in batch['input_ids']]
                attention_mask = [
                    torch.tensor(mask) for mask in batch['attention_mask']
                ]

                # Pad sequences to max length in batch
                input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
                attention_mask = pad_sequence(
                    attention_mask, batch_first=True, padding_value=0
                )

                # Pass through BERT model
                outputs = model(input_ids, attention_mask=attention_mask)
                last_hidden_states = outputs.last_hidden_state

                # Extract [CLS] token (first token) as the embedding
                cls_embeddings = last_hidden_states[:, 0, :].cpu().numpy()

                all_embeddings.append(cls_embeddings)
                all_labels.extend(batch['label'])

            embeddings[split] = {
                'X': np.vstack(all_embeddings),
                'y': np.array(all_labels),
            }

    return embeddings


# Extract embeddings
embeddings_dict = extract_embeddings(tokenized_dataset_dict, bert_model)

In [None]:
input_ids = [
    torch.tensor(tokenized_dataset_dict['train']['input_ids'][0]),
    torch.tensor(tokenized_dataset_dict['train']['input_ids'][1]),
]
attention_mask = [
    torch.tensor(tokenized_dataset_dict['train']['attention_mask'][0]),
    torch.tensor(tokenized_dataset_dict['train']['attention_mask'][1]),
]

In [None]:
input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)

In [None]:
outputs = bert_model(input_ids, attention_mask=attention_mask)

In [None]:
last_hidden_states = outputs.last_hidden_state

In [None]:
last_hidden_states[:, 0, :].cpu().detach().numpy()

In [None]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer

# Load DistilBERT tokenizer and model
model_name = 'distilbert/distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

# Example text
text = ['This is an example sentence.', 'Another sentence for embedding extraction.']

# Tokenize input text
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

# Forward pass to get embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Extract embeddings from the last hidden state
embeddings = (
    outputs.last_hidden_state
)  # Shape: (batch_size, sequence_length, hidden_size)

# Optionally, use the mean of all token embeddings as a sentence embedding
sentence_embeddings = embeddings.mean(dim=1)  # Shape: (batch_size, hidden_size)

print('Embedding shape:', sentence_embeddings.shape)  # Should be (num_sentences, 768)

In [None]:
inputs

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'josecaloca/multiclass-text-classification'
)
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
from transformers import pipeline

classifier = pipeline(
    'sentiment-analysis', model='josecaloca/multiclass-text-classification'
)
classifier(text)