In [1]:
!pip install datasets peft evaluate torch numpy
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


### dataset

In [2]:
import pandas as pd

# Load the dataset into a pandas DataFrame
dataframe = pd.read_csv('/workspace/bert/text.csv')
# data = load_dataset('/content/text.csv')


In [3]:
# data validation and formatting
dataframe.rename(columns={"Unnamed: 0": "index", "text": "text", "label": "label"}, inplace = True)

In [18]:
# Handle data imbalance

# Step 1: Split the dataset
from sklearn.model_selection import train_test_split

df = dataframe
X = df['text']  # Features
y = df['label']                # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Rebalancing the Training Dataset
# Combine X_train and y_train again for easy manipulation
train_df = pd.DataFrame(X_train)
train_df['label'] = y_train


# Separating different emotion instances
sadness = train_df[train_df['label'] == 0]
joy = train_df[train_df['label'] == 1]
love = train_df[train_df['label'] == 2]
anger = train_df[train_df['label'] == 3]
fear = train_df[train_df['label'] == 4]
surprise = train_df[train_df['label'] == 5]


# Rebalance: Downsample or upsample to ensure even distribution
# We downsample for simplicity
min_size = min(len(sadness), len(joy))  # Find the size of the smaller class
min_size = min(min_size, len(love))
min_size = min(min_size, len(anger))
min_size = min(min_size, len(fear))
min_size = min(min_size, len(surprise))
# print('Minimal size:')
# print(min_size)

# random downsample both to min_size
sadness_downsampled = sadness.sample(n=min_size, random_state=42)
joy_downsampled = joy.sample(n=min_size, random_state=42)
love_downsampled = love.sample(n=min_size, random_state=42)
anger_downsampled = anger.sample(n=min_size, random_state=42)
fear_downsampled = fear.sample(n=min_size, random_state=42)
surprise_downsampled = surprise.sample(n=min_size, random_state=42)

# Combine back into a balanced training set
balanced_train_df = pd.concat([sadness_downsampled, joy_downsampled, love_downsampled, anger_downsampled, fear_downsampled, surprise_downsampled])

# balanced_train_df.head()
# balanced_train_df.tail()

# Splitting the features and target variable again
X_train_balanced = balanced_train_df['text']
y_train_balanced = balanced_train_df['label']


# This is the training data
train_df = balanced_train_df

# X_test.info()
# testing_data = X_test.copy()
validation_df = pd.DataFrame(X_test)
# Directly add y_test as a new column to testing_data and this is the validation data
validation_df['label'] = y_test.values

# Randomly shuffle data to avoid unnecessary patterns
from sklearn.utils import shuffle
train_df = shuffle(train_df, random_state=42)
validation_df = shuffle(validation_df, random_state=42)

# print('Balanced training data:')
# balanced_train_df.head(5)
# print('testing_data:')
# print(X_test.head(5))
# print(y_test.head(5))
# testing_data.head()

Unnamed: 0,text,label
155684,i feel lucky to have medical care,1
402502,i was standing inside a small room that held t...,4
230800,i always assume that im the only one who feels...,4
134843,i feel a bit distressed going over in my head ...,4
155261,i want to feel less agitated and less distress...,3
167983,i feel triumphant about this other days i want...,1
130972,i was still feeling no pain but had thankfully...,4
65777,i feel like i have so much to say but i want t...,1
243439,i do need to cook more often i feel deprived,0
231098,i guess i just feel really appreciative i thin...,1


In [None]:
# Create a dictionary structure for the training and validation datasets
dataset_dict = DatasetDict({
    'train': Dataset.from_dict({
        'text': train_df['text'].tolist(),
        'label': train_df['label'].tolist()
    }),
    'validation': Dataset.from_dict({
        'text': validation_df['text'].tolist(),
        'label': validation_df['label'].tolist()
    })
})

print(type(dataset_dict))
# dataset is now formatted and ready to be used for training

### model

In [20]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer


# Emotion labels: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5)
# define label maps
id2label = {
  0: "sadness",
  1: "joy",
  2: "love",
  3: "anger",
  4: "fear",
  5: "surprise"
}
label2id = {
  "sadness": 0,
  "joy": 1,
  "love": 2,
  "anger": 3,
  "fear": 4,
  "surprise": 5
}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=6, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [22]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [23]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [24]:
# tokenize training and validation datasets
tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/71634 [00:00<?, ? examples/s]

Map:   0%|          | 0/83362 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 71634
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 83362
    })
})

In [25]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [26]:
# import accuracy evaluation metric
!pip install scikit-learn
accuracy = evaluate.load("accuracy")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [27]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

In [None]:
# define list of examples
text_list = ["i just feel really helpless and heavy hearted", "ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the end of uni and the expo i have lately started to find myself feeling a bit listless which is never really a good thing", "i gave up my internship with the dmrg and am feeling distraught", "i dont know i feel so lost", "i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i suffered from anxiety for weeks as i did not want to carry on with my work studies were the only alternative"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

### Train model

In [29]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.1,
                        target_modules = ['q_lin'])

In [30]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [31]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 632,070 || all params: 67,590,156 || trainable%: 0.9351509708011326


In [32]:
# hyperparameters
#lr = 1e-3
lr = 5e-5
batch_size = 4
num_epochs = 10

In [33]:
# define training arguments
from transformers import get_scheduler

training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    lr_scheduler_type='linear',  # or 'cosine_with_restarts', or another scheduler
    warmup_ratio=0.1,  
)

In [None]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


### Generate prediction

In [24]:
model.to('cuda') # moving to mps for Mac (can alternatively do 'cpu')
text_list = ["i just feel really helpless and heavy hearted", "ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the end of uni and the expo i have lately started to find myself feeling a bit listless which is never really a good thing", "i gave up my internship with the dmrg and am feeling distraught", "i dont know i feel so lost", "i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i suffered from anxiety for weeks as i did not want to carry on with my work studies were the only alternative"]

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
i just feel really helpless and heavy hearted - sadness
ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the end of uni and the expo i have lately started to find myself feeling a bit listless which is never really a good thing - sadness
i gave up my internship with the dmrg and am feeling distraught - sadness
i dont know i feel so lost - sadness
i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i suffered from anxiety for weeks as i did not want to carry on with my work studies were the only alternative - fear


# Verify model accuracy with 1000 tweet

In [60]:
from functools import reduce

model.to('cuda') # moving to mps for Mac (can alternatively do 'cpu')
test_df = dataframe.iloc[-1000: ]#dataframe.rename(columns={"Unnamed: 0": "index", "text": "text", "label": "label"}, inplace = True)
print('Head of Testing dataset:')
print(test_df.head(5))

def map_get_predictions(text):
    # Convert to CUDA tensors if using GPU
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
    with torch.no_grad():  # Ensure no gradients are calculated during inference
        logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    return predictions.item()

# Define the reduce function to calculate accuracy
def reduce_calculate_accuracy(correct, total):
    accuracy = correct / total
    return accuracy

# Apply the map function to the text column to get a list of predictions
predicted_labels = list(map(map_get_predictions, test_df['text']))

# Pair up the predicted labels with the actual labels
paired_labels = zip(predicted_labels, test_df['label'])

# Use the reduce function to count how many predictions were correct
correct_predictions = reduce(lambda acc, label_pair: acc + (label_pair[0] == label_pair[1]), paired_labels, 0)

# Calculate the accuracy
accuracy = reduce_calculate_accuracy(correct_predictions, len(test_df))

print(f"Accuracy: {accuracy}")

Head of Testing dataset:
        Unnamed: 0                                               text  label
415809      415809  i feel confident in saying that all relationsh...      1
415810      415810                        i feel blank when you blank      0
415811      415811  i were to get bad news i would not want to fee...      0
415812      415812  i laid on my bed and tried to hide my feelings...      1
415813      415813  im feeling pretty heartbroken for them and can...      0
Accuracy: 0.932
