In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# !apt install git-lfs

# DistilBERT Model - Text Classification

In [None]:
# Install (or upgrade to the latest release of) the libraries used for training.
!pip install -U accelerate
!pip install -U transformers

In [None]:
# importing datasets and tokenizer.
!pip install datasets
from datasets import load_dataset
from transformers import DistilBertTokenizer

# Download the emotion dataset from the Hugging Face Hub.
dataset = load_dataset('dair-ai/emotion')

# Tokenizer matching the pre-trained DistilBERT checkpoint (this is not the model itself).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(example):
  """Tokenize the "text" field of a batch of examples.

  Sequences longer than 512 tokens are truncated. With padding=True the
  shorter sequences are padded to the longest sequence in the batch passed
  in, not to max_length.
  """
  texts = example["text"]
  return tokenizer(texts, padding=True, truncation=True, max_length=512)


# Run the tokenizer over the dataset, handing it examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [5]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return a DistilBERT sequence classifier with a six-class head.

  The encoder weights come from the pre-trained 'distilbert-base-uncased'
  checkpoint. The Trainer receives this function as `model_init`.
  """
  classifier = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=6,
  )
  return classifier

In [6]:
import inspect

model_name = "distilbert-finetuned-emotion"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# This notebook installs the latest transformers, so use whichever keyword the
# installed TrainingArguments actually accepts.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Training arguments.
training_args = TrainingArguments(
  output_dir=model_name, # Directory for saving outputs
  learning_rate=2.8743538823133815e-05, # Learning rate for optimization
  seed=1, # Random seed, fixed for reproducibility
  per_device_train_batch_size=4, # Batch size for training
  per_device_eval_batch_size=64, # Batch size for evaluation
  num_train_epochs=3, # Number of training epochs
  weight_decay=0.01, # Weight decay for regularization
  save_strategy="epoch", # A checkpoint is saved at the end of each epoch
  **{eval_strategy_key: "epoch"}, # Evaluation is done at the end of each epoch
)

In [None]:
def compute_accuracy(eval_pred):
  """Return {"accuracy": ...} comparing arg-max predictions with the true labels."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}


# Hugging Face Trainer: trains on the train split, evaluates on the validation split.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation'],
  compute_metrics=compute_accuracy,
  tokenizer=tokenizer,
)

In [None]:
# Give each tokenized split its own name; later cells reuse these.
training_set = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']
eval_dataset = tokenized_datasets['validation']

# Baseline metrics, measured before trainer.train() has been called.
eval_training = trainer.evaluate(training_set)
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the untrained-state metrics for every split.
for split_label, split_metrics in (
  ("Training: ", eval_training),
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# Predicted emotions before training:
from transformers import DistilBertTokenizer, DistilBertConfig, DistilBertModel
from transformers import DistilBertForSequenceClassification
import torch
import torch.nn.functional as F

# Load pre-trained DistilBERT with a freshly initialised classification head.
# num_labels must be 6 to match emotion_mapping below; the config default of 2
# would restrict the predictions to the first two labels.
model_path = 'distilbert-base-uncased'
config = DistilBertConfig.from_pretrained(model_path, num_labels=6)
model_saved = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)

# First 5 test texts (slice the rows first so only 5 examples are read).
inputs = test_dataset[:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model_saved(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# Emotions predicted in the untrained state.
print("Predicted emotions:")
print(predicted_emotions)

In [None]:
! pip install optuna
! pip install ray[tune]

# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [None]:
# Fine-tune the model on the train split with the settings held by the trainer.
trainer.train() # training on the dataset using trainer

In [None]:
# Measure the fine-tuned model on the test and validation splits.
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the post-training metrics.
for split_label, split_metrics in (
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): this presumably requires Hub credentials, and the notebook_login()
# cell at the top of the notebook is commented out — confirm before running.
trainer.push_to_hub()

In [None]:
# Trained state evaluation
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the fine-tuned checkpoint and its tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/distilbert-finetuned-emotion")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/distilbert-finetuned-emotion")

# First 5 test texts (slice the rows first so only 5 examples are read).
inputs = test_dataset[:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# Printing the predicted emotions
print("Predicted emotions:")
print(predicted_emotions)

# BERT Model - Text Classification

In [13]:
# Name of the pre-trained BERT checkpoint; the tokenizer below is loaded from it.
bert_model = 'bert-base-uncased'

In [None]:
# installing and importing libraries

!pip install datasets
from datasets import load_dataset
from transformers import BertTokenizer

# Download the emotion dataset from the Hugging Face Hub.
dataset = load_dataset('dair-ai/emotion')

# Tokenizer matching the pre-trained BERT checkpoint (this is not the model itself).
tokenizer = BertTokenizer.from_pretrained(bert_model)


def tokenize_function(example):
  """Tokenize the "text" field of a batch of examples.

  Sequences longer than 512 tokens are truncated. With padding=True the
  shorter sequences are padded to the longest sequence in the batch passed
  in, not to max_length.
  """
  texts = example["text"]
  return tokenizer(texts, padding=True, truncation=True, max_length=512)


# Run the tokenizer over the dataset, handing it examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Show the first 5 test instances as a pandas DataFrame.
# with_format returns a formatted view, so `dataset` itself keeps its original
# format (set_format would have switched every split to pandas in place).
df = dataset['test'].with_format('pandas')[:5]
print(df)

In [16]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return a BERT sequence classifier with a six-class head.

  The encoder weights come from the pre-trained 'bert-base-uncased'
  checkpoint. The Trainer receives this function as `model_init`.
  """
  classifier = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
  )
  return classifier

In [17]:
import inspect

bert_fine_tuned = "bert-finetuned-emotion"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# This notebook installs the latest transformers, so use whichever keyword the
# installed TrainingArguments actually accepts.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Training arguments.
training_args = TrainingArguments(
  output_dir=bert_fine_tuned, # Directory for saving outputs
  learning_rate=8.468317724172667e-05, # Learning rate for optimization
  seed=39, # Random seed, fixed for reproducibility
  per_device_train_batch_size=64, # Batch size for training
  per_device_eval_batch_size=64, # Batch size for evaluation
  num_train_epochs=2, # Number of training epochs
  weight_decay=0.01, # Weight decay for regularization
  save_strategy="epoch", # A checkpoint is saved at the end of each epoch
  **{eval_strategy_key: "epoch"}, # Evaluation is done at the end of each epoch
)

In [None]:
def compute_accuracy(eval_pred):
  """Return {"accuracy": ...} comparing arg-max predictions with the true labels."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}


# Hugging Face Trainer: trains on the train split, evaluates on the validation split.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation'],
  compute_metrics=compute_accuracy,
  tokenizer=tokenizer,
)

In [None]:
# Give each tokenized split its own name; later cells reuse these.
training_set = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']
eval_dataset = tokenized_datasets['validation']

# Baseline metrics, measured before trainer.train() has been called.
eval_training = trainer.evaluate(training_set)
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the untrained-state metrics for every split.
for split_label, split_metrics in (
  ("Training: ", eval_training),
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# Predicted emotions before training:
from transformers import DistilBertTokenizer, DistilBertConfig, DistilBertModel
from transformers import BertConfig, BertForSequenceClassification
import torch
import torch.nn.functional as F

# Load pre-trained BERT with a freshly initialised classification head.
# The BERT classes are required here: 'bert-base-uncased' is a BERT checkpoint
# and does not match the DistilBERT architecture. num_labels must be 6 to match
# emotion_mapping below (the config default is 2).
model_path = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_path, num_labels=6)
model_saved = BertForSequenceClassification.from_pretrained(model_path, config=config)

# First 5 test texts (slice the rows first so only 5 examples are read).
inputs = test_dataset[:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model_saved(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# Printing the predicted emotions
print("Predicted emotions:")
print(predicted_emotions)

In [None]:
! pip install optuna
! pip install ray[tune]

# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [None]:
# Fine-tune the model on the train split with the settings held by the trainer.
trainer.train() # training on the dataset using trainer

In [None]:
# Measure the fine-tuned model on the test and validation splits.
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the post-training metrics.
for split_label, split_metrics in (
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): this presumably requires Hub credentials, and the notebook_login()
# cell at the top of the notebook is commented out — confirm before running.
trainer.push_to_hub()

In [None]:
# trained state evaluation

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the fine-tuned checkpoint and its tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/bert-finetuned-emotion")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/bert-finetuned-emotion")

# First 5 test texts (slice the rows first so only 5 examples are read).
inputs = test_dataset[:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# print predicted emotions
print("Predicted emotions:")
print(predicted_emotions)

# ALBERT Model - Text Classification

In [24]:
# Name of the pre-trained ALBERT checkpoint; the tokenizer below is loaded from it.
albert_model = 'albert-base-v2'

In [None]:
# importing datasets and tokenizer.
!pip install datasets
from datasets import load_dataset
from transformers import AlbertTokenizer

# Download the emotion dataset from the Hugging Face Hub.
dataset = load_dataset('dair-ai/emotion')

# Tokenizer matching the pre-trained ALBERT checkpoint (this is not the model itself).
tokenizer = AlbertTokenizer.from_pretrained(albert_model)


def tokenize_function(example):
  """Tokenize the "text" field of a batch of examples.

  Sequences longer than 512 tokens are truncated. With padding=True the
  shorter sequences are padded to the longest sequence in the batch passed
  in, not to max_length.
  """
  texts = example["text"]
  return tokenizer(texts, padding=True, truncation=True, max_length=512)


# Run the tokenizer over the dataset, handing it examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [26]:
from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return an ALBERT sequence classifier with a six-class head.

  The encoder weights come from the pre-trained 'albert-base-v2'
  checkpoint. The Trainer receives this function as `model_init`.
  """
  classifier = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=6,
  )
  return classifier

In [None]:
# Untrained state evaluation

from transformers import AlbertTokenizer, AlbertConfig, AlbertModel
from transformers import AlbertForSequenceClassification
import torch
import torch.nn.functional as F

# Load pre-trained ALBERT with a freshly initialised classification head.
# num_labels must be 6 to match emotion_mapping below; the config default of 2
# would restrict the predictions to the first two labels.
model_path = 'albert-base-v2'
config = AlbertConfig.from_pretrained(model_path, num_labels=6)
model_saved = AlbertForSequenceClassification.from_pretrained(model_path, config=config)


# First 5 test texts. Read them from tokenized_datasets directly: test_dataset
# is only assigned further down in this section, so using it here would depend
# on a variable left over from an earlier section.
inputs = tokenized_datasets['test'][:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model_saved(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# Emotions predicted in the untrained state.
print("Predicted emotions:")
print(predicted_emotions)

In [28]:
import inspect

albert_fine_tuned = "AlBert-finetuned-emotion"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# This notebook installs the latest transformers, so use whichever keyword the
# installed TrainingArguments actually accepts.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Training arguments.
training_args = TrainingArguments(
  output_dir=albert_fine_tuned, # Directory for saving outputs
  learning_rate=3.069458879876956e-05, # Learning rate for optimization
  # learning_rate': 3.069458879876956e-05, 'num_train_epochs': 5, 'seed': 22, 'per_device_train_batch_size': 32
  # NOTE(review): the recorded values above list a train batch size of 32, but 16 is used below — confirm which is intended.
  seed=22, # Random seed, fixed for reproducibility
  per_device_train_batch_size=16, # Batch size for training
  per_device_eval_batch_size=16, # Batch size for evaluation
  num_train_epochs=5, # Number of training epochs
  weight_decay=0.01, # Weight decay for regularization
  # load_best_model_at_end=True,
  save_strategy="epoch", # A checkpoint is saved at the end of each epoch
  **{eval_strategy_key: "epoch"}, # Evaluation is done at the end of each epoch
)

In [None]:
def compute_accuracy(eval_pred):
  """Return {"accuracy": ...} comparing arg-max predictions with the true labels."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}


# Hugging Face Trainer: trains on the train split, evaluates on the validation split.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation'],
  compute_metrics=compute_accuracy,
  tokenizer=tokenizer,
)

In [None]:
# Give each tokenized split its own name; later cells reuse these.
training_set = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']
eval_dataset = tokenized_datasets['validation']

# Baseline metrics, measured before trainer.train() has been called.
eval_training = trainer.evaluate(training_set)
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the untrained-state metrics for every split.
for split_label, split_metrics in (
  ("Training: ", eval_training),
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# installing libraries
! pip install optuna
! pip install ray[tune]

# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [None]:
# Fine-tune the model on the train split with the settings held by the trainer.
trainer.train() # training on the dataset using trainer

In [None]:
# Measure the fine-tuned model on the test and validation splits.
eval_test = trainer.evaluate(test_dataset)
eval_validation = trainer.evaluate(eval_dataset)

# Report the post-training metrics.
for split_label, split_metrics in (
  ("Testing: ", eval_test),
  ("Validation: ", eval_validation),
):
  print(split_label, split_metrics)

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): this presumably requires Hub credentials, and the notebook_login()
# cell at the top of the notebook is commented out — confirm before running.
trainer.push_to_hub() # saving the model.

In [None]:
# Trained state evaluation

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the fine-tuned checkpoint and its tokenizer from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/AlBert-finetuned-emotion")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/AlBert-finetuned-emotion")

# First 5 test texts (slice the rows first so only 5 examples are read).
inputs = test_dataset[:5]['text']

# Tokenize into PyTorch tensors. The full encoding is kept so the attention
# mask reaches the model and the padding tokens are ignored.
encoded_inputs = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Inference only: no gradients are needed.
with torch.no_grad():
  outputs = model(**encoded_inputs)

# Raw, unnormalised class scores.
logits = outputs.logits

# Softmax turns the scores into per-class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class for each input.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its emotion name.
emotion_mapping = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

# Translate the predicted class ids into emotion names.
predicted_emotions = [emotion_mapping[label.item()] for label in predicted_labels]

# Printing the predicted emotions
print("Predicted emotions:")
print(predicted_emotions)