In [15]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required by the later trainer.push_to_hub() cells - confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!apt install git-lfs

In [None]:
# installing libaries
!pip install -U accelerate
!pip install -U transformers

In [None]:
# importing datasets and tokenizer.
!pip install datasets
from datasets import load_dataset
from transformers import DistilBertTokenizer

# loading dataset
tweet_dataset = load_dataset('tweet_eval', 'irony')
# loading pre-trained model distilbert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize function takes a dataset as input, pads it based on max_length and truncates if above max_length.
def tokenize_function(example):
  return tokenizer(example["text"], padding=True, truncation=True, max_length=512)

# applying tokenized function on tha dataset in batches.
tokenized_tweet_dataset = tweet_dataset.map(tokenize_function, batched=True)

In [None]:
# Show the first 5 test instances.
# to_pandas() builds the DataFrame without calling set_format(type='pandas'),
# which would switch `tweet_dataset` itself to pandas output for every later use.
df = tweet_dataset['test'].to_pandas()
df.head()

In [6]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return a new DistilBERT sequence classifier with two labels, loaded from 'distilbert-base-uncased'."""
  classifier = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
  )
  return classifier

In [7]:
model_name = "distilbert-finetuned-tweet-eval"

# Fine-tuning configuration for the DistilBERT run.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
  output_dir=model_name,                 # output directory
  seed=13,                               # random seed
  num_train_epochs=3,                    # number of passes over the training split
  learning_rate=9.345135299518317e-05,   # optimizer learning rate
  weight_decay=0.01,                     # weight-decay regularization
  per_device_train_batch_size=64,        # training batch size per device
  per_device_eval_batch_size=64,         # evaluation batch size per device
  evaluation_strategy="epoch",           # evaluate at the end of every epoch
  save_strategy="epoch",                 # save a checkpoint at the end of every epoch
  # load_best_model_at_end=True,         # currently disabled
)

In [None]:
def _accuracy_metric(eval_pred):
  """Accuracy of the arg-max class (axis 1 of `.predictions`) against `.label_ids`."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}

# Wire model factory, arguments, data splits, metric and tokenizer into a Hugging Face Trainer.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_tweet_dataset['train'],
  eval_dataset=tokenized_tweet_dataset['validation'],
  compute_metrics=_accuracy_metric,
  tokenizer=tokenizer,
)

In [9]:
# Convenience handles for the three tokenized splits.
training_dataset, valid_dataset, test_dataset = (
  tokenized_tweet_dataset[split_name]
  for split_name in ('train', 'validation', 'test')
)

In [None]:
# Baseline metrics, measured before trainer.train() has been called.
eval_training, eval_validation, eval_testing = (
  trainer.evaluate(split)
  for split in (training_dataset, valid_dataset, test_dataset)
)

# Report the baseline metrics for each split.
for heading, metrics in (
  ("Training: ", eval_training),
  ("Validation: ", eval_validation),
  ("Testing: ", eval_testing),
):
  print(heading, metrics)

In [None]:
! pip install optuna
! pip install ray[tune]

# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [None]:
# Untrained state evaluation
from transformers import DistilBertTokenizer, DistilBertConfig, DistilBertModel
import torch
import torch.nn.functional as F

# Load the base DistilBERT checkpoint with a sequence-classification head (no fine-tuning).
model_path = 'distilbert-base-uncased'
config = DistilBertConfig.from_pretrained(model_path)
model_saved = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)

# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = model_saved(**encoded)  # passes input ids together with the attention mask

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
tweet_mapping = {
    0: "non-irony",
    1: "irony"
}

# String labels for the predicted classes.
predicted_tweets = [tweet_mapping[label.item()] for label in predicted_labels]

# Show the predictions.
print("Predicted tweets:")
print(predicted_tweets)

In [None]:
# Fine-tune on the training split that was passed to the Trainer.
trainer.train()

In [None]:
# Metrics after fine-tuning, on the validation and test splits.
eval_validation, eval_testing = (
  trainer.evaluate(split) for split in (valid_dataset, test_dataset)
)

# Report the metrics for each split.
for heading, metrics in (
  ("Validation: ", eval_validation),
  ("Testing: ", eval_testing),
):
  print(heading, metrics)

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
# NOTE(review): presumably relies on the notebook_login() session from the first cell - confirm.
trainer.push_to_hub()

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned checkpoint and its tokenizer back from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/distilbert-finetuned-tweet-eval")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/distilbert-finetuned-tweet-eval")
# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]

# Inference only, so gradient tracking is switched off.
# `torch` and `F` are imported in the earlier untrained-inference cell.
with torch.no_grad():
  outputs = model(**encoded)  # passes input ids together with the attention mask

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
irony_mapping = {
    0: "non-irony",
    1: "irony",
}

# String labels for the predicted classes.
predicted_tweets = [irony_mapping[label.item()] for label in predicted_labels]

# Show the predictions.
print("Predicted Tweets:")
print(predicted_tweets)

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

# Confusion matrix on the validation split
#____________________________________________________________________
validation_split = tokenized_tweet_dataset['validation']

# Model outputs for every validation example.
predictions = trainer.predict(validation_split)

# Reference labels stored in the dataset.
true_labels = validation_split['label']

# Predicted class = index of the largest score along axis 1.
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Wrap the matrix in a labelled DataFrame for readability.
row_names = ["True 0", "True 1"]
column_names = ["Predicted 0", "Predicted 1"]
conf_matrix_df = pd.DataFrame(conf_matrix, index=row_names, columns=column_names)

# Show the labelled matrix.
print("Confusion Matrix:")
print(conf_matrix_df)

# BERT Model Classification — Tweet_Eval


In [None]:
# importing libaries
!pip install datasets
from datasets import load_dataset
from transformers import BertTokenizer

# Loading dataset irony.
tweet_dataset = load_dataset('tweet_eval', 'irony')

# loading pre-trained model bert
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize function takes a dataset as input, pads it based on max_length and truncates if above max_length.
def tokenize_function(example):
  return tokenizer(example["text"], padding="max_length", truncation=True)

# applying tokenized function on tha dataset in batches.
tokenized_tweet_dataset = tweet_dataset.map(tokenize_function, batched=True)

In [None]:
# Show the first 5 test instances.
# select(...).to_pandas() builds the preview without calling set_format(type='pandas'),
# which would switch `tweet_dataset` itself to pandas output for every later use.
df = tweet_dataset['test'].select(range(5)).to_pandas()
print(df)

In [19]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return a new BERT sequence classifier with two labels, loaded from 'bert-base-uncased'."""
  classifier = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
  )
  return classifier

In [20]:
model_name = "bert-finetuned-tweet_eval"

# Fine-tuning configuration for the BERT run.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
  output_dir=model_name,                # output directory
  seed=16,                              # random seed
  num_train_epochs=5,                   # number of passes over the training split
  learning_rate=7.32121081228238e-05,   # optimizer learning rate
  weight_decay=0.01,                    # weight-decay regularization
  per_device_train_batch_size=8,        # training batch size per device
  per_device_eval_batch_size=16,        # evaluation batch size per device
  evaluation_strategy="epoch",          # evaluate at the end of every epoch
  save_strategy="epoch",                # save a checkpoint at the end of every epoch
  # load_best_model_at_end=True,        # currently disabled
)

In [None]:
def _accuracy_metric(eval_pred):
  """Accuracy of the arg-max class (axis 1 of `.predictions`) against `.label_ids`."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}

# Wire model factory, arguments, data splits, metric and tokenizer into a Hugging Face Trainer.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_tweet_dataset['train'],
  eval_dataset=tokenized_tweet_dataset['validation'],
  compute_metrics=_accuracy_metric,
  tokenizer=tokenizer,
)

In [None]:
! pip install optuna
! pip install ray[tune]

# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [22]:
# Convenience handles for the three tokenized splits.
training_dataset, valid_dataset, test_dataset = (
  tokenized_tweet_dataset[split_name]
  for split_name in ('train', 'validation', 'test')
)

In [None]:
# Baseline metrics, measured before trainer.train() has been called.
eval_training, eval_validation, eval_testing = (
  trainer.evaluate(split)
  for split in (training_dataset, valid_dataset, test_dataset)
)

# Report the baseline metrics for each split.
for heading, metrics in (
  ("Training: ", eval_training),
  ("Validation: ", eval_validation),
  ("Testing: ", eval_testing),
):
  print(heading, metrics)

In [None]:
# Untrained state evaluation

from transformers import BertTokenizer, BertConfig, BertModel
import torch
import torch.nn.functional as F

# Load the base BERT checkpoint with a sequence-classification head (no fine-tuning).
model_path = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_path)
model_saved = BertForSequenceClassification.from_pretrained(model_path, config=config)

# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = model_saved(**encoded)  # passes every tensor the tokenizer produced, not only the ids

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
tweet_mapping = {
    0: "non-irony",
    1: "irony"
}

# String labels for the predicted classes.
predicted_tweets = [tweet_mapping[label.item()] for label in predicted_labels]

# Show the predictions.
print("Predicted tweets:")
print(predicted_tweets)

In [None]:
# Fine-tune on the training split that was passed to the Trainer.
trainer.train()

In [None]:
# Metrics after fine-tuning, on the validation and test splits.
eval_validation, eval_testing = (
  trainer.evaluate(split) for split in (valid_dataset, test_dataset)
)

# Report the metrics for each split.
for heading, metrics in (
  ("Validation: ", eval_validation),
  ("Testing: ", eval_testing),
):
  print(heading, metrics)

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
# NOTE(review): presumably relies on the notebook_login() session from the first cell - confirm.
trainer.push_to_hub()

In [None]:
# Save the trainer's model under the path "Bert-Model_FineTuned-Tweet_Eval".
trainer.save_model("Bert-Model_FineTuned-Tweet_Eval")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned checkpoint and its tokenizer back from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/bert-finetuned-tweet_eval")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/bert-finetuned-tweet_eval")
# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]

# Inference only, so gradient tracking is switched off.
# `torch` and `F` are imported in the earlier untrained-inference cell.
with torch.no_grad():
  outputs = model(**encoded)  # passes every tensor the tokenizer produced, not only the ids

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
tweet_mapping = {
    0: "non-irony",
    1: "irony",
}

# String labels for the predicted classes. The task is irony detection, so the
# results are reported as tweets rather than "emotions".
predicted_tweets = [tweet_mapping[label.item()] for label in predicted_labels]

# Show the predictions.
print("Predicted tweets:")
print(predicted_tweets)

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

# Confusion matrix on the validation split
#____________________________________________________________________
validation_split = tokenized_tweet_dataset['validation']

# Model outputs for every validation example.
predictions = trainer.predict(validation_split)

# Reference labels stored in the dataset.
true_labels = validation_split['label']

# Predicted class = index of the largest score along axis 1.
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Wrap the matrix in a labelled DataFrame for readability.
row_names = ["True 0", "True 1"]
column_names = ["Predicted 0", "Predicted 1"]
conf_matrix_df = pd.DataFrame(conf_matrix, index=row_names, columns=column_names)

# Show the labelled matrix.
print("Confusion Matrix:")
print(conf_matrix_df)

**ALBERT** Model — Tweet_Eval Text Classification

In [None]:
# importing datasets and tokenizer.
!pip install datasets
from datasets import load_dataset
from transformers import AlbertTokenizer

# loading dataset
tweet_dataset = load_dataset('tweet_eval', 'irony')

# loading pre-trained model albert
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Tokenize function takes a dataset as input, pads it based on max_length and truncates if above max_length.
def tokenize_function(example):
  return tokenizer(example["text"], padding=True, truncation=True, max_length=160)

# applying tokenized function on tha dataset in batches.
tokenized_tweet_dataset = tweet_dataset.map(tokenize_function, batched=True)

In [46]:
from transformers import AlbertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

def model_init():
  """Return a new ALBERT sequence classifier with two labels, loaded from 'albert-base-v2'."""
  classifier = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=2,
  )
  return classifier

In [47]:
albert_fine_tuned = "AlBert-finetuned-Tweet_Eval"

# Fine-tuning configuration for the ALBERT run.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
  output_dir=albert_fine_tuned,           # output directory
  seed=25,                                # random seed
  num_train_epochs=3,                     # number of passes over the training split
  learning_rate=1.5702521904670393e-05,   # optimizer learning rate
  weight_decay=0.01,                      # weight-decay regularization
  per_device_train_batch_size=16,         # training batch size per device
  per_device_eval_batch_size=16,          # evaluation batch size per device
  evaluation_strategy="epoch",            # evaluate at the end of every epoch
  save_strategy="epoch",                  # save a checkpoint at the end of every epoch
)

In [None]:
def _accuracy_metric(eval_pred):
  """Accuracy of the arg-max class (axis 1 of `.predictions`) against `.label_ids`."""
  predicted_classes = np.argmax(eval_pred.predictions, axis=1)
  return {"accuracy": accuracy_score(eval_pred.label_ids, predicted_classes)}

# Wire model factory, arguments, data splits, metric and tokenizer into a Hugging Face Trainer.
trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=tokenized_tweet_dataset['train'],
  eval_dataset=tokenized_tweet_dataset['validation'],
  compute_metrics=_accuracy_metric,
  tokenizer=tokenizer,
)

In [49]:
# Convenience handles for the three tokenized splits.
training_dataset, validation_dataset, test_dataset = (
  tokenized_tweet_dataset[split_name]
  for split_name in ('train', 'validation', 'test')
)

In [None]:
# Baseline metrics, measured before trainer.train() has been called.
eval_training, eval_validation, eval_test = (
  trainer.evaluate(split)
  for split in (training_dataset, validation_dataset, test_dataset)
)

# Report the baseline metrics for each split.
for heading, metrics in (
  ("Training: ", eval_training),
  ("Validation: ", eval_validation),
  ("Testing: ", eval_test),
):
  print(heading, metrics)

In [None]:
# Untrained state evaluation

from transformers import AlbertTokenizer, AlbertConfig, AlbertModel
import torch
import torch.nn.functional as F

# Load the base ALBERT checkpoint with a sequence-classification head (no fine-tuning).
model_path = 'albert-base-v2'
config = AlbertConfig.from_pretrained(model_path)
model_saved = AlbertForSequenceClassification.from_pretrained(model_path, config=config)

# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = model_saved(**encoded)  # passes every tensor the tokenizer produced, not only the ids

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
tweet_mapping = {
    0: "non-irony",
    1: "irony"
}

# String labels for the predicted classes.
predicted_tweets = [tweet_mapping[label.item()] for label in predicted_labels]

# Predictions of the model before fine-tuning.
print("Predicted tweets:")
print(predicted_tweets)

In [None]:
# installing libraries
! pip install optuna
! pip install ray[tune]
# Hyper parameter search for 10 number of trials to find the maximized accuracy
eval = trainer.hyperparameter_search(n_trials=10, direction="maximize")
print(eval)

In [None]:
# Fine-tune on the training split that was passed to the Trainer.
trainer.train()

In [None]:
# Metrics after fine-tuning, on all three splits.
eval_training, eval_validation, eval_test = (
  trainer.evaluate(split)
  for split in (training_dataset, validation_dataset, test_dataset)
)

# Report the metrics for each split.
for heading, metrics in (
  ("Training: ", eval_training),
  ("Validation: ", eval_validation),
  ("Testing: ", eval_test),
):
  print(heading, metrics)

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
# NOTE(review): presumably relies on the notebook_login() session from the first cell - confirm.
trainer.push_to_hub()

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the fine-tuned checkpoint and its tokenizer back from the Hub.
tokenizer = AutoTokenizer.from_pretrained("iaminhridoy/AlBert-finetuned-Tweet_Eval")
model = AutoModelForSequenceClassification.from_pretrained("iaminhridoy/AlBert-finetuned-Tweet_Eval")


# First 5 test instances. Slicing the rows first avoids materializing the whole column.
inputs = test_dataset[:5]['text']

# Tokenize to PyTorch tensors. The whole encoding is kept: the batch is padded,
# so the attention mask is needed for the model to ignore the padding positions.
encoded = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]


# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = model(**encoded)  # passes every tensor the tokenizer produced, not only the ids

# Raw, unnormalized class scores.
logits = outputs.logits

# Softmax turns the scores into class probabilities.
probs = F.softmax(logits, dim=-1)

# Most probable class per tweet.
predicted_labels = torch.argmax(probs, dim=-1)

# Maps the integer label to its string label.
tweet_mapping = {
    0: "non-irony",
    1: "irony"
}

# String labels for the predicted classes.
predicted_irony = [tweet_mapping[label.item()] for label in predicted_labels]

# Show the predictions.
print("Predicted Irony:")
print(predicted_irony)


In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

# Confusion matrix on the validation split
#____________________________________________________________________
validation_split = tokenized_tweet_dataset['validation']

# Model outputs for every validation example.
predictions = trainer.predict(validation_split)

# Reference labels stored in the dataset.
true_labels = validation_split['label']

# Predicted class = index of the largest score along axis 1.
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Wrap the matrix in a labelled DataFrame for readability.
row_names = ["True 0", "True 1"]
column_names = ["Predicted 0", "Predicted 1"]
conf_matrix_df = pd.DataFrame(conf_matrix, index=row_names, columns=column_names)

# Show the labelled matrix.
print("Confusion Matrix:")
print(conf_matrix_df)