In [None]:
!pip install transformers datasets



In [None]:
from datasets import load_dataset
import pandas as pd

# Download (or read from the local cache) the 'sentiment' configuration of TweetEval.
# The result is a DatasetDict with 'train', 'test' and 'validation' splits.
dataset = load_dataset("tweet_eval", "sentiment")

# Materialise the train split as a pandas DataFrame for exploratory analysis.
# `Dataset.to_pandas()` is the dedicated conversion provided by the datasets library,
# so the frame is not assembled through a generic Python-level pass over the split.
df = dataset['train'].to_pandas()


Downloading readme:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.78M/3.78M [00:02<00:00, 1.50MB/s]
Downloading data: 100%|██████████| 901k/901k [00:02<00:00, 427kB/s]
Downloading data: 100%|██████████| 167k/167k [00:01<00:00, 93.0kB/s]


Generating train split:   0%|          | 0/45615 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12284 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Display the DatasetDict summary: the available splits with their features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [None]:


import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Tally the training examples per sentiment label ('df' is the train DataFrame built earlier)
labels = df['label']
label_counter = Counter(labels)

# Draw one bar per label on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=list(label_counter), y=list(label_counter.values()), ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Samples')
plt.show()


In [None]:
# Visualize word counts with a word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Concatenate every training tweet into one space-separated string
# ('dataset' must already have been loaded by an earlier cell)
all_text = ' '.join(dataset['train']['text'])

# Build the word cloud on an 800x400 white canvas
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_text)

# Show the image without axes; bilinear interpolation gives a smoother appearance
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
from transformers import AutoModelForSequenceClassification, BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint, used to turn raw text into model inputs
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

# The same checkpoint wrapped for sequence classification with three output labels
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=3,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples, max_length=None):
    """Tokenize a batch of examples with the notebook-level `tokenizer`.

    Args:
        examples: Batch that exposes the raw strings under the 'text' key
            (this is how the function is invoked via `dataset.map(..., batched=True)`).
        max_length: Optional length to pad and truncate every sequence to.
            The default `None` is forwarded unchanged, so the tokenizer falls back
            to the model's maximum input length, exactly as before this parameter
            existed. Short texts such as tweets can usually use a much smaller
            value (e.g. via `dataset.map(..., fn_kwargs={'max_length': 128})`) to
            cut padding and compute -- verify the length distribution first.

    Returns:
        The tokenizer output for the batch.
    """
    # `padding='max_length'` pads all sequences to one common length for batching;
    # `truncation=True` cuts sequences that exceed that length.
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split of the DatasetDict, handing the examples to the function in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Work on small fixed-size subsets so that experiments finish quickly.
# A fixed shuffle seed keeps the selected examples reproducible.
shuffled_train = tokenized_datasets['train'].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(4000))

# Evaluation uses 1,000 shuffled examples taken from the 'test' split.
shuffled_test = tokenized_datasets['test'].shuffle(seed=42)
small_eval_dataset = shuffled_test.select(range(1000))


Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
!pip install transformers[torch] accelerate -U

Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.29.2


In [None]:
import os

# Disable the Weights & Biases integration so that training does not automatically track
# and log this run to W&B servers.
# NOTE(review): the training output in this notebook reports that this environment variable is
# deprecated (to be removed in v5) and recommends controlling integrations via `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"



In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np

# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # Where artefacts are written
    output_dir='./results',           # model checkpoints
    logging_dir='./logs',             # training logs
    # Schedule
    num_train_epochs=3,               # full passes over the training subset
    evaluation_strategy='epoch',      # run evaluation once per finished epoch
    # Batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation
    warmup_steps=500,                 # learning-rate warmup steps
    weight_decay=0.01,                # weight-decay regularisation
)

def _accuracy_metric(eval_pred):
    """Return the share of examples whose arg-max prediction equals the reference label."""
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": (predicted_classes == eval_pred.label_ids).mean()}


# Build the Trainer that drives the training loop and the evaluation passes
trainer = Trainer(
    model=model,                        # classification model loaded earlier
    args=training_args,                 # configuration defined above
    train_dataset=small_train_dataset,  # training subset
    eval_dataset=small_eval_dataset,    # evaluation subset
    compute_metrics=_accuracy_metric,   # reports accuracy at every evaluation
)

# Run the fine-tuning loop with the arguments and datasets configured above;
# the returned TrainOutput (global step, training loss, runtime metrics) is shown as the cell result
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  deprecated_dl_args["use_seedable_sampler"] = use_seedable_sampler


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2362,1.658228,0.612
2,0.1908,1.718583,0.642
3,0.1145,1.93012,0.661


TrainOutput(global_step=1500, training_loss=0.18052713012695312, metrics={'train_runtime': 723.5471, 'train_samples_per_second': 16.585, 'train_steps_per_second': 2.073, 'total_flos': 3157361012736000.0, 'train_loss': 0.18052713012695312, 'epoch': 3.0})

In [None]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset (the 1,000-example subset
# drawn from the test split) and display the resulting metrics dictionary
trainer.evaluate()


{'eval_loss': 1.9301201105117798,
 'eval_accuracy': 0.661,
 'eval_runtime': 18.5206,
 'eval_samples_per_second': 53.994,
 'eval_steps_per_second': 6.749,
 'epoch': 3.0}