In [1]:
!pip install transformers datasets pandas torch scikit-learn
!pip uninstall torch -y
!pip install torch torchvision torchaudio


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113


In [2]:
import torch

# Select device (use CPU if MPS or CUDA are unavailable)
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load Custom Dataset
custom_path = "/content/drive/MyDrive/base_data_full.csv"
custom_data = pd.read_csv(custom_path)

# Ensure correct data types
custom_data = custom_data.dropna()  # Remove missing values
custom_data['sentence'] = custom_data['sentence'].astype(str)  # Ensure sentences are strings
custom_data['sentiment'] = custom_data['sentiment'].str.lower()  # Normalize sentiment column to lowercase

# Map sentiment to numerical labels
sentiment_mapping = {"negative": 0, "positive": 1, "neutral": 2}
custom_data['label'] = custom_data['sentiment'].map(sentiment_mapping)

# Convert to Hugging Face Dataset
custom_dataset = Dataset.from_pandas(custom_data[['sentence', 'label']])

# Load Pretrained BERTweet
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

custom_dataset = custom_dataset.map(tokenize_function, batched=True)
custom_dataset = custom_dataset.rename_column("label", "labels")
custom_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

# Split Dataset for Training and Validation
custom_dataset = custom_dataset.train_test_split(test_size=0.2)
train_dataset = custom_dataset['train']
eval_dataset = custom_dataset['test']

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./finetuned_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./finetuned_logs",
    logging_steps=10,
    save_strategy="epoch"
)

# Evaluation Metrics
def compute_metrics(pred):
    predictions = pred.predictions.argmax(-1)
    labels = pred.label_ids
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-Tune the Model
trainer.train()

# Save the Fine-Tuned Model
model.save_pretrained("/content/drive/MyDrive/finetuned_bertweet_model")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_bertweet_tokenizer")

# Evaluate Fine-Tuned Model on Filtered Reddit Dataset
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
print("Fine-Tuned Model Evaluation Results on Custom Dataset:", eval_results)


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2744 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4542,0.406871,0.87796,0.835431,0.87796,0.855814
2,0.3017,0.386744,0.883424,0.840533,0.883424,0.861264


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import random

# Load Dataset
dataset = load_dataset("tweet_eval", "sentiment")

# Sample a Small Subset of the Data
def sample_dataset(dataset, sample_size=1200):
    sampled_dataset = dataset.shuffle(seed=42).select(range(min(len(dataset), sample_size)))
    return sampled_dataset

train_sample_size = 1000
test_sample_size = 200

dataset["train"] = sample_dataset(dataset["train"], sample_size=train_sample_size)
dataset["test"] = sample_dataset(dataset["test"], sample_size=test_sample_size)

# Pretrained Model and Tokenizer
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize Dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.rename_column("label", "labels")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Split Dataset
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="SEQ_CLS"  # Sequence Classification
)
model = get_peft_model(base_model, lora_config)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./finetuned_bertweet_lora",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
)

# Evaluation Metrics
def compute_metrics(pred):
    predictions = pred.predictions.argmax(-1)
    labels = pred.label_ids
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-Tune the Model with LoRA
trainer.train()

# Save Fine-Tuned Model
peft_model_dir = "/content/drive/MyDrive/finetuned_bertweet_lora"
model.save_pretrained(peft_model_dir)  # Save LoRA adapters
tokenizer.save_pretrained(peft_model_dir)

print("Model and tokenizer saved successfully!")


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0103,1.146468,0.43,0.1849,0.43,0.258601
2,1.0101,1.165464,0.43,0.1849,0.43,0.258601
3,1.0025,1.165468,0.43,0.1849,0.43,0.258601


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model and tokenizer saved successfully!


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the Custom Dataset
custom_dataset_path = "/content/drive/MyDrive/base_data_full.csv"
custom_data = pd.read_csv(custom_dataset_path)

# Prepare the custom word set
custom_words = custom_data['word'].dropna().str.lower().unique()
custom_words_set = set(custom_words)

# Load the Reddit Dataset
reddit_path = "/content/drive/MyDrive/the-reddit-dataset-dataset-comments.csv"
reddit_data = pd.read_csv(reddit_path)

# Filter necessary columns: 'body' and 'sentiment'
reddit_data = reddit_data[['body', 'sentiment']]
reddit_data.columns = ['body', 'sentiment']  # Rename for clarity

# Clean and validate the Reddit data
reddit_data = reddit_data.dropna(subset=['body'])  # Remove rows with missing 'body'
reddit_data['body'] = reddit_data['body'].astype(str)  # Ensure all 'body' values are strings

# Filter Reddit data to include only rows with at least one word from the custom dataset
filtered_reddit_data = reddit_data[
    reddit_data['body'].str.contains('|'.join(custom_words_set), case=False, na=False)
]
print(f"Filtered Reddit dataset size: {len(filtered_reddit_data)}")

# Map sentiment to labels
filtered_reddit_data['label'] = filtered_reddit_data['sentiment'].apply(
    lambda x: 0 if x < 0 else (1 if x > 0 else 2)
)

# Convert to Hugging Face Dataset
filtered_reddit_dataset = Dataset.from_pandas(filtered_reddit_data[['body', 'label']])


# Load Tokenizer (already used for tokenization in Parts 1 & 2)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/finetuned_bertweet_tokenizer")

# Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples['body'], padding="max_length", truncation=True, max_length=128)

# Apply Tokenization
filtered_reddit_dataset = filtered_reddit_dataset.map(tokenize_function, batched=True)
filtered_reddit_dataset = filtered_reddit_dataset.rename_column("label", "labels")
filtered_reddit_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_data['body'] = reddit_data['body'].astype(str)  # Ensure all 'body' values are strings


Filtered Reddit dataset size: 26687


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reddit_data['label'] = filtered_reddit_data['sentiment'].apply(
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Map:   0%|          | 0/26687 [00:00<?, ? examples/s]

In [5]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load Custom Dataset
custom_path = "/content/drive/MyDrive/base_data_full.csv"
custom_data = pd.read_csv(custom_path)

# Ensure correct data types
custom_data = custom_data.dropna()  # Remove missing values
custom_data['sentence'] = custom_data['sentence'].astype(str)  # Ensure sentences are strings
custom_data['sentiment'] = custom_data['sentiment'].str.lower()  # Normalize sentiment column to lowercase

# Map sentiment to numerical labels
sentiment_mapping = {"negative": 0, "positive": 1, "neutral": 2}
custom_data['label'] = custom_data['sentiment'].map(sentiment_mapping)

# Convert to Hugging Face Dataset
custom_dataset = Dataset.from_pandas(custom_data[['sentence', 'label']])

# Load Pretrained BERTweet
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Low-rank matrix dimension
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA layers
    task_type=TaskType.SEQ_CLS  # Sequence classification task type
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=128)

custom_dataset = custom_dataset.map(tokenize_function, batched=True)
custom_dataset = custom_dataset.rename_column("label", "labels")
custom_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

# Split Dataset for Training and Validation
custom_dataset = custom_dataset.train_test_split(test_size=0.2)
train_dataset = custom_dataset['train']
eval_dataset = custom_dataset['test']

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./finetuned_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./finetuned_logs",
    logging_steps=10,

    save_strategy="epoch"
)

# Evaluation Metrics
def compute_metrics(pred):
    predictions = pred.predictions.argmax(-1)
    labels = pred.label_ids
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-Tune the Model with LoRA
trainer.train()

# Save the Fine-Tuned Model
model.save_pretrained("/content/drive/MyDrive/finetuned_bertweet_lora_model")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned_bertweet_lora_tokenizer")

# Evaluate Fine-Tuned Model on Custom Dataset
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
print("Fine-Tuned LoRA Model Evaluation Results on Custom Dataset:", eval_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2744 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8152,0.858986,0.519126,0.269491,0.519126,0.354798
2,0.8329,0.845765,0.519126,0.269491,0.519126,0.354798
3,0.854,0.841803,0.519126,0.269491,0.519126,0.354798


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fine-Tuned LoRA Model Evaluation Results on Custom Dataset: {'eval_loss': 0.8418030738830566, 'eval_accuracy': 0.5191256830601093, 'eval_precision': 0.2694914748126251, 'eval_recall': 0.5191256830601093, 'eval_f1': 0.3547981287101466, 'eval_runtime': 259.6072, 'eval_samples_per_second': 2.115, 'eval_steps_per_second': 0.135, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
