# Note: this notebook fine-tunes the pretrained BERT base model from Hugging Face on the dataset (transfer learning); training takes about 2 hours on a T4 GPU.

In [None]:
!wget "https://raw.githubusercontent.com/joyarup/Twitter-Sentiment-Analysis/main/Sentiment140.tenPercent.sample.tweets.tsv"

--2024-02-24 10:41:01--  https://raw.githubusercontent.com/joyarup/Twitter-Sentiment-Analysis/main/Sentiment140.tenPercent.sample.tweets.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12507818 (12M) [text/plain]
Saving to: ‘Sentiment140.tenPercent.sample.tweets.tsv’


2024-02-24 10:41:01 (173 MB/s) - ‘Sentiment140.tenPercent.sample.tweets.tsv’ saved [12507818/12507818]



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# Load dataset
# The wget cell saves the TSV into the current working directory, so read it from
# there instead of hard-coding Colab's /content mount point.
file_path = 'Sentiment140.tenPercent.sample.tweets.tsv'
data = pd.read_csv(file_path, sep='\t', header=0)  # first row holds the column names

# Basic text cleaning and preprocessing function
import html

# Patterns are compiled once because preprocess_text runs once per tweet.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
_DIGITS_RE = re.compile(r'\d+')
_NON_WORD_RE = re.compile(r'[^\w\s]')

# Filled lazily on first use so the NLTK corpus is read once, not once per tweet.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(s, stop_words=None):
    """Normalise one tweet into a lowercase, space-separated string of tokens.

    Steps, in order: strip literal HTML tags, decode HTML entities, remove URLs,
    remove @mentions and #hashtags, remove digits, remove punctuation, lowercase,
    and drop stop words.

    Args:
        s: The raw tweet text. ``None`` and NaN are treated as an empty tweet;
            any other non-string value is converted with ``str``.
        stop_words: Optional collection of lowercase words to drop. When ``None``
            (the default) NLTK's English stop-word list is used.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    # Missing values would otherwise raise TypeError inside re.sub.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    if not isinstance(s, str):
        s = str(s)

    # Remove HTML tags first, while '<' and '>' are still literal markup; doing this
    # after entity decoding would let a decoded "&lt;3 ... &gt;" be eaten as a tag.
    s = _HTML_TAG_RE.sub('', s)
    # Decode entities (&quot;, &amp;, &#39;, ...) so that they do not survive as
    # bogus tokens such as "quot" once punctuation is stripped. This must precede
    # hashtag removal, which would otherwise match the "#39" inside "&#39;".
    s = html.unescape(s)
    # Remove URLs
    s = _URL_RE.sub('', s)
    # Remove mentions and hashtags
    s = _MENTION_HASHTAG_RE.sub('', s)
    # Remove digits and special characters
    s = _DIGITS_RE.sub('', s)
    s = _NON_WORD_RE.sub('', s)
    # Lowercase
    s = s.lower()
    # Remove stopwords (optional, based on testing with/without this step)
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(w for w in s.split() if w not in stop_words)

# Clean every tweet; the tweet_text column is overwritten with the cleaned text
data['tweet_text'] = data['tweet_text'].map(preprocess_text)

# Features are the cleaned tweets; targets are the sentiment labels cast to int
X = data['tweet_text']
y = data['sentiment_label'].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Display the DataFrame after its tweet_text column has been replaced by the cleaned text.
data

Unnamed: 0,sentiment_label,tweet_text
0,4,hey dear happy friday already rices bowl lunch
1,4,ughhh layin downnnn waiting zeina cook breakfast
2,0,reckon hell play even hes know nothing wont wi...
3,0,know saw news
4,0,sad closed one web services ive used years
...,...,...
159995,0,still panerastudying quotmockquot board exams ...
159996,0,insomnia control tonighthavent slept wink
159997,4,take pride
159998,4,heading work


### Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# TF-IDF features feed a multinomial Naïve Bayes classifier, chained in one Pipeline
model_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out tweets
predictions_nb = model_nb.fit(X_train, y_train).predict(X_test)

# Report the share of test tweets classified correctly
accuracy_nb = accuracy_score(y_true=y_test, y_pred=predictions_nb)
print(f"Accuracy of Naïve Bayes model: {accuracy_nb:.4f}")


Accuracy of Naïve Bayes model: 0.7537


In [None]:
! pip install -U accelerate
! pip install -U transformers
import accelerate

accelerate.__version__


Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1


'0.27.2'

In [None]:
import torch

In [None]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous. That is a
# debugging aid (errors surface at the failing call) and normally slows GPU work —
# confirm it is still wanted for the full training run.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the raw sentiment labels onto consecutive integers 0..n_classes-1
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# `y_encoded` is what the later split and training steps should consume


In [None]:
# Re-split using the encoded labels; X_train / X_test / y_train / y_test are rebound here
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

### BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define a PyTorch dataset
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Load the tokenizer for the uncased BERT base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each split: truncate to at most 512 tokens and pad the sequences
train_encodings = tokenizer(
    X_train.tolist(),
    max_length=512,
    padding=True,
    truncation=True,
)
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=512,
    padding=True,
    truncation=True,
)

# Wrap encodings and labels as datasets
test_dataset = SentimentDataset(test_encodings, y_test.tolist())
train_dataset = SentimentDataset(train_encodings, y_train.tolist())

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Size the classification head from the number of distinct labels in `y`
num_labels = y.nunique()

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)


# Bundle the model, its training configuration and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; the returned training summary is shown as the cell output
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6822
20,0.7063
30,0.7039
40,0.7104
50,0.6899
60,0.6905
70,0.6717
80,0.6904
90,0.6847
100,0.6631


TrainOutput(global_step=24000, training_loss=0.3860885711014271, metrics={'train_runtime': 9494.2636, 'train_samples_per_second': 40.445, 'train_steps_per_second': 2.528, 'total_flos': 2.525866131456e+16, 'train_loss': 0.3860885711014271, 'epoch': 3.0})

In [None]:
pip install datasets


Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.17.1 dill-0.3.8 multiprocess-0.70.16


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EvalPrediction


def compute_metrics(p: EvalPrediction):
    """Turn raw evaluation predictions into accuracy, precision, recall and F1.

    Accuracy comes from scikit-learn's ``accuracy_score`` (already imported here)
    rather than ``datasets.load_metric``, which newer ``datasets`` releases no
    longer provide and which fetched a metric script at call time.

    Args:
        p: Evaluation output; ``p.predictions`` is reduced with ``argmax`` over
            axis 1 and compared against ``p.label_ids``.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # Predicted class = index of the largest score along axis 1
    preds = np.argmax(p.predictions, axis=1)
    accuracy = float(accuracy_score(p.label_ids, preds))
    # average='binary' scores the positive class of a two-class problem
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}



In [None]:
# The Trainer in this notebook is constructed without a metrics callback, so evaluate()
# on its own reports only the loss. Attach compute_metrics first so that accuracy,
# precision, recall and F1 are part of the returned results.
trainer.compute_metrics = compute_metrics
results = trainer.evaluate()
print(results)

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.2821,,,,,
20,0.2453,,,,,
26,0.2453,0.567061,0.791844,0.798135,0.781223,0.789588


{'eval_loss': 0.5670613050460815, 'eval_accuracy': 0.79184375, 'eval_precision': 0.7981352576792898, 'eval_recall': 0.781222652831604, 'eval_f1': 0.7895884006696782}
