In [28]:
!pip install --quiet wandb accelerate>=0.20.1

In [1]:
import gc
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from torch.utils.data import Dataset , DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.optim import AdamW

from transformers import BertTokenizer, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, BertForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd
import torch
import os
import wandb
from sklearn.model_selection import train_test_split



class CustomTokenizerDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(labels)
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenize_text(text)
        return {
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(),
            'label': torch.tensor(label)
        }

    def tokenize_text(self, text):
        return self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

class CustomSentimentClassifier:
    def __init__(self, model, tokenizer, num_labels=3, early_stopping_patience=3, early_stopping_threshold=0.005):

        self.tokenizer = tokenizer
        self.num_labels = num_labels
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold
        self.model = model.to(device)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_data(self, file_path):
        data = pd.read_csv(file_path, lineterminator='\n')
        return data

    def tokenize_and_split(self, data, test_size=0.2, random_state=42):
        texts = data['review_text'].tolist()
        labels = data['label'].tolist()
        dataset = CustomTokenizerDataset(texts, labels, tokenizer=self.tokenizer)
        train_dataset, val_dataset = train_test_split(dataset, test_size=test_size, random_state=random_state)

        return train_dataset, val_dataset

    def evaluate(self, eval_dataloader):
        trainer = Trainer(model=self.model, compute_metrics=self.compute_metrics)
        results = trainer.evaluate(eval_dataloader.dataset)
        return results

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors="pt")
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        return predicted_class


    def train(self, train_dataloaders, val_dataloaders, num_train_epochs=10, per_device_train_batch_size=16, per_device_eval_batch_size=16, warmup_steps=500, weight_decay=0.01, model_identifier=""):
        # Modify model instantiation to use DataParallel
        training_args = TrainingArguments(
            output_dir=f"./sentiment_model_{model_identifier}",
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            warmup_steps=warmup_steps,
            weight_decay=weight_decay,
            logging_dir="./logs",
            save_total_limit=1,
            load_best_model_at_end=True,
            save_strategy="epoch",
            evaluation_strategy="epoch",
            report_to="wandb",
        )

        wandb.init(project="Sentiment_analysis", config=training_args)

        optimizer = AdamW(self.model.parameters(), lr=5e-6)
        num_training_steps = len(train_dataloaders) * training_args.num_train_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=200, num_training_steps=num_training_steps)

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataloaders.dataset,
            eval_dataset=val_dataloaders.dataset,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=self.early_stopping_patience, early_stopping_threshold=self.early_stopping_threshold)],
            optimizers=(optimizer, scheduler),
        )

        # Train the model
        trainer.train()

        # Save the model and metrics
        self.save_model(model_identifier)
        metrics_df = self.save_metrics(trainer, model_identifier)

        # Evaluate on the test set
        test_metrics = self.evaluate(val_dataloaders)
        print(f"Test Metrics (Model {model_identifier}): {test_metrics}")

        # Clear GPU memory
        del trainer
        torch.cuda.empty_cache()
        gc.collect()

        return metrics_df

    def compute_metrics(self, pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        accuracy = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    def save_model(self,model_identifier, output_dir="./sentiment_model"):
        model_save_dir = os.path.join(output_dir, model_identifier)
        os.makedirs(model_save_dir, exist_ok=True)

        # Save the model's state_dict
        model_state_path = os.path.join(model_save_dir, "model_state.pth")
        torch.save(self.model.state_dict(), model_state_path)

    def save_metrics(self, trainer, model_identifier):
        metrics = trainer.evaluate()
        metrics_df = pd.DataFrame(metrics, index=[0])
        metrics_csv_path = os.path.join(trainer.args.output_dir, "metrics.csv")
        metrics_df.to_csv(metrics_csv_path, index=False)
        return metrics_df



In [2]:
# Example usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = 3  # Set your number of labels

In [None]:

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
bert_classifier = CustomSentimentClassifier(model=bert_model, tokenizer=bert_tokenizer)
data = bert_classifier.load_data('/content/reviews_data.csv')
train_data, val_data = bert_classifier.tokenize_and_split(data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=8, shuffle=False)
bert_metrics = bert_classifier.train(train_dataloader, val_dataloader, model_identifier="bert_model",num_train_epochs=5)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8668,0.527663,0.774378,0.789462,0.76849,0.77059
2,0.5376,0.48123,0.797579,0.790911,0.791772,0.791247
3,0.3972,0.466287,0.811029,0.81538,0.807711,0.809338
4,0.3666,0.502993,0.817081,0.81505,0.81164,0.812656
5,0.2619,0.554972,0.815064,0.810387,0.808818,0.809211


Test Metrics (Model bert_model): {'eval_loss': 0.46628692746162415, 'eval_accuracy': 0.8110289172831204, 'eval_precision': 0.8153796463270963, 'eval_recall': 0.8077114363978771, 'eval_f1': 0.8093376388412864, 'eval_runtime': 95.2131, 'eval_samples_per_second': 31.235, 'eval_steps_per_second': 3.907}


In [None]:
torch.cuda.empty_cache()
gc.collect()

53

In [3]:

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
roberta_classifier = CustomSentimentClassifier(model=roberta_model, tokenizer=roberta_tokenizer)
data = roberta_classifier.load_data('/content/reviews_data.csv')
train_data, val_data = roberta_classifier.tokenize_and_split(data)
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=16, shuffle=False)
robert_metrics = roberta_classifier.train(train_dataloader, val_dataloader, model_identifier="roberta_model",num_train_epochs=5)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7752,0.434097,0.818763,0.826863,0.815194,0.816825
2,0.4505,0.404292,0.840282,0.837337,0.835629,0.836211
3,0.3538,0.423979,0.839274,0.837502,0.83663,0.836444
4,0.3336,0.438593,0.84499,0.840797,0.839996,0.840299
5,0.2735,0.435514,0.85037,0.846585,0.846382,0.846446


Test Metrics (Model roberta_model): {'eval_loss': 0.40429186820983887, 'eval_accuracy': 0.8402824478816409, 'eval_precision': 0.8373371412664651, 'eval_recall': 0.8356291454733227, 'eval_f1': 0.8362113345662469, 'eval_runtime': 94.318, 'eval_samples_per_second': 31.532, 'eval_steps_per_second': 3.944}
