In [None]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW
from sklearn.metrics import accuracy_score
!pip install tqdm
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Read the train/test CSV files from the current working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Text and label columns become plain Python lists; the test ids stay a Series.
train_text, train_labels = train["text"].tolist(), train["label"].tolist()
test_text, test_ids = test["text"].tolist(), test["id"]

# Tokenizer that belongs to the bert-base-uncased checkpoint used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of raw strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=..., return_tensors=...)``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` tensors."""
        text, label = self.texts[index], self.labels[index]

        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) drops the leading axis of the tokenizer output so the
        # DataLoader can stack samples into a batch itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label, dtype=torch.long)
        return sample

# Hold out 20% of the labelled rows for validation; random_state pins the partition.
X_train, X_val, y_train, y_val = train_test_split(
    train_text,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in the tokenizing dataset (default max_len).
train_dataset = SentimentDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
val_dataset = SentimentDataset(texts=X_val, labels=y_val, tokenizer=tokenizer)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pre-trained encoder with a 28-way classification head, placed on the chosen device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=28,
)
model.to(device)

# AdamW over every model parameter.
optimizer = AdamW(model.parameters(), weight_decay=0.01, lr=3e-5)

# # Training Loop
# # epochs = 3
# # for epoch in range(epochs):
# #     model.train()
# #     total_loss = 0
# #     for batch in train_loader:
# #         input_ids = batch["input_ids"].to(device)
# #         attention_mask = batch["attention_mask"].to(device)
# #         labels = batch["label"].to(device)

# #         optimizer.zero_grad()
# #         outputs = model(
# #             input_ids=input_ids,
# #             attention_mask=attention_mask,
# #             labels=labels,
# #         )
# #         loss = outputs.loss
# #         total_loss += loss.item()
# #         loss.backward()
# #         optimizer.step()

# #     avg_loss = total_loss / len(train_loader)
# #     print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")



# # Training Loop
# epochs = 7
# for epoch in range(epochs):
#     model.train()
#     total_loss = 0
#     print(f"Epoch {epoch + 1}/{epochs}")

#     # Use tqdm for progress bar
#     train_loader_tqdm = tqdm(train_loader, desc="Training", unit="batch")

#     for batch in train_loader_tqdm:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         optimizer.zero_grad()
#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels,
#         )
#         loss = outputs.loss
#         total_loss += loss.item()
#         loss.backward()
#         optimizer.step()

#         # Update the tqdm description with the current loss
#         train_loader_tqdm.set_postfix(loss=loss.item())

#     avg_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

# # Validation Loop
# model.eval()
# val_preds = []
# val_labels = []
# with torch.no_grad():
#     for batch in val_loader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["label"].to(device)

#         outputs = model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#         )
#         logits = outputs.logits
#         preds = torch.argmax(logits, dim=1).cpu().numpy()
#         val_preds.extend(preds)
#         val_labels.extend(labels.cpu().numpy())

# accuracy = accuracy_score(val_labels, val_preds)
# print(f"Validation Accuracy: {accuracy:.4f}")

# Calculate Class Weights
# One "balanced" weight per distinct label value found in train_labels.
# NOTE(review): train_labels is the full labelled set (before the train/val
# split), so the weights also reflect validation rows — confirm this is intended.
class_weights = compute_class_weight(
    "balanced", classes=np.unique(train_labels), y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Build the weighted loss once; it does not change between steps.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training Loop
epochs = 7
for epoch in range(epochs):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch + 1}/{epochs}")

    # Use tqdm for progress bar
    train_loader_tqdm = tqdm(train_loader, desc="Training", unit="batch")

    for batch in train_loader_tqdm:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        # labels= is deliberately not passed: the model would compute its own
        # unweighted loss, which this loop never used.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Compute weighted loss
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        # Update the tqdm description with the current loss
        train_loader_tqdm.set_postfix(loss=batch_loss)

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

# Validation Loop
# Predictions are the argmax over the logits; no gradients are tracked.
model.eval()
val_preds = []
val_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        val_preds.extend(preds)
        val_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {accuracy:.4f}")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7


Training: 100%|██████████| 250/250 [02:36<00:00,  1.60batch/s, loss=2.67]


Epoch 1 completed. Average Loss: 2.9907
Epoch 2/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=2.27]


Epoch 2 completed. Average Loss: 2.3147
Epoch 3/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=1.59]


Epoch 3 completed. Average Loss: 1.7299
Epoch 4/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=0.964]


Epoch 4 completed. Average Loss: 1.2306
Epoch 5/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=0.624]


Epoch 5 completed. Average Loss: 0.8405
Epoch 6/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=0.628]


Epoch 6 completed. Average Loss: 0.5410
Epoch 7/7


Training: 100%|██████████| 250/250 [02:35<00:00,  1.61batch/s, loss=0.158]


Epoch 7 completed. Average Loss: 0.3535
Validation Accuracy: 0.7425


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AdamW
from sklearn.metrics import accuracy_score
!pip install tqdm
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Read the train/test CSV files from the current working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Text and label columns become plain Python lists; the test ids stay a Series.
train_text, train_labels = train["text"].tolist(), train["label"].tolist()
test_text, test_ids = test["text"].tolist(), test["id"]

# Tokenizer that belongs to the bert-base-uncased checkpoint used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of raw strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=..., return_tensors=...)``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` tensors."""
        text, label = self.texts[index], self.labels[index]

        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) drops the leading axis of the tokenizer output so the
        # DataLoader can stack samples into a batch itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label, dtype=torch.long)
        return sample

# Split data into training and validation sets
# 20% of the labelled rows are held out; random_state=42 makes the partition
# repeatable. The split is not stratified by label.
X_train, X_val, y_train, y_val = train_test_split(
    train_text, train_labels, test_size=0.2, random_state=42
)

# Define a function to train and evaluate the model
def train_and_evaluate(params, epochs=5, num_labels=28):
    """Fine-tune bert-base-uncased for one hyperparameter setting and score it.

    Reads the notebook-level ``X_train``/``y_train``/``X_val``/``y_val`` split,
    ``train_labels`` and ``tokenizer``; a fresh model is loaded on every call.

    Args:
        params: 5-tuple ``(max_len, batch_size, learning_rate, weight_decay, dropout)``.
        epochs: number of passes over the training loader.
        num_labels: size of the classification head.

    Returns:
        Validation accuracy as computed by ``accuracy_score``.

    Raises:
        ValueError: if the number of distinct values in ``train_labels`` differs
            from ``num_labels`` (the weighted loss needs one weight per class).
    """
    max_len, batch_size, learning_rate, weight_decay, dropout = params
    print(f"Training with parameters: {params}")

    # Use the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Compute class weights before the (slow) model load so a label/head
    # mismatch fails immediately instead of at the first loss computation.
    label_array = np.asarray(train_labels)
    class_weights = compute_class_weight(
        "balanced", classes=np.unique(label_array), y=label_array
    )
    if len(class_weights) != num_labels:
        raise ValueError(
            f"Expected {num_labels} distinct labels in train_labels, "
            f"found {len(class_weights)}; the weighted loss needs one weight per class."
        )
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    # Create datasets and dataloaders
    train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_len=max_len)
    val_dataset = SentimentDataset(X_val, y_val, tokenizer, max_len=max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Load pre-trained BERT model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, hidden_dropout_prob=dropout
    )
    model.to(device)

    # Optimizer and loss function
    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; kept here so training numerics are unchanged.
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
        for batch in train_loader_tqdm:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once; reused for the running total and the bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            train_loader_tqdm.set_postfix(loss=batch_loss)

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

    # Validation Loop: argmax predictions, no gradient tracking
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(val_labels, val_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy

# Candidate values for each tunable setting.
max_lens = [128, 256]
batch_sizes = [16, 32]
learning_rates = [5e-5, 3e-5, 2e-5]
weight_decays = [0.01, 0.1]
dropouts = [0.1, 0.2]

# Every combination, enumerated with max_len varying slowest and dropout fastest.
search_space = [
    (max_len, batch_size, lr, wd, dropout)
    for max_len in max_lens
    for batch_size in batch_sizes
    for lr in learning_rates
    for wd in weight_decays
    for dropout in dropouts
]

# Exhaustive search: keep the first setting that reaches the highest accuracy.
best_params, best_accuracy = None, 0

for params in search_space:
    accuracy = train_and_evaluate(params)
    if accuracy > best_accuracy:
        best_accuracy, best_params = accuracy, params

print(f"Best Parameters: {best_params}")
print(f"Best Validation Accuracy: {best_accuracy:.4f}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Training with parameters: (128, 16, 5e-05, 0.01, 0.1)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:56<00:00,  2.84it/s, loss=0.905]


Epoch 1 completed. Average Loss: 2.7629


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.80it/s, loss=1.92]


Epoch 2 completed. Average Loss: 2.0618


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.26]


Epoch 3 completed. Average Loss: 1.5795


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.507]


Epoch 4 completed. Average Loss: 1.2342


Training Epoch 5: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.643]


Epoch 5 completed. Average Loss: 0.8550
Validation Accuracy: 0.7285
Training with parameters: (128, 16, 5e-05, 0.01, 0.2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.97]


Epoch 1 completed. Average Loss: 3.1348


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.1]


Epoch 2 completed. Average Loss: 2.6640


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=3.43]


Epoch 3 completed. Average Loss: 2.4075


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.13]


Epoch 4 completed. Average Loss: 2.2969


Training Epoch 5: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.1]


Epoch 5 completed. Average Loss: 2.1018
Validation Accuracy: 0.7030
Training with parameters: (128, 16, 5e-05, 0.1, 0.1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:58<00:00,  2.80it/s, loss=2.73]


Epoch 1 completed. Average Loss: 2.6379


Training Epoch 2: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=1.31]


Epoch 2 completed. Average Loss: 1.8960


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.79]


Epoch 3 completed. Average Loss: 1.4674


Training Epoch 4: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=0.661]


Epoch 4 completed. Average Loss: 1.1047


Training Epoch 5: 100%|██████████| 500/500 [02:58<00:00,  2.80it/s, loss=0.556]


Epoch 5 completed. Average Loss: 0.8367
Validation Accuracy: 0.7200
Training with parameters: (128, 16, 5e-05, 0.1, 0.2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=3.5]


Epoch 1 completed. Average Loss: 2.7255


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.54]


Epoch 2 completed. Average Loss: 2.0547


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.695]


Epoch 3 completed. Average Loss: 1.6761


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.85]


Epoch 4 completed. Average Loss: 1.4144


Training Epoch 5: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.716]


Epoch 5 completed. Average Loss: 1.2007
Validation Accuracy: 0.7435
Training with parameters: (128, 16, 3e-05, 0.01, 0.1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=1.65]


Epoch 1 completed. Average Loss: 2.8919


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.88]


Epoch 2 completed. Average Loss: 2.0208


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.84]


Epoch 3 completed. Average Loss: 1.5131


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.11]


Epoch 4 completed. Average Loss: 1.0951


Training Epoch 5: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.01]


Epoch 5 completed. Average Loss: 0.7434
Validation Accuracy: 0.7435
Training with parameters: (128, 16, 3e-05, 0.01, 0.2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.93]


Epoch 1 completed. Average Loss: 2.8272


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.56]


Epoch 2 completed. Average Loss: 2.0522


Training Epoch 3: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.92]


Epoch 3 completed. Average Loss: 1.6487


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.319]


Epoch 4 completed. Average Loss: 1.2898


Training Epoch 5: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.14]


Epoch 5 completed. Average Loss: 0.9978
Validation Accuracy: 0.7410
Training with parameters: (128, 16, 3e-05, 0.1, 0.1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.42]


Epoch 1 completed. Average Loss: 2.6560


Training Epoch 2: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=1.33]


Epoch 2 completed. Average Loss: 1.8353


Training Epoch 3: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=0.897]


Epoch 3 completed. Average Loss: 1.3723


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=0.291]


Epoch 4 completed. Average Loss: 0.9876


Training Epoch 5: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=1.15]


Epoch 5 completed. Average Loss: 0.6494
Validation Accuracy: 0.7400
Training with parameters: (128, 16, 3e-05, 0.1, 0.2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:58<00:00,  2.80it/s, loss=3.08]


Epoch 1 completed. Average Loss: 3.0240


Training Epoch 2: 100%|██████████| 500/500 [02:57<00:00,  2.81it/s, loss=2.11]


Epoch 2 completed. Average Loss: 2.4595


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.89]


Epoch 3 completed. Average Loss: 1.9509


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.02]


Epoch 4 completed. Average Loss: 1.5905


Training Epoch 5: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.92]


Epoch 5 completed. Average Loss: 1.2656
Validation Accuracy: 0.7295
Training with parameters: (128, 16, 2e-05, 0.01, 0.1)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.65]


Epoch 1 completed. Average Loss: 2.9340


Training Epoch 2: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.22]


Epoch 2 completed. Average Loss: 2.2039


Training Epoch 3: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=1.31]


Epoch 3 completed. Average Loss: 1.6264


Training Epoch 4: 100%|██████████| 500/500 [02:58<00:00,  2.81it/s, loss=2.02]


Epoch 4 completed. Average Loss: 1.1858


Training Epoch 5:  30%|██▉       | 148/500 [00:52<02:05,  2.81it/s, loss=1.02]

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch
!pip install datasets
from datasets import Dataset

# Tokenize the whole test set in a single call; tensors are padded/truncated to 128.
test_encodings = tokenizer(
    test_text,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# Pair the id and mask tensors row by row and serve them in batches of 16.
test_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"])
test_loader = DataLoader(test_dataset, batch_size=16)

# Inference: eval mode, no gradients, argmax over the logits of each batch.
# Relies on `model` and `device` already existing in the kernel.
test_preds = []
model.eval()
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

# Write the id/label pairs as the submission file.
submission = pd.DataFrame({"id": test_ids, "label": test_preds})
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

NameError: name 'tokenizer' is not defined

In [None]:
import random

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AdamW,
    BertForSequenceClassification,
    BertTokenizer,
    MarianMTModel,
    MarianTokenizer,
)

# Fetch the NLTK corpora that the WordNet lookups below depend on.
for corpus in ("wordnet", "omw-1.4"):
    nltk.download(corpus)

# Read the train/test CSV files from the current working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Text and label columns become plain Python lists; the test ids stay a Series.
train_text, train_labels = train["text"].tolist(), train["label"].tolist()
test_text, test_ids = test["text"].tolist(), test["id"]

# Tokenizer that belongs to the bert-base-uncased checkpoint used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Data Augmentation: Synonym Replacement
def _first_distinct_synonym(word):
    """Return the first WordNet lemma for ``word`` that differs from it, or None."""
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            # WordNet joins multi-word lemmas with underscores; restore spaces
            # so the replacement reads as normal text.
            candidate = lemma.name().replace("_", " ")
            if candidate.lower() != word.lower():
                return candidate
    return None


def synonym_replacement(text, num_replacements=1):
    """Swap randomly chosen words of ``text`` for a WordNet synonym.

    Args:
        text: input string; it is split on whitespace.
        num_replacements: number of random positions to try. A position whose
            word has no differing synonym is left as is, and the same position
            may be drawn more than once.

    Returns:
        The words re-joined with single spaces. Text that contains no words
        (empty or whitespace-only) is returned unchanged.
    """
    words = text.split()
    if not words:
        # random.randint(0, -1) would raise ValueError on an empty word list.
        return text

    for _ in range(num_replacements):
        word_idx = random.randint(0, len(words) - 1)
        # The first lemma of the first synset is frequently the word itself,
        # so look for the first lemma that actually changes the text.
        replacement = _first_distinct_synonym(words[word_idx])
        if replacement is not None:
            words[word_idx] = replacement
    return " ".join(words)

# Data Augmentation: Back Translation
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so each Marian
# checkpoint is loaded once per kernel session instead of once per text.
_MARIAN_CACHE = {}


def _load_marian(checkpoint):
    """Return the cached ``(tokenizer, model)`` pair for ``checkpoint``, loading it on first use."""
    if checkpoint not in _MARIAN_CACHE:
        _MARIAN_CACHE[checkpoint] = (
            MarianTokenizer.from_pretrained(checkpoint),
            MarianMTModel.from_pretrained(checkpoint),
        )
    return _MARIAN_CACHE[checkpoint]


def _marian_translate(text, checkpoint):
    """Translate ``text`` with the given Marian checkpoint and return the decoded string."""
    mt_tokenizer, mt_model = _load_marian(checkpoint)
    encoded = mt_tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    generated = mt_model.generate(**encoded)
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translation(
    text,
    model_name="Helsinki-NLP/opus-mt-en-fr",
    back_model_name="Helsinki-NLP/opus-mt-fr-en",
):
    """Paraphrase ``text`` by translating it to a pivot language and back.

    Args:
        text: source string; input is truncated to 512 tokens on each leg.
        model_name: Marian checkpoint for the forward leg (default English to French).
        back_model_name: Marian checkpoint for the return leg (default French to
            English). It must be the reverse direction of ``model_name``.

    Returns:
        The round-tripped string decoded without special tokens.
    """
    # Local names deliberately avoid `tokenizer` / `model`, which the notebook
    # uses for the BERT objects.
    pivot_text = _marian_translate(text, model_name)
    return _marian_translate(pivot_text, back_model_name)

# Build the augmented corpus: for every labelled text keep the original, add a
# synonym-replaced copy, and add a back-translated copy when translation succeeds.
augmented_texts, augmented_labels = [], []

for text, label in zip(train_text, train_labels):
    # Untouched sample
    augmented_texts += [text]
    augmented_labels += [label]

    # Synonym-replaced variant
    augmented_texts += [synonym_replacement(text)]
    augmented_labels += [label]

    # Back-translated variant; a failure is reported and that variant is skipped
    try:
        round_trip = back_translation(text)
    except Exception as e:
        print(f"Back translation failed for text: {text}. Error: {e}")
    else:
        augmented_texts += [round_trip]
        augmented_labels += [label]

# From here on train_text / train_labels refer to the augmented corpus.
train_text, train_labels = augmented_texts, augmented_labels

# Persist the augmented corpus next to the notebook (optional).
pd.DataFrame(dict(text=train_text, label=train_labels)).to_csv("augmented_train.csv", index=False)

# Custom Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of raw strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=..., return_tensors=...)``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label`` tensors."""
        text, label = self.texts[index], self.labels[index]

        tokenizer_kwargs = dict(
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) drops the leading axis of the tokenizer output so the
        # DataLoader can stack samples into a batch itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(label, dtype=torch.long)
        return sample

# Split data into training and validation sets
# 20% of the rows are held out; random_state=42 makes the partition repeatable.
# NOTE(review): train_text at this point is the augmented corpus (originals plus
# their synonym / back-translated variants), so this random split can put
# variants of the same source sentence on both sides and inflate validation
# accuracy. Consider splitting the raw data first and augmenting only the
# training portion.
X_train, X_val, y_train, y_val = train_test_split(
    train_text, train_labels, test_size=0.2, random_state=42
)

# Define a function to train and evaluate the model
def train_and_evaluate(
    max_len=128,
    batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    dropout=0.1,
    epochs=3,
    num_labels=28,
):
    """Fine-tune bert-base-uncased on the current split and report validation accuracy.

    Reads the notebook-level ``X_train``/``y_train``/``X_val``/``y_val`` split,
    ``train_labels`` and ``tokenizer``; a fresh model is loaded on every call.
    Called without arguments it uses the settings that were previously
    hard-coded in the body.

    Args:
        max_len: length every encoded sequence is padded/truncated to.
        batch_size: batch size of both data loaders.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        dropout: value passed as ``hidden_dropout_prob`` when loading the model.
        epochs: number of passes over the training loader.
        num_labels: size of the classification head.

    Returns:
        Validation accuracy as computed by ``accuracy_score``.

    Raises:
        ValueError: if the number of distinct values in ``train_labels`` differs
            from ``num_labels`` (the weighted loss needs one weight per class).
    """
    print(f"Training with parameters: max_len={max_len}, batch_size={batch_size}")

    # Use the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Compute class weights before the (slow) model load so a label/head
    # mismatch fails immediately instead of at the first loss computation.
    label_array = np.asarray(train_labels)
    class_weights = compute_class_weight(
        "balanced", classes=np.unique(label_array), y=label_array
    )
    if len(class_weights) != num_labels:
        raise ValueError(
            f"Expected {num_labels} distinct labels in train_labels, "
            f"found {len(class_weights)}; the weighted loss needs one weight per class."
        )
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    # Create datasets and dataloaders
    train_dataset = SentimentDataset(X_train, y_train, tokenizer, max_len=max_len)
    val_dataset = SentimentDataset(X_val, y_val, tokenizer, max_len=max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Load pre-trained BERT model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels, hidden_dropout_prob=dropout
    )
    model.to(device)

    # Optimizer and loss function
    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; kept here so training numerics are unchanged.
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
        for batch in train_loader_tqdm:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once; reused for the running total and the bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            train_loader_tqdm.set_postfix(loss=batch_loss)

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} completed. Average Loss: {avg_loss:.4f}")

    # Validation Loop: argmax predictions, no gradient tracking
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(val_labels, val_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy

# Run Training
# One full fine-tune + validation pass using the function's own settings.
# As the last expression of the cell, the returned accuracy is also echoed
# by the notebook.
train_and_evaluate()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]