In [3]:
# Install necessary dependencies
!git clone https://github.com/csebuetnlp/banglabert
%cd banglabert
!pip install torch torchvision torchaudio
!pip install --upgrade datasets transformers pyarrow huggingface_hub
!bash setup.sh

Cloning into 'banglabert'...
remote: Enumerating objects: 141, done.[K
remote: Counting objects: 100% (141/141), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 141 (delta 70), reused 75 (delta 23), pack-reused 0 (from 0)[K
Receiving objects: 100% (141/141), 1.11 MiB | 3.32 MiB/s, done.
Resolving deltas: 100% (70/70), done.
/content/banglabert/banglabert
Collecting datasets
  Using cached datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.26.0-py3-none-any.whl.metadata (13 kB)
Using cached datasets-3.0.1-py3-none-any.whl (471 kB)
Using cached huggingface_hub-0.26.0-py3-none-any.whl (447 kB)
Installing collected packages: huggingface_hub, datasets
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.0.19
    Uninstalling huggingface-hub-0.0.19:
      Successfully uninstalled huggingface-hub-0.0.19
  Attempting uninstall: datasets
    Found existing installation:

Cloning into 'transformers'...
remote: Enumerating objects: 235907, done.[K
remote: Counting objects: 100% (22383/22383), done.[K
remote: Compressing objects: 100% (1591/1591), done.[K
^C


In [1]:
import pandas as pd
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import os
os.environ["WANDB_DISABLED"] = "true"

# Load the label-code -> description table from labels.json
with open('labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

# Load data from parquet file
data_df = pd.read_parquet('senti.parquet')

# Display the first few rows of the data and labels
print(data_df.head())
print(labels)

# Define a mapping from string labels to integers
label_mapping = {'SP': 0, 'WP': 1, 'NU': 2, 'WN': 3, 'SN': 4}

# Map the Polarity column to numeric labels.
# Any code that is not a key of label_mapping becomes NaN here.
data_df['Polarity'] = data_df['Polarity'].map(label_mapping)

# Drop any rows with missing values in 'Text' or 'Polarity' and say how many
# were removed, so unmapped labels do not disappear silently.
rows_before = len(data_df)
data_df = data_df.dropna(subset=['Text', 'Polarity'])
rows_dropped = rows_before - len(data_df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing text or unmapped polarity")

# map() produces a float column whenever it introduced NaN; restore integer
# class ids now that the NaN rows are gone.
data_df = data_df.astype({'Polarity': 'int64'})

# Check the processed data
print(data_df.head())

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable that accepts a text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that have a leading batch axis.
        max_length: Length every example is truncated/padded to
            (defaults to 128, the value previously hard-coded).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Tokenizer for the BanglaBERT checkpoint
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 80/20 train/validation split over the (text, label) columns, fixed seed
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *(data_df[column].tolist() for column in ('Text', 'Polarity')),
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a CustomDataset sharing the same tokenizer
train_dataset, val_dataset = (
    CustomDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# Sequence-classification model with one output per entry in label_mapping
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
)

# Freeze the encoder (model.base_model) by disabling its gradients
for encoder_param in model.base_model.parameters():
    encoder_param.requires_grad_(False)

# Replace any non-contiguous parameter storage with a contiguous copy
for weight in filter(lambda p: not p.is_contiguous(), model.parameters()):
    weight.data = weight.data.contiguous()

# Define a compute_metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a Trainer prediction object.

    Args:
        pred: Object exposing ``predictions`` (class scores, or a tuple whose
            first element holds them) and ``label_ids`` (true class ids).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (weighted average).
    """
    logits = pred.predictions
    # Predictions may arrive as a tuple of arrays when the model returns
    # more than the logits; the class scores are then the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    labels = pred.label_ids
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),  # Use weighted F1 score
    }

# Prepare training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,  # Log every 100 steps
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_steps=1000,  # Save model every 1000 steps
    # Turn off logging integrations here: the WANDB_DISABLED environment
    # variable is reported as deprecated and this is the recommended replacement.
    report_to="none",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Include the metrics function
)

# Train the model
trainer.train()

# Evaluate the model on the validation set; as the last expression of the
# cell, the metrics dict is displayed as the cell output.
trainer.evaluate()


                                 ID  \
0  7abbf5aeb011e883c0a47a5e299b371e   
1  1089584daf51e3be29f985bcb935d1fa   
2  1742705348624306b2a6e9c256213808   
3  31aa85f5fd4918a960937825b226b597   
4  37c5947791f4d1a787ba9b1111b0e87b   

                                              Text Polarity    Domain  
0  যাওয়ার সময় যেন স্বাস্থ্যমন্ত্রীকে সাথে নিয়ে যায়       NU  facebook  
1              তার আগে যদি আপনি বিদায় নিতেন স্যার,       NU  facebook  
2   রাষ্ট্রের তহবিল কি একেবারে তলানিতে গিয়ে ঠেকেছে       NU  facebook  
3            সাথে আপনাকে চিপ গেস্ট হিসেবে নিয়ে যাক       NU  facebook  
4     শ্রিপাকে ও রিমান্ডে নিয়ে জিজ্ঞাসাবাদ করা হোক       NU  facebook  
{'SP': 'Strongly Positive', 'WP': 'Weakly Positive', 'NU': 'Neutral', 'WN': 'Weakly Negative', 'SN': 'Strongly Negative'}
                                 ID  \
0  7abbf5aeb011e883c0a47a5e299b371e   
1  1089584daf51e3be29f985bcb935d1fa   
2  1742705348624306b2a6e9c256213808   
3  31aa85f5fd4918a960937825b226b597   
4  37c5947791f4d

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3935,1.34429,0.442286,0.407951
2,1.3611,1.292319,0.473571,0.460811
3,1.3759,1.281523,0.476,0.460506


{'eval_loss': 1.28152334690094,
 'eval_accuracy': 0.476,
 'eval_f1': 0.46050638543874006,
 'eval_runtime': 102.5739,
 'eval_samples_per_second': 136.487,
 'eval_steps_per_second': 17.061,
 'epoch': 3.0}