# Pip

In [3]:
!pip install sentencepiece -q
!pip install transformers -q
!pip install accelerate -q
!pip install peft -q
!pip install bitsandbytes -q
!pip install lightning -q

# Imports

In [None]:
!pip install flash-attn

In [4]:
import numpy as np
import pandas as pd

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)

import os
import time
import zipfile
import urllib.request
from pathlib import Path
from tqdm.auto import tqdm

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import lightning as L

from peft import (
    get_peft_config,
    get_peft_model,
    LoraConfig,
    TaskType
)

tqdm.pandas()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Get data

In [6]:
import pandas as pd
from pathlib import Path

# Define the path to your CSV file
data_file_path = Path('/content/data.csv')

# Read the CSV file
df = pd.read_csv(data_file_path, sep=",", names=["target", "text"])

# Display the first few rows of the dataframe
print(df.head())



     target                                               text
0    target                                               text
1  positive  তোমরা সবাই এই ভাই তাকে একটু  কোরোভিডিও গুলো দে...
2  positive  🎯 ব্রহ্মপুত্র নদের ওপর নির্মিত ৯.১৫ কিলোমিটারে...
3  positive         😊 ধন্যবাদ মাননীয় প্রধানমন্ত্রী শেখ হাসিনা
4   neutral                                2022 এর 25 এ জুন 🥰🥰


In [7]:
def create_balanced_dataset(df):
    # Count the instances of each target class
    target_counts = df['target'].value_counts()

    # Get the count of neutral class
    neutral_count = target_counts['neutral']

    # Create subsets for each target class, limiting the size of 'positive' and 'negative' to match 'neutral'
    positive_subset = df[df['target'] == 'positive'].sample(neutral_count, random_state=123)
    negative_subset = df[df['target'] == 'negative'].sample(neutral_count, random_state=123)
    neutral_subset = df[df['target'] == 'neutral']  # Use all instances of neutral

    # Combine the subsets to create a balanced dataset
    balanced_df = pd.concat([positive_subset, negative_subset, neutral_subset])

    return balanced_df

# Call the function and print the value counts of the balanced dataset
balanced_df = create_balanced_dataset(df)
print(balanced_df['target'].value_counts())


target
positive    1767
negative    1767
neutral     1767
Name: count, dtype: int64


In [8]:
balanced_df['target'] = df['target'].map({'positive': 1, 'neutral': 0, 'negative': 2})

In [15]:
# Save the balanced dataframe to a CSV file #FOR DOWNLOAD PURPOSE !! CHECKED
balanced_df.to_csv('balanced_dataset.csv', index=False)

# If you're in a notebook environment, you can download the file with the following:
from google.colab import files
files.download('balanced_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

# Load tokenizer

In [11]:
# Initialize tokenizer with model ID and authentication token
model_id = 'h2oai/h2o-danube-1.8b-chat'
hf_token = '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' # Replace your token here on huggingface

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set padding token to end-of-sequence token and configure padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

# Split data

In [12]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('validation.csv')

In [13]:
train['text'] = tokenizer.bos_token + train['text']
test['text'] = tokenizer.bos_token + test['text']
val['text'] = tokenizer.bos_token + val['text']

In [15]:
sample = tokenizer(train.text[0], add_special_tokens=False).input_ids
tokenizer.decode(sample)

'<s> দুনীতিতে সব শেষ'

# Dataset and DataLoader

In [16]:
class CustomDataset(Dataset):
    def __init__(self, texts, targets):
        self.texts = texts
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        return text, target

    def __len__(self):
        return len(self.targets)

In [17]:
# Set seed for reproducibility
L.seed_everything(seed=252)

# Create train dataset and dataloader
train_dataset = CustomDataset(
    texts=train['text'].values.tolist(),
    targets=train['target'].values.tolist()
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    drop_last=True
)

# Create test dataset and dataloader
test_dataset = CustomDataset(
    texts=test['text'].values.tolist(),
    targets=test['target'].values.tolist()
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
    drop_last=False
)

# Create validation dataset and dataloader
val_dataset = CustomDataset(
    texts=val['text'].values.tolist(),
    targets=val['target'].values.tolist()
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
    drop_last=False
)

INFO: Seed set to 252
INFO:lightning.fabric.utilities.seed:Seed set to 252


# Tokenization function

In [18]:
def tokenize_text(text):
    """
    Tokenize the text and return PyTorch tensors with dynamic padding
    """
    encodings = tokenizer(
        text,
        return_tensors='pt',
        padding='longest',  # Dynamically pad each batch to the length of the longest sequence
        add_special_tokens=False
    )

    return encodings


# Architecture

In [19]:
def disable_dropout(model: torch.nn.Module):
    """Disable dropout in a model."""
    for module in model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        model_id = 'h2oai/h2o-danube-1.8b-chat'

        # Get LLM configuration
        config = AutoConfig.from_pretrained(model_id)

        # LoRA config - keeping your original parameters
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            target_modules='all-linear',
            lora_dropout=0.  # Keeping your original dropout value
        )

        # Load h2o-danube model - keeping your original configuration
        self.backbone = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="cuda",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        # Replace language model head with identity function
        self.backbone.lm_head = nn.Identity()

        # Apply LoRA
        self.backbone = get_peft_model(self.backbone, peft_config)
        self.backbone.print_trainable_parameters()

        # Modified classification head for 3 classes
        self.cls_head = nn.Sequential(
            nn.Linear(config.hidden_size, 768),
            nn.ReLU(),
            nn.LayerNorm(768),
            nn.Linear(768, 3)  # Changed to 3 outputs to match your classes (0:neutral, 1:positive, 2:negative)
        )

    def forward(self, input_ids, attention_mask):
        x = self.backbone(input_ids, attention_mask).logits
        logits = self.cls_head(x)[:, -1, :]  # Keep your original last token selection
        return logits

# Optimizer and Scheduler

In [20]:
def get_optimizer(model, learning_rate=0.0001, diff_lr=0.00001, weight_decay=0.01):
    """
    Get optimizer with different learning rates for specified layers.

    Args:
        model (torch.nn.Module): The neural network model.
        learning_rate (float): Learning rate for non-differential layers.
        diff_lr (float): Learning rate for differential layers.
        weight_decay (float): Weight decay (decoupled from L2 penalty) for optimizer.

    Returns:
        torch.optim.AdamW: Optimizer for the model.
    """

    # Define parameters with different learning rates and weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    differential_layers = ['backbone']

    optimizer = torch.optim.AdamW(
            [
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (not any(layer in name for layer in differential_layers))
                        and (not any(nd in name for nd in no_decay))
                    ],
                    "lr": learning_rate,
                    "weight_decay": weight_decay,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (not any(layer in name for layer in differential_layers))
                        and (any(nd in name for nd in no_decay))
                    ],
                    "lr": learning_rate,
                    "weight_decay": 0,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (any(layer in name for layer in differential_layers))
                        and (not any(nd in name for nd in no_decay))
                    ],
                    "lr": diff_lr,
                    "weight_decay": weight_decay,
                },
                {
                    "params": [
                        param
                        for name, param in model.named_parameters()
                        if (any(layer in name for layer in differential_layers))
                        and (any(nd in name for nd in no_decay))
                    ],
                    "lr": diff_lr,
                    "weight_decay": 0,
                },
            ],
            lr=learning_rate,
            weight_decay=weight_decay,
    )

    return optimizer

# Hyperameters

In [21]:
# Hyperparameters
num_epochs = 3  # Increased from 2 to 3 for better learning of sentiment patterns
learning_rate = 0.0001  # Decreased slightly for more stable training
diff_lr = 0.00001  # Keep this the same
warmup_steps = 100  # Added warmup steps for better stability
seed = 252  # Keep this the same
weight_decay = 0.01  # Keep this the same
accumulation_steps = 2  # Keep this the same

# Fine-tuning

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [24]:
# Set seed for reproducibility
L.seed_everything(seed=seed)

# Instantiate the neural network model
model = Net()
model.to(device)

# Display the names of trainable parameters
print('Here are the trainable parameters:')
for n, p in model.named_parameters():
    if p.requires_grad:
        print(n)

# Get the optimizer
optimizer = get_optimizer(
    model,
    learning_rate=learning_rate,
    diff_lr=diff_lr,
    weight_decay=weight_decay
)

# Modified scheduler with warmup
num_training_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps
)

# Add loss function
criterion = nn.CrossEntropyLoss()

# Add gradient scaler for mixed precision training
scaler = GradScaler()

INFO: Seed set to 252
INFO:lightning.fabric.utilities.seed:Seed set to 252


trainable params: 8,650,752 || all params: 1,757,932,032 || trainable%: 0.4921
Here are the trainable parameters:
backbone.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight
backbone.base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight
backbone.base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight
backbone.base_mod

  scaler = GradScaler()


In [25]:
# Part 1: Training Loop
def train_model(model, train_dataloader, optimizer, scheduler, criterion, scaler, num_epochs, accumulation_steps, device):
    start_time = time.time()

    for epoch in range(num_epochs):
        total_loss = 0
        model.train()

        for batch_idx, batch in enumerate(train_dataloader):
            prompt, targets = batch

            # Tokenize the batch
            encodings = tokenize_text(prompt)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            targets = targets.to(device)

            # Mixed precision training
            with autocast():
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, targets) / accumulation_steps

            # Backward pass
            scaler.scale(loss).backward()

            # Update weights
            if (batch_idx + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item() * accumulation_steps

            # Print progress
            if (batch_idx + 1) % 100 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                print(
                    f'Epoch: {epoch+1}/{num_epochs} | '
                    f'Batch: {batch_idx+1}/{len(train_dataloader)} | '
                    f'Avg Loss: {avg_loss:.4f}'
                )

    end_time = time.time()
    training_time = (end_time - start_time) / 60
    print(f'Total training time: {training_time:.2f} min')
    return model

# Evaluation

In [26]:
# Part 2: Evaluation Function
def evaluate_model(model, dataloader, device, prefix=""):
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f'Evaluating {prefix}'):
            prompt, targets = batch

            # Tokenize and move to device
            encodings = tokenize_text(prompt)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            # Get predictions
            with autocast():
                logits = model(input_ids, attention_mask)
                predictions = F.softmax(logits, dim=1).argmax(dim=1).cpu().numpy()

            all_predictions.extend(predictions)
            all_targets.extend(targets.numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_predictions)
    return accuracy

In [30]:
# Solution 1: Memory optimization settings
import torch
torch.cuda.empty_cache()
import gc
gc.collect()

# Set memory efficiency configurations
torch.backends.cudnn.benchmark = True
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Modified training function with memory optimizations
def train_model_memory_efficient(
    model,
    train_dataloader,
    optimizer,
    scheduler,
    criterion,
    scaler,
    num_epochs,
    accumulation_steps,
    device,
    batch_size=2  # Reduced batch size
):
    start_time = time.time()

    for epoch in range(num_epochs):
        total_loss = 0
        model.train()

        for batch_idx, batch in enumerate(train_dataloader):
            # Clear cache periodically
            if batch_idx % 10 == 0:
                torch.cuda.empty_cache()

            prompt, targets = batch

            # Process in smaller chunks if needed
            try:
                # Tokenize the batch
                encodings = tokenize_text(prompt)
                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)
                targets = targets.to(device)

                # Mixed precision training
                with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                    logits = model(input_ids, attention_mask)
                    loss = criterion(logits, targets) / accumulation_steps

                # Backward pass
                scaler.scale(loss).backward()

                # Update weights
                if (batch_idx + 1) % accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                total_loss += loss.item() * accumulation_steps

                # Print progress
                if (batch_idx + 1) % 10 == 0:  # Reduced logging frequency
                    avg_loss = total_loss / (batch_idx + 1)
                    print(
                        f'Epoch: {epoch+1}/{num_epochs} | '
                        f'Batch: {batch_idx+1}/{len(train_dataloader)} | '
                        f'Avg Loss: {avg_loss:.4f}'
                    )

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print('|WARNING: out of memory, clearing cache and skipping batch')
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    continue
                else:
                    raise e

            # Clear unnecessary tensors
            del input_ids, attention_mask, logits
            torch.cuda.empty_cache()

    end_time = time.time()
    training_time = (end_time - start_time) / 60
    print(f'Total training time: {training_time:.2f} min')
    return model

# Modified evaluation function with memory optimizations
def evaluate_model_memory_efficient(model, dataloader, device, prefix=""):
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f'Evaluating {prefix}'):
            try:
                prompt, targets = batch

                # Tokenize and move to device
                encodings = tokenize_text(prompt)
                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)

                # Get predictions
                with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                    logits = model(input_ids, attention_mask)
                    predictions = F.softmax(logits, dim=1).argmax(dim=1).cpu().numpy()

                all_predictions.extend(predictions)
                all_targets.extend(targets.numpy())

                # Clear memory
                del input_ids, attention_mask, logits
                torch.cuda.empty_cache()

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print('|WARNING: out of memory during evaluation, clearing cache and skipping batch')
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    continue
                else:
                    raise e

    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_predictions)
    return accuracy

# Modified main training and evaluation function
def train_and_evaluate_memory_efficient(
    model,
    train_dataloader,
    val_dataloader,
    test_dataloader,
    optimizer,
    scheduler,
    criterion,
    scaler,
    num_epochs,
    accumulation_steps,
    device,
    batch_size=2
):
    # Train the model
    model = train_model_memory_efficient(
        model=model,
        train_dataloader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        scaler=scaler,
        num_epochs=num_epochs,
        accumulation_steps=accumulation_steps,
        device=device,
        batch_size=batch_size
    )

    # Evaluate on all sets
    train_acc = evaluate_model_memory_efficient(model, train_dataloader, device, "train")
    val_acc = evaluate_model_memory_efficient(model, val_dataloader, device, "validation")
    test_acc = evaluate_model_memory_efficient(model, test_dataloader, device, "test")

    print(f'\nFinal Results:')
    print(f'Train Accuracy: {train_acc:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}')
    print(f'Test Accuracy: {test_acc:.4f}')

    return model, train_acc, val_acc, test_acc

# Create new dataloaders with smaller batch size
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=2,  # Reduced batch size
    shuffle=True,
    drop_last=True
)

val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=2,  # Reduced batch size
    shuffle=False
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=2,  # Reduced batch size
    shuffle=False
)

# Run the memory-efficient training and evaluation
torch.cuda.empty_cache()  # Clear memory before starting
model, train_acc, val_acc, test_acc = train_and_evaluate_memory_efficient(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    scaler=scaler,
    num_epochs=num_epochs,
    accumulation_steps=accumulation_steps,
    device=device,
    batch_size=2
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Total training time: 0.55 min


Evaluating train:   0%|          | 0/1855 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)




  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Evaluating validation:   0%|          | 0/265 [00:00<?, ?it/s]



  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Evaluating test:   0%|          | 0/531 [00:00<?, ?it/s]


Final Results:
Train Accuracy: nan
Validation Accuracy: nan
Test Accuracy: nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


| Model        | Weights   | Trainable token | Trainable layers        | Context length                            | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
|--------------|-----------|-----------------|-------------------------|-------------------------------------------|---------|---------------|--------------|----------------|-----------|
| h20-danube (1.8 B)| instruct   | last            | LoRA                    | dynamic padding (batch-wise)        | T4 (Colab free)    | 2.76 min      | 99.71%       | 99.32%         | 97.33%    |