In [1]:
import torch

# Assuming you want to check memory for the first GPU
gpu_index = 0

# Get total GPU memory
total_memory = torch.cuda.get_device_properties(gpu_index).total_memory
# Convert bytes to GB for easier interpretation
total_memory_gb = total_memory / (1024 ** 3)

# Get current GPU memory allocated
current_memory_allocated = torch.cuda.memory_allocated(gpu_index)
# Convert bytes to GB
current_memory_allocated_gb = current_memory_allocated / (1024 ** 3)

# Get current GPU memory reserved by PyTorch's memory allocator
current_memory_reserved = torch.cuda.memory_reserved(gpu_index)
# Convert bytes to GB
current_memory_reserved_gb = current_memory_reserved / (1024 ** 3)

print(f"Total GPU Memory (GB): {total_memory_gb:.2f}")
print(f"Current Memory Allocated (GB): {current_memory_allocated_gb:.2f}")
print(f"Current Memory Reserved (GB): {current_memory_reserved_gb:.2f}")

Total GPU Memory (GB): 8.00
Current Memory Allocated (GB): 0.00
Current Memory Reserved (GB): 0.00


In [25]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from pandas import read_parquet
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
import os
from tqdm import tqdm
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")
bert_model.eval()
bert_model.to('cuda:0')


def pooling_embedding(tokenized_input, embeddings):
    processed_embedding = []
    current_embedding = []
    previous_word_idx = None

    for i, word_idx in enumerate(tokenized_input):
        if word_idx is None:
            continue

        if word_idx == previous_word_idx:
            current_embedding.append(embeddings[i])
        else:
            if current_embedding:
                processed_embedding.append(torch.mean(
                    torch.stack(current_embedding), dim=0))
                current_embedding.clear()
            current_embedding.append(embeddings[i])
            previous_word_idx = word_idx

    if current_embedding:
        processed_embedding.append(torch.mean(
            torch.stack(current_embedding), dim=0))

    return torch.stack(processed_embedding)


class NERDataset(Dataset):
    def __init__(self, data_file_path, tokenizer, bert_model):
        self.raw_dataset = read_parquet(data_file_path)
        self.tokenizer = tokenizer
        self.bert_model = bert_model

    def __len__(self):
        return self.raw_dataset['tokens'].size

    def __getitem__(self, index):
        current_row=self.raw_dataset.iloc[index]
        sentence_words =current_row['tokens'].tolist()
        encoded_words = tokenizer(sentence_words, return_tensors='pt',
                                  is_split_into_words=True, truncation=True).to("cuda:0")
        embeddings = self.bert_model(**encoded_words)
        pooled_embeddings = pooling_embedding(
            encoded_words.word_ids(), embeddings.last_hidden_state[0])
        labels = torch.tensor(
            current_row['ner_tags'].astype(int)).to("cuda:0")
        if pooled_embeddings.shape[0] < labels.shape[0]:
            labels = labels[:pooled_embeddings.shape[0]]
        assert pooled_embeddings.shape[0] == labels.shape[
            0], f"pooled_embeddings shape {pooled_embeddings.shape} and labels shape {labels.shape} are not equal, index {index}"
        return pooled_embeddings, labels

In [72]:
first_param_device = next(bert_model.parameters()).device
print(first_param_device)

cuda:0


In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    input_ids, label_ids = zip(*batch)  # Unpack original sentences too
    input_ids = pad_sequence(
        [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=0)
    label_ids = pad_sequence(
        [torch.tensor(ids) for ids in label_ids], batch_first=True, padding_value=-100)
    return input_ids, label_ids  # Return original sentences as well


# from colleval import evaluate


class BLSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_labels, dropout):
        super(BLSTMModel, self).__init__()
        self.blstm = nn.LSTM(embedding_dim, hidden_dim,
                             num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, num_labels)

    def forward(self, input_ids):
        blstm_out, _ = self.blstm(input_ids)
        blstm_out = self.dropout(blstm_out)
        linear_out = self.linear(blstm_out)
        elu_out = self.elu(linear_out)
        logits = self.classifier(elu_out)
        return logits


# Hyperparameters
embedding_dim = bert_model.config.hidden_size
hidden_dim = 256
output_dim = 128
dropout = 0.33
learning_rate = 0.001
batch_size = 10
num_epochs = 60
num_labels = 9

In [18]:
read_parquet("../../data/merge/train.parquet")
train_dataset = NERDataset(
    "../../data/merge/train.parquet", tokenizer, bert_model)
train_dataset[0]

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:

lstm_model = BLSTMModel(embedding_dim, hidden_dim,
                        output_dim, num_labels, dropout)
# model.load_state_dict(torch.load('best_model7514.pt'))
lstm_model.to("cuda:0")
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(lstm_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.9, end_factor=0.1, total_iters=60)
# optimizer=optim.Adam(model.parameters(), lr=0.001)
# Learning rate scheduling
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
train_dataset = NERDataset(
    "../../data/english/train-00000-of-00001.parquet", tokenizer, bert_model)
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Create validation dataset and dataloader
val_dataset = NERDataset(
    "../../data/english/validation-00000-of-00001.parquet", tokenizer, bert_model)
val_dataloader = DataLoader(
    val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Variables for early stopping
best_val_loss = float('inf')
best_f1_score = 0
patience = 15  # Number of epochs to wait for improvement
epochs_without_improvement = 0

# Training loop with early stopping
for epoch in range(num_epochs):
    # Training step
    lstm_model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        input_ids, labels = batch
        input_ids = input_ids.to("cuda:0")
        labels = labels.to("cuda:0")
        optimizer.zero_grad()
        logits = lstm_model(input_ids)
        loss = criterion(logits.view(-1, num_labels), labels.view(-1))
        train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), max_norm=5)
        optimizer.step()
    scheduler.step()

    # Validation step
    lstm_model.eval()
    true_labels = []
    pred_labels = []
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, labels = batch
            input_ids = input_ids.to("cuda:0")
            labels = labels.to("cuda:0")
            logits = lstm_model(input_ids)
            # predictions = torch.argmax(logits, dim=-1)
            # for i, sentence in enumerate(input_ids):
            #     for j, word_id in enumerate(sentence):
            #         if word_id.item() == vocab['<PAD>']:
            #             continue
            #         # Get the predicted label name
            #         predicted_label = reverse_label_map[predictions[i][j].item()]# Map the label ID back to its string representation
            #         true_label = reverse_label_map[labels[i][j].item()]  # Map the label ID back to its string representation
            #         true_labels.append(true_label)
            #         pred_labels.append(predicted_label)
            # loss = criterion(logits.view(-1, num_labels), labels.view(-1))
            val_loss += loss.item()
    val_loss /= len(val_dataloader)
    # precision,recall,f1=evaluate(true_labels,pred_labels,False)

    train_loss /= len(train_dataloader)
    print(
        f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

    # Early stopping logic
    if val_loss < best_val_loss:
        print(
            f"Validation loss improved from {best_val_loss:.4f}--->{val_loss:.4f}")
        best_val_loss = val_loss
        torch.save(lstm_model.state_dict(), 'best_model3.pt')
        epochs_without_improvement = 0
    # if best_f1_score < f1:
    #     print(f"Validation f1 improved from {best_f1_score:.4f}--->{f1:.4f}")
    #     best_f1_score = f1
    #     torch.save(lstm_model.state_dict(), 'best_model3.pt')
    #     epochs_without_improvement = 0
    # else:
    #     epochs_without_improvement += 1
    #     if epochs_without_improvement >= patience:
    #         print(f'Early stopping at epoch {epoch + 1}')
    #         break

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import lightning.pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics.functional import accuracy
from pytorch_lightning.loggers import TensorBoardLogger
from colleval import evaluate
from datasets import load_metric
wikineural_tags_list = ['O', 'B-PER', 'I-PER', 'B-ORG',
                        'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
wikineural_tags_to_int = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3,
                          'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
wikineural_int_to_tags = {v: k for k, v in wikineural_tags_to_int.items()}


train_dataset = NERDataset(
    "../../data/merge/train.parquet", tokenizer, bert_model)


# Create validation dataset and dataloader
val_dataset = NERDataset(
    "../../data/merge/dev.parquet", tokenizer, bert_model)
test_dataset = NERDataset(
    "../../data/merge/test.parquet", tokenizer, bert_model)
# Create a TensorBoard logger
logger = TensorBoardLogger("logs/", name="my_model_lstm")
seqeval_metric = load_metric("seqeval")


class BLSTMModelLightning(pl.LightningModule):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_labels, dropout):
        super(BLSTMModelLightning, self).__init__()
        self.num_labels = num_labels
        self.criterion = nn.CrossEntropyLoss()
        self.blstm = nn.LSTM(embedding_dim, hidden_dim,
                             num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(output_dim, num_labels)

    def forward(self, input_ids):
        blstm_out, _ = self.blstm(input_ids)
        blstm_out = self.dropout(blstm_out)
        linear_out = self.linear(blstm_out)
        elu_out = self.elu(linear_out)
        logits = self.classifier(elu_out)
        return logits

    def training_step(self, batch, batch_idx):
        input_ids, labels = batch
        logits = self(input_ids)
        loss = self.criterion(
            logits.view(-1, self.num_labels), labels.view(-1))
        preds = torch.argmax(logits, dim=2).flatten()
        mask = labels.view(-1) != -100

        # Apply the mask to predictions and labels
        preds = torch.argmax(logits, dim=2).view(-1)[mask]
        labels_flat = labels.view(-1)[mask]
        # acc = accuracy(preds, labels_flat, task="multiclass", num_classes=self.num_labels)
        self.log("train_loss", loss, on_step=False,
                 on_epoch=True, prog_bar=True)
        # self.log("train_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, labels = batch
        logits = self(input_ids)
        loss = self.criterion(
            logits.view(-1, self.num_labels), labels.view(-1))
        preds = torch.argmax(logits, dim=2).flatten()
        # Create a mask to exclude padding tokens
        mask = labels.view(-1) != -100

        # Apply the mask to predictions and labels
        preds = torch.argmax(logits, dim=2).view(-1)[mask]
        labels_flat = labels.view(-1)[mask]
        # acc = accuracy(preds, labels_flat, task="multiclass", num_classes=self.num_labels)
        true_labels = []
        pred_labels = []
        for i, label in enumerate(labels_flat):
            true_labels.append(wikineural_int_to_tags[label.item()])
            pred_labels.append(wikineural_int_to_tags[preds[i].item()])
        results = seqeval_metric.compute(
            predictions=[pred_labels], references=[true_labels])
        self.log("val_seqeval_f1", results['overall_f1'],
                 on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        # self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=learning_rate)
        return optimizer

    def test_step(self, batch, batch_idx):
        input_ids, labels = batch
        logits = self(input_ids)
        loss = self.criterion(
            logits.view(-1, self.num_labels), labels.view(-1))
        preds = torch.argmax(logits, dim=2).flatten()
        # Create a mask to exclude padding tokens
        mask = labels.view(-1) != -100

        # Apply the mask to predictions and labels
        preds = torch.argmax(logits, dim=2).view(-1)[mask]
        labels_flat = labels.view(-1)[mask]
        acc = accuracy(preds, labels_flat, task="multiclass",
                       num_classes=self.num_labels)
        true_labels = []
        pred_labels = []
        for i, label in enumerate(labels_flat):
            true_labels.append(wikineural_int_to_tags[label.item()])
            pred_labels.append(wikineural_int_to_tags[preds[i].item()])
        results = seqeval_metric.compute(
            predictions=[pred_labels], references=[true_labels])
        self.log("test_loss", loss, on_step=False,
                 on_epoch=True, prog_bar=True)
        self.log("test_acc", acc, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test_seqeval_f1", results['overall_f1'],
                 on_step=False, on_epoch=True, prog_bar=True)


class NERDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, test_dataset, batch_size):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpointt",
    save_top_k=1,
    verbose=True,
    monitor="val_seqeval_f1",
    mode="max"
)

lstm_model = BLSTMModelLightning(
    embedding_dim, hidden_dim, output_dim, num_labels, dropout)
data_module = NERDataModule(
    train_dataset, val_dataset, test_dataset, batch_size)

trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=num_epochs,
    accelerator="gpu",
    devices=1,
    enable_checkpointing=True,
    enable_progress_bar=True,
    logger=logger,
)

trainer.fit(lstm_model, datamodule=data_module)
# trainer.test(lstm_model,datamodule=data_module,ckpt_path="/home/hjz/544/CSCI544-FinalProject/models/LSTM/checkpoints/best-checkpoint-v1.ckpt")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/hjz/.pyenv/versions/3.11.7/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /home/hjz/544/CSCI544-FinalProject/models/LSTM/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | criterion  | CrossEntropyLoss | 0     
1 | blstm      | LSTM             | 2.1 M 
2 | dropout    | Dropout          | 0     
3 | linear     | Linear           | 65.7 K
4 | elu        | ELU              | 0     
5 | classifier | Linear           | 1.2 K 
----------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/hjz/.pyenv/versions/3.11.7/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
  input_ids = pad_sequence([torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=0)
  label_ids = pad_sequence([torch.tensor(ids) for ids in label_ids], batch_first=True, padding_value=-100)
  _warn_prf(average, modifier, msg_start, len(result))
/home/hjz/.pyenv/versions/3.11.7/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]