In [1]:
#--------------------------------------------------------------------------------------
#----------------------------HUGGINGFACE DATASET --------------------------------------
#--------------------------------------------------------------------------------------
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification


torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from datasets import *
dataset = load_dataset('json', split='train', data_files='./emos/data.jsonl')

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# dataset = dataset.select(range(1000))

# 90% train, 10% test + validation
train_testvalid = dataset.train_test_split(test_size=0.1)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# # gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_test_valid_dataset

tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
valid_dataset = tokenized_datasets["valid"].shuffle(seed=42)
test_dataset = tokenized_datasets["test"].shuffle(seed=42)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 900/900 [00:00<00:00, 7043.24 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 6714.54 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 7051.15 examples/s]


In [2]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format (type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [3]:
import pytorch_lightning as pl

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
from torchmetrics import Accuracy

torch.set_float32_matmul_precision('medium')

import re
from collections import Counter, OrderedDict

class EmOSDataModule(pl.LightningDataModule):
    def __init__(self) -> None:
        super().__init__()
        pass

    def prepare_data(self):
        super().prepare_data()
        pass

    def setup(self, stage: str):
        super().setup(str)
        pass
    
    def train_dataloader(self):
        return DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=31)
    
    def val_dataloader(self):
        return DataLoader(valid_dataset, batch_size=32, shuffle=False, num_workers=31)
    
    def test_dataloader(self):
        return DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=31)

In [12]:
class DistilBertNetwork(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.valid_acc   = Accuracy(task="multiclass", num_classes=10)
        self.test_acc  = Accuracy(task="multiclass", num_classes=10)

        # self.save_hyperparameters()

        # self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        # self.model = AutoModelForSequenceClassification.from_pretrained(
        #     model_name_or_path, config=self.config
        # )
        # self.metric = load_metric(
        #     'glue', self.hparams.task_name, experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        # )

        self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=6)
        self.model.to(DEVICE)

        

    def forward(self, input_ids,attention_mask, labels):
        out = self.model(input_ids,attention_mask=attention_mask, labels = labels)
        return out
    
    def training_step(self, batch, batch_idx):
        self.model.train()
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)
        
        ### Forward pass
        outputs = self(input_ids,
        attention_mask=attention_mask,
        labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        preds = torch.argmax(logits, dim=1)
        self.train_acc.update(preds, labels)
        self.log("train loss: ",loss, prog_bar=True)

        return loss
    
    def on_train_epoch_end(self):
        self.log("train acc :", self.train_acc.compute())

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        outputs = self.model(input_ids,attention_mask=attention_mask)

        logits = outputs['logits']
        
        loss = nn.functional.cross_entropy(logits, labels)
        
        preds = torch.argmax(logits, dim=1).float()
        self.valid_acc.update(preds, labels)
        
        self.log("valid_loss", loss, prog_bar=True)
        self.log("valid_acc", self.valid_acc.compute(), prog_bar=True)

        return {"preds": preds, "labels": labels}

    def test_step(self, batch, batch_idx):
        pass
        # text_batch, label_batch, lengths = batch
        # pred = self(text_batch, lengths)[:, 0]
        # loss = nn.functional.binary_cross_entropy(pred, label_batch)

        # pred = (pred >= 0.5).float()
        # self.test_acc.update(pred, label_batch)
        # self.log("test_loss", loss, prog_bar=True)
        # self.log("test_acc", self.test_acc.compute(), prog_bar=True)
        # return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=5e-5)
        return optimizer

In [13]:
torch.manual_seed(1)
emos_dm = EmOSDataModule()
emosClassifier = DistilBertNetwork()

trainer = pl.Trainer(max_epochs=4, accelerator="auto", enable_checkpointing=True)

trainer.fit(model=emosClassifier, datamodule=emos_dm)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                                | Params
------------------------------------------------------------------
0 | train_acc | MulticlassAccuracy                  | 0     
1 | valid_acc | MulticlassAccuracy                  | 0     
2 | test_acc  | MulticlassAccuracy                  | 0     
3 | model     | DistilBertForSequenceClassification | 67.0 M
------------------------------------------------------------------
67.0 M    Tra

                                                                           

/home/tej/Documents/Courses/Learning/ML_With_PyTorch_Scikit_Practice/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 3: 100%|██████████| 2/2 [00:02<00:00,  0.77it/s, v_num=9, train loss: =1.460, valid_loss=1.410, valid_acc=0.394]

`Trainer.fit` stopped: `max_epochs=4` reached.


Epoch 3: 100%|██████████| 2/2 [00:03<00:00,  0.57it/s, v_num=9, train loss: =1.460, valid_loss=1.410, valid_acc=0.394]
