## LLM as Feature Extractor

In [1]:
# Load the watermark extension, then report the active conda environment and
# the installed versions of the packages listed after -p.
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,sklearn

  from .autonotebook import tqdm as notebook_tqdm


torch       : 2.2.0
transformers: 4.38.2
datasets    : 2.18.0
sklearn     : 1.4.1.post1

conda environment: llm-PEFT



In [2]:
import torch

# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [13]:
import os.path as op
from datasets import load_dataset
import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from dataset_utilities import IMDBDataset

In [5]:
# Fetch the dataset, load it into `df`, and pass `df` to partition_dataset.
# All three helpers come from the local dataset_utilities module.
# NOTE(review): the following cell reads train.csv / val.csv / test.csv, which
# are presumably written by partition_dataset — confirm in dataset_utilities.
download_dataset()
df = load_dataset_into_to_dataframe()
partition_dataset(df)

100% | 80.23 MB | 0.32 MB/s | 252.60 sec elapsed

100%|██████████| 50000/50000 [00:26<00:00, 1892.02it/s]


Class distribution:


In [6]:
# Read the train / validation / test CSV files from the working directory,
# in that order, into one pandas DataFrame each.
df_train, df_val, df_test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

In [7]:
# Load the three CSV files through the "csv" loader into one DatasetDict whose
# keys are the split names "train", "validation" and "test".
imdb_dataset = load_dataset(
    "csv",
    data_files=dict(train="train.csv", validation="val.csv", test="test.csv"),
)

print(imdb_dataset)

Generating train split: 35000 examples [00:00, 115570.94 examples/s]
Generating validation split: 5000 examples [00:00, 120248.85 examples/s]
Generating test split: 10000 examples [00:00, 127783.18 examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})





In [8]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint and
# report its maximum input length and vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length: ", tokenizer.model_max_length)
print("Tokenizer vocabulary size: ", tokenizer.vocab_size)

Tokenizer input max length:  512
Tokenizer vocabulary size:  30522


In [9]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns when called on those texts with
        truncation and padding both enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [10]:
# Apply tokenize_text to every split. With batched=True and batch_size=None each
# split is handed to tokenize_text as a single batch (see the datasets `map`
# docs), so its padding=True pads the examples of a split to a common length.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map: 100%|██████████| 35000/35000 [00:08<00:00, 4289.49 examples/s]
Map: 100%|██████████| 5000/5000 [00:01<00:00, 4515.30 examples/s]
Map: 100%|██████████| 10000/10000 [00:02<00:00, 4247.06 examples/s]


In [23]:
# Show the tokenized DatasetDict, then the (rows, columns) shape of every split.
print(imdb_tokenized, imdb_tokenized.shape, sep="\n")

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})
{'train': (35000, 5), 'validation': (5000, 5), 'test': (10000, 5)}


In [11]:
# Drop the notebook's reference to the untokenized dataset; the cells below use
# imdb_tokenized. Re-running this cell on its own raises NameError because the
# name no longer exists.
del imdb_dataset

In [12]:
from transformers import AutoModel
# Load the pretrained "distilbert-base-uncased" model and move it to `device`.
# The value of the trailing expression is what the cell displays as its output.
model = AutoModel.from_pretrained("distilbert-base-uncased")
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [25]:
# Have the listed columns come back as torch tensors when the dataset is indexed
# (the next cell calls .to(device) on them). The call's result is not assigned:
# it reconfigures imdb_tokenized itself.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [26]:
# Smoke test: move the first three training examples to `device` and run them
# through the model once.
test_batch = {
    field: imdb_tokenized["train"][:3][field].to(device)
    for field in ("attention_mask", "input_ids")
}

# No gradients are needed for this forward pass.
with torch.inference_mode():
    test_output = model(**test_batch)

# Printed shape reads as (examples, sequence positions, hidden size).
print(test_output.last_hidden_state.shape)

torch.Size([3, 512, 768])


In [27]:
# Keep only sequence position 0 of each example's final hidden states, leaving
# one vector per example.
cls_token_output = test_output.last_hidden_state[:, 0, :]
cls_token_output.shape

torch.Size([3, 768])