In [1]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers
!git clone https://github.com/jmunozmendi/SNGP

Looking in indexes: https://download.pytorch.org/whl/cu121
Cloning into 'SNGP'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 59 (delta 20), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (59/59), 47.07 KiB | 3.92 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [2]:
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
import numpy as np

In [3]:
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import MultiStepLR
import torch.nn.functional as F
from torch import nn
import torch
import os
import torchvision.datasets as dset
import torchvision.transforms as transforms
#import gpytorch
import math
import tqdm

# Load dataset

In [4]:
import pandas as pd

dataset = pd.read_csv('dataset.tsv', sep='\t')

dataset['text'] = dataset['text'].astype(str)

dataset.head()

Unnamed: 0,page_id,text,label,Pattern Category
0,1012,FLASH SALE | LIMITED TIME ONLY Shop Now,1,Urgency
1,158,Pillowcases & Shams,0,Not Dark Pattern
2,108,Write a review,0,Not Dark Pattern
3,1425,"To start your return, simply click on the foll...",0,Not Dark Pattern
4,1658,newsletter signup (privacy policy),0,Not Dark Pattern


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def preprocess_function(examples):
    return tokenizer(examples,
                           padding='max_length',  # Pad to max_length
                           truncation=True,       # Truncate to max_length
                           max_length=200)        # Specify the max length


In [7]:
dataset['tokenized_text'] = dataset['text'].apply(lambda x: preprocess_function(x))

In [8]:
input_ids = dataset['tokenized_text'].apply(lambda x: x['input_ids'])
attention_masks = dataset['tokenized_text'].apply(lambda x: x['attention_mask'])

In [9]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


input_ids_tensor = torch.tensor(input_ids)
attention_masks_tensor = torch.tensor([item for item in attention_masks])
labels_tensor = torch.tensor(dataset['label'].values)

# Custom Dataset class
class DistilBERTDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Return a tuple where the first element is a tuple of input_ids and attention_mask,
        # and the second element is labels
        return (self.input_ids[idx], self.attention_masks[idx]), self.labels[idx]


dataset = DistilBERTDataset(input_ids_tensor, attention_masks_tensor, labels_tensor)

# Split dataset
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = train_test_split(dataset, train_size=train_size, test_size=test_size, stratify=labels_tensor)


## Creating the DistilBERT Model

In [10]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self, num_outputs = 768):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, num_outputs)


    def forward(self, X):
        (input_ids, attention_mask) = X

        output_1 = self.l1(input_ids=input_ids.long(), attention_mask=attention_mask.long())
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        output = self.pre_classifier(pooler)

        return output


num_inputs_gp = 768
feature_extractor = DistillBERTClass(num_outputs = num_inputs_gp)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Creating the SNGP Model

In [11]:
from SNGP.sngp import SNGP

In [12]:
model = SNGP(out_features=2, backbone=feature_extractor, backbone_output_features = num_inputs_gp, num_inducing=128, momentum = 0)


In [13]:
print(model)

SNGP(
  (rff): Sequential(
    (0): DistillBERTClass(
      (l1): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(in_features=768, out_features=768, bias=True)
                (k_lin): Linear(in_features=768, out_features=768, bias=True)
                (v_lin): Linear(in_features=768, out_features=768, bias=True)
                (out_lin): Linear(in_features=768, out_features=768, bias=True)
              )
              (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

## Defining Training and Testing Code

In [14]:
model.train_model(dataset=train_dataset, epochs=10, batch_size=32, weight_decay=0.01, lr = 2e-5)

[92m[1m[INFO][0m[0m [92mTraining in progress... 10%  Mean Loss...0.41792167817889636[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 20%  Mean Loss...0.26639947656668345[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 30%  Mean Loss...0.20225297095317205[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 40%  Mean Loss...0.16120540023874097[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 50%  Mean Loss...0.13420916672113303[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 60%  Mean Loss...0.11386181199445493[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 70%  Mean Loss...0.0988322501766682[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 80%  Mean Loss...0.08705429205132285[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 90%  Mean Loss...0.07808550838330901[0m
[1A[2K[92m[1m[INFO][0m[0m [92mTraining in progress... 100%  Mean Loss...0.0736375219

In [15]:
info = model.predict(dataset=test_dataset, batch_size=256)

[92m[1m[INFO][0m[0m [92mInference in progress... 50%[0m
[1A[2K[92m[1m[INFO][0m[0m [92mInference in progress... 100%[0m
[1A[2K

In [16]:
test_x, test_y = zip(*test_dataset)

In [17]:
test_x, test_y = np.array(test_x), np.array(test_y)

  test_x, test_y = np.array(test_x), np.array(test_y)
  test_x, test_y = np.array(test_x), np.array(test_y)


In [18]:
result = info.decision == test_y

In [19]:
print("Accuracy: ", result.sum() / len(result))

Accuracy:  0.9830508474576272


In [20]:
info

SNGPInfo(decision=array([1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1,