<a href="https://colab.research.google.com/github/harshitadd/DP-NLP/blob/main/DPLSTM_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torchcsprng==0.1.3+cu101 -f https://download.pytorch.org/whl/torch_stable.html --quiet 

In [2]:
pip install opacus --quiet

In [3]:
pip install codecarbon --quiet

In [4]:
import torch
import zipfile
import urllib.request
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from collections import Counter
from pathlib import Path
from statistics import mean
from opacus import PrivacyEngine
from opacus.layers import DPLSTM
from opacus.utils.uniform_sampler import UniformWithReplacementSampler
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm 
from torch.nn.utils.rnn import pack_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from codecarbon import track_emissions


In [5]:
DATA_DIR = "/content/"

In [None]:
def download_and_extract(data_dir):
    print("Extracting Train zip...")
    filename = "train.csv.zip"
    with zipfile.ZipFile(filename) as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(filename)
    print("Completed!")

    print("Extracting Test zip...")
    filename = "test.csv.zip"
    with zipfile.ZipFile(filename) as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(filename)
    print("Completed!")

download_and_extract(DATA_DIR)

In [7]:
train_path =  '/content/train.csv'
dev_path = '/content/test.csv'

df_train = pd.read_csv(train_path)[:10000] # Slicing to take a smaller subset of the data 
df_test = pd.read_csv(dev_path)[:1000]
df_train = df_train.drop('Title', axis = 1)
df_test = df_test.drop('Title', axis = 1)

In [8]:
df = pd.concat([df_train, df_test])

In [9]:

class CharByteEncoder(nn.Module):
    """
    This encoder takes a UTF-8 string and encodes its bytes into a Tensor. It can also
    perform the opposite operation to check a result.
    Examples:
    >>> encoder = CharByteEncoder()
    >>> t = encoder('Ślusàrski')  # returns tensor([256, 197, 154, 108, 117, 115, 195, 160, 114, 115, 107, 105, 257])
    >>> encoder.decode(t)  # returns "<s>Ślusàrski</s>"
    """

    def __init__(self):
        super().__init__()
        self.start_token = "<s>"
        self.end_token = "</s>"
        self.pad_token = "<pad>"

        self.start_idx = 256
        self.end_idx = 257
        self.pad_idx = 258

    def forward(self, s: str, pad_to=0) -> torch.LongTensor:

        encoded = s.encode()
        n_pad = pad_to - len(encoded) if pad_to > len(encoded) else 0
        return torch.LongTensor(
            [self.start_idx]
            + [c for c in encoded]  # noqa
            + [self.end_idx]
            + [self.pad_idx for _ in range(n_pad)]
        )

    def decode(self, char_ids_tensor: torch.LongTensor) -> str:
        char_ids = char_ids_tensor.cpu().detach().tolist()

        out = []
        buf = []
        for c in char_ids:
            if c < 256:
                buf.append(c)
            else:
                if buf:
                    out.append(bytes(buf).decode())
                    buf = []
                if c == self.start_idx:
                    out.append(self.start_token)
                elif c == self.end_idx:
                    out.append(self.end_token)
                elif c == self.pad_idx:
                    out.append(self.pad_token)

        if buf:  # in case some are left
            out.append(bytes(buf).decode())
        return "".join(out)

    def __len__(self):
        return 259

In [10]:
class NamesDataset(Dataset):
    def __init__(self, df):
        self.labels = df['Class Index']
        self.data = df['Description']
        self.encoder = CharByteEncoder()
        self.processed = self.process_samples()

    def __getitem__(self, i):
        return self.processed[i]

    def __len__(self):
        return len(self.processed)

    def process_samples(self):
        processed = []
        for d, l in zip(self.data, self.labels):
            processed.append((self.encoder(d.strip()), torch.tensor(l).long()))
        return processed


VOCAB_SIZE = 256 + 3  # 256 alternatives in one byte, plus 3 special characters.


In [22]:
class CharNNClassifier(nn.Module):
    def __init__(
        self,
        embedding_size,
        hidden_size,
        output_size,
        num_lstm_layers=1,
        bidirectional=False,
        vocab_size=VOCAB_SIZE,
    ):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = DPLSTM(
            embedding_size,
            hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.out_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)  # -> [B, T, D]
        x, _ = self.lstm(x, hidden)  # -> [B, T, H]
        x = x[:, -1, :]  # -> [B, H]
        x = self.out_layer(x)  # -> [B, C]
        return x


def padded_collate(batch, padding_idx=0):
    x = pad_sequence(
        [elem[0] for elem in batch], batch_first=True, padding_value=padding_idx
    )
    y = torch.stack([elem[1] for elem in batch]).long()

    return x, y

@track_emissions
def train(model, criterion, optimizer, train_loader, epoch, device="cuda:0"):
    model.train()

    accs = []
    losses = []
    for x, y in tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)

        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        preds = logits.argmax(-1)
        n_correct = float(preds.eq(y).sum())
        batch_accuracy = n_correct / len(y)

        accs.append(batch_accuracy)
        losses.append(float(loss))

    printstr = (
        f"\t Epoch {epoch}. Accuracy: {mean(accs):.6f} | Loss: {mean(losses):.6f}"
    )
    try:
        privacy_engine = optimizer.privacy_engine
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" | (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"
    except AttributeError:
        pass
    print(printstr)
    return


def test(model, test_loader, privacy_engine, device="cuda:0"):
    model.eval()

    accs = []
    with torch.no_grad():
        for x, y in tqdm(test_loader):
            x = x.to(device)
            y = y.to(device)

            preds = model(x).argmax(-1)
            n_correct = float(preds.eq(y).sum())
            batch_accuracy = n_correct / len(y)

            accs.append(batch_accuracy)
    printstr = "\n----------------------------\n" f"Test Accuracy: {mean(accs):.6f}"
    if privacy_engine:
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"
    print(printstr + "\n----------------------------\n")
    return


In [23]:
df = pd.concat([df_train, df_test])

In [24]:
ds = NamesDataset(df)
secure_rng = False
train_split = 0.8
test_every = 5
batch_size = 800
epochs = 5
learning_rate = 2.0

train_len = int(train_split * len(ds))
test_len = len(ds) - train_len

# Privacy engine hyper-parameters
max_per_sample_grad_norm = 1.5
delta = 8e-5
epsilon = 12.0


In [25]:
train_ds, test_ds = torch.utils.data.random_split(ds, [train_len, test_len], generator=None)

In [26]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{train_len} samples for training, {test_len} for testing")

    if secure_rng:
        try:
            import torchcsprng as prng
        except ImportError as e:
            msg = (
                "To use secure RNG, you must install the torchcsprng package! "
                "Check out the instructions here: https://github.com/pytorch/csprng#installation"
            )
            raise ImportError(msg) from e

        generator = prng.create_random_device_generator("/dev/urandom")

    else:
        generator = None

    train_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, test_len], generator=generator
    )
    embedding_size = 64
    hidden_size = 128  # Number of neurons in hidden layer after LSTM
    n_lstm_layers = 1
    bidirectional_lstm = False

    model = CharNNClassifier(
        embedding_size,
        hidden_size,
        len(ds.labels),
        n_lstm_layers,
        bidirectional_lstm,
    )
    model = model.to(device)

    train_ds, test_ds = torch.utils.data.random_split(
        ds, [train_len, test_len], generator=generator
    )
    

    sample_rate = batch_size / len(train_ds)
    train_loader = DataLoader(
        train_ds,
        num_workers=8,
        pin_memory=True,
        generator=generator,
        batch_sampler=UniformWithReplacementSampler(
            num_samples=len(train_ds), sample_rate=sample_rate, generator=generator
        ),
        collate_fn=padded_collate,
    )

    test_loader = DataLoader(
        test_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
        collate_fn=padded_collate,
    )
    sigma = 1 
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    disable_dp = False 
    if not disable_dp:
        privacy_engine = PrivacyEngine(
            model,
            sample_rate=sample_rate,
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier = sigma, 
            max_grad_norm=max_per_sample_grad_norm,
            target_delta=delta,
            secure_rng=secure_rng,
        )
        privacy_engine.attach(optimizer)
    else:
        privacy_engine = None

    print("Train stats For Training With Privacy: \n")
    for epoch in tqdm(range(epochs)):
        train(model, criterion, optimizer, train_loader, epoch, device=device)
        if test_every:
            if epoch % test_every == 0:
                test(model, test_loader, privacy_engine, device=device)

    test(model, test_loader, privacy_engine, device=device)

    ## Without Privacy 
    model_nodp = CharNNClassifier(
    embedding_size,
    hidden_size,
    len(ds.labels),
    n_lstm_layers,
    bidirectional_lstm,
    ).to(device)

    optimizer_nodp = torch.optim.SGD(model_nodp.parameters(), lr=0.5)
    
    print("Train stats For Training Without Privacy: \n")
    for epoch in tqdm(range(epochs)):
      train(model_nodp, criterion, optimizer_nodp, train_loader, epoch, device=device)
    if test_every:
        if epoch % test_every == 0:
            test(model_nodp, test_loader, None, device=device)

    test(model_nodp, test_loader, None, device=device)

In [27]:
main()

8800 samples for training, 2200 for testing
Train stats For Training With Privacy: 



  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 0. Accuracy: 0.228003 | Loss: 3.544246 | (ε = 3.39, δ = 8e-05) for α = 4.7


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.




----------------------------
Test Accuracy: 0.242917 (ε = 3.39, δ = 8e-05) for α = 4.7
----------------------------



CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 1. Accuracy: 0.259166 | Loss: 2.052089 | (ε = 4.11, δ = 8e-05) for α = 4.3


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 2. Accuracy: 0.257544 | Loss: 1.789732 | (ε = 4.68, δ = 8e-05) for α = 4.0


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 3. Accuracy: 0.251079 | Loss: 1.729474 | (ε = 5.18, δ = 8e-05) for α = 3.9


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 4. Accuracy: 0.249514 | Loss: 1.604670 | (ε = 5.63, δ = 8e-05) for α = 3.7



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



----------------------------
Test Accuracy: 0.261667 (ε = 5.63, δ = 8e-05) for α = 3.7
----------------------------

Train stats For Training Without Privacy: 



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 0. Accuracy: 0.243963 | Loss: 4.461931


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 1. Accuracy: 0.248444 | Loss: 1.953448


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 2. Accuracy: 0.250055 | Loss: 1.753412


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 3. Accuracy: 0.248132 | Loss: 1.631980


CODECARBON : No CPU tracking mode found. Falling back on CPU constant mode.
CODECARBON : Failed to match CPU TDP constant. Falling back on a global constant.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


	 Epoch 4. Accuracy: 0.255941 | Loss: 1.559652



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



----------------------------
Test Accuracy: 0.242917
----------------------------

