<a href="https://colab.research.google.com/github/harshitadd/DP-NLP/blob/main/LSTMNewsClassificationDP_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
# torchcsprng is pinned to the +cu101 build served from the PyTorch wheel index.
pip install torchcsprng==0.1.3+cu101 -f https://download.pytorch.org/whl/torch_stable.html
# This notebook uses the legacy Opacus API (PrivacyEngine(model, ...) followed by
# privacy_engine.attach(optimizer)), which was removed in Opacus 1.0, so stay on
# the 0.x release line instead of installing whatever is latest.
pip install "opacus<1.0"

In [None]:
import zipfile
import urllib.request
import os
import torch
import numpy as np
from tqdm import tqdm
import torch.nn as nn
from tqdm.notebook import tqdm
import pandas as pd
from torch.utils.data import Dataset
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler
from opacus import PrivacyEngine
from torch.nn.utils.rnn import pack_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import TensorDataset

In [None]:
# Directory the dataset archives are extracted into (passed to download_and_extract).
DATA_DIR = "/content/"

In [None]:
def download_and_extract(data_dir):
    """Unpack the train and test archives into ``data_dir`` and delete them.

    Despite the name, nothing is downloaded here: ``train.csv.zip`` and
    ``test.csv.zip`` must already exist in the current working directory.

    Args:
        data_dir: Destination directory handed to ``ZipFile.extractall``.
    """
    jobs = (
        ("Extracting Train zip...", "train.csv.zip"),
        ("Extracting Test zip...", "test.csv.zip"),
    )
    for banner, archive_name in jobs:
        print(banner)
        with zipfile.ZipFile(archive_name) as archive:
            archive.extractall(data_dir)
        # The archive is removed once its contents have been written out.
        os.remove(archive_name)
        print("Completed!")

download_and_extract(DATA_DIR)

Extracting Train zip...
Completed!
Extracting Test zip...
Completed!


In [None]:
# Build the CSV paths from DATA_DIR so they always point at the directory the
# archives were extracted into, instead of repeating the literal "/content/".
train_path = os.path.join(DATA_DIR, 'train.csv')
dev_path = os.path.join(DATA_DIR, 'test.csv')

# Keep only the first rows of each file (a smaller subset of the data) and drop
# the 'Title' column; the dataset class reads only 'Description' and 'Class Index'.
df_train = pd.read_csv(train_path)[:10000].drop(columns='Title')
df_test = pd.read_csv(dev_path)[:1000].drop(columns='Title')

In [None]:
# Stack the train and test subsets into one frame; it is re-split at random later on.
# NOTE(review): without ignore_index=True the row labels of the two frames repeat.
# That looks harmless while rows are only iterated positionally — confirm.
df = pd.concat([df_train, df_test])

In [None]:
class CharByteEncoder(nn.Module):
    """Byte-level text encoder with three extra special symbols.

    Ids 0-255 are the raw bytes of the encoded string; 256, 257 and 258 are the
    start, end and padding markers respectively.
    """

    def __init__(self):
        super().__init__()
        self.start_token, self.start_idx = "<s>", 256
        self.end_token, self.end_idx = "</s>", 257
        self.pad_token, self.pad_idx = "<pad>", 258

    def forward(self, s: str, pad_to=0) -> torch.LongTensor:
        """Encode ``s`` as ``[start] + bytes + [end] + padding``.

        Args:
            s: Text to encode.
            pad_to: Minimum number of payload bytes. Padding ids are appended
                when the encoded text is shorter; the start and end markers are
                not counted towards this length.

        Returns:
            1-D ``LongTensor`` of ids.
        """
        raw = list(s.encode())
        n_pad = max(pad_to - len(raw), 0)
        ids = [self.start_idx, *raw, self.end_idx] + [self.pad_idx] * n_pad
        return torch.LongTensor(ids)

    def decode(self, char_ids_tensor: torch.LongTensor) -> str:
        """Turn a tensor of ids back into text, spelling out the special markers.

        Runs of byte ids (< 256) are decoded together; ids >= 256 that are not
        one of the three markers only end the current run and emit nothing.
        """
        # Inserted in reverse priority so that, should two ids ever coincide,
        # start wins over end, which wins over pad.
        specials = {
            self.pad_idx: self.pad_token,
            self.end_idx: self.end_token,
            self.start_idx: self.start_token,
        }

        pieces = []
        pending = []
        for idx in char_ids_tensor.cpu().detach().tolist():
            if idx < 256:
                pending.append(idx)
                continue
            # A non-byte id closes the current run of bytes.
            if pending:
                pieces.append(bytes(pending).decode())
                pending = []
            marker = specials.get(idx)
            if marker is not None:
                pieces.append(marker)

        if pending:  # trailing bytes with no marker after them
            pieces.append(bytes(pending).decode())
        return "".join(pieces)

    def __len__(self):
        """Vocabulary size: 256 byte values plus the 3 special ids."""
        return 259

In [None]:
class NewsClassification(Dataset):
    """In-memory dataset of ``(encoded description, label)`` pairs.

    Every description is encoded once at construction time, so item access is
    a plain list lookup.
    """

    def __init__(self, df):
        """Build the dataset.

        Args:
            df: Frame with a ``'Description'`` (text) column and a
                ``'Class Index'`` (label) column.
        """
        self.data = df['Description']
        self.labels = df['Class Index']
        self.encoder = CharByteEncoder()
        self.processed = self.process_samples()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.processed[i]

    def process_samples(self):
        """Encode each stripped description and pair it with its label as an int64 tensor."""
        return [
            (self.encoder(text.strip()), torch.tensor(label).long())
            for text, label in zip(self.data, self.labels)
        ]
      

VOCAB_SIZE = 256 + 3  # 256 alternatives in one byte, plus 3 special characters.

In [None]:
from torch.nn.utils.rnn import pad_sequence
def padded_collate(batch, padding_idx=0):
    """Collate variable-length samples into one padded batch.

    Args:
        batch: Sequence of samples whose first element is a 1-D id tensor and
            whose second element is a label tensor.
        padding_idx: Value used to right-pad the shorter sequences.
            NOTE(review): the default is 0, not CharByteEncoder's pad id (258),
            and the loaders in this notebook rely on the default — confirm
            that is intended.

    Returns:
        Tuple ``(xx, y)``: the padded ids with the batch dimension first, and
        the labels stacked into a single int64 tensor.
    """
    sequences = []
    labels = []
    for sample in batch:
        sequences.append(sample[0])
        labels.append(sample[1])

    padded = pad_sequence(sequences, batch_first=True, padding_value=padding_idx)
    return padded, torch.stack(labels).long()

In [None]:
# Forwarded to PrivacyEngine(secure_rng=...).
secure_rng = False
# Random generator handed to the training DataLoader and its batch sampler.
generator = None
# Fraction of the dataset used for training; the remainder is used for testing.
train_split = 0.8
# Run an evaluation every `test_every` epochs in the training loops (0 disables it).
test_every = 5
# Nominal training batch size; the test loader uses twice this value.
batch_size = 32

In [None]:
# Encode the whole frame once, then derive the split sizes from train_split.
ds = NewsClassification(df)
train_len = int(train_split * len(ds))
test_len = len(ds) - train_len  # remainder, so the two lengths always sum to len(ds)

In [None]:
print(f"{train_len} samples for training, {test_len} for testing")

8800 samples for training, 2200 for testing


In [None]:
# Random train/test partition of the combined dataset.
# NOTE(review): generator=None is hard-coded here instead of the `generator` config value, and
# no seed is set in the visible cells, so the split presumably differs between runs — confirm.
train_ds, test_ds = torch.utils.data.random_split(ds, [train_len, test_len], generator=None)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
# Ratio of the nominal batch size to the training-set size; passed both to the
# batch sampler below and to the PrivacyEngine.
sample_rate = batch_size / len(train_ds)

# Training batches are produced by Opacus' UniformWithReplacementSampler instead of a
# fixed batch_size.
# NOTE(review): presumably each sample is drawn independently with probability
# sample_rate, so batch sizes vary from step to step — confirm against the Opacus docs.
train_loader = DataLoader(
    train_ds,
    num_workers=1,
    pin_memory=True,
    generator=generator,
    batch_sampler=UniformWithReplacementSampler(
        num_samples=len(train_ds),
        sample_rate=sample_rate,
        generator=generator,
    ),
    collate_fn=padded_collate,
)

# Evaluation iterates the test split in order, in batches twice the training batch size.
test_loader = DataLoader(
    test_ds,
    batch_size=2 * batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
    collate_fn=padded_collate,
)

In [None]:
from opacus.layers import DPLSTM

class CharNNClassifier(nn.Module):
    """Byte-level sequence classifier: embedding -> DPLSTM -> linear output layer.

    Args:
        embedding_size: Width of the per-id embedding vectors.
        hidden_size: Hidden state width of the LSTM (per direction).
        output_size: Number of logits produced per sequence.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also runs over the sequence in reverse.
        vocab_size: Number of distinct input ids.
    """

    def __init__(
        self,
        embedding_size,
        hidden_size,
        output_size,
        num_lstm_layers=1,
        bidirectional=False,
        vocab_size=VOCAB_SIZE,
    ):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = DPLSTM(
            embedding_size,
            hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        # A bidirectional LSTM concatenates the forward and backward outputs, so
        # the feature width seen by the output layer doubles. Sizing the layer
        # with plain hidden_size would fail with a shape mismatch in that case.
        lstm_output_size = hidden_size * (2 if bidirectional else 1)
        self.out_layer = nn.Linear(lstm_output_size, output_size)

    def forward(self, x, hidden=None):
        """Return one row of logits per input sequence.

        Args:
            x: Integer id tensor; assumed to be (batch, seq_len) as produced by
                ``padded_collate`` — TODO confirm.
            hidden: Optional initial LSTM state, forwarded unchanged.
        """
        x = self.embedding(x)
        x, _ = self.lstm(x, hidden)
        # Classify from the output at the last time step.
        # NOTE(review): for right-padded batches this is a padding position for
        # every sequence shorter than the longest one — confirm that is intended.
        x = x[:, -1, :]
        x = self.out_layer(x)
        return x

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_size = 64
hidden_size = 128
n_lstm_layers = 1
bidirectional_lstm = False

# The classifier needs one logit per class. len(ds.labels) is the number of
# *samples*, which made the output layer thousands of units wide. The labels are
# used directly as CrossEntropyLoss targets, which must lie in [0, C - 1], so the
# smallest valid output size is the largest label value plus one.
num_classes = int(ds.labels.max()) + 1

model = CharNNClassifier(
    embedding_size,
    hidden_size,
    num_classes,
    n_lstm_layers,
    bidirectional_lstm,
).to(device)

In [None]:
epochs = 3  # number of passes in the training loops; also passed to PrivacyEngine(epochs=...)
learning_rate = 2.0  # learning rate of the SGD optimizer used for the private model
max_per_sample_grad_norm = 1.5  # passed to PrivacyEngine as max_grad_norm
delta = 8e-5  # passed to PrivacyEngine as target_delta
epsilon = 12.0  # passed to PrivacyEngine as target_epsilon

In [None]:
# Loss function shared by the private and the non-private training runs.
criterion = nn.CrossEntropyLoss()
# Plain SGD over the private model's parameters; the privacy engine is attached to it later.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
from opacus import PrivacyEngine

# Configure differential privacy for `model` using the legacy (pre-1.0) Opacus API.
# NOTE(review): presumably the engine derives its noise level from target_epsilon,
# target_delta, sample_rate and epochs — confirm against the Opacus documentation.
privacy_engine = PrivacyEngine(
    model,
    sample_rate=sample_rate,
    max_grad_norm=max_per_sample_grad_norm,
    target_delta=delta,
    target_epsilon=epsilon,
    epochs=epochs,
    secure_rng=secure_rng,
)
# Attaching exposes the engine as optimizer.privacy_engine, which train() reads.
privacy_engine.attach(optimizer)

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


In [None]:
from statistics import mean

def train(model, criterion, optimizer, train_loader, epoch, device="cuda:0"):
    """Run one training epoch and print its mean batch accuracy and loss.

    If the optimizer has a ``privacy_engine`` attribute, the privacy budget
    spent so far is appended to the printed summary.

    Args:
        model: Module mapping a batch of inputs to logits.
        criterion: Loss called as ``criterion(logits, y)``.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Iterable yielding ``(x, y)`` batches.
        epoch: Epoch number, used only in the printed summary.
        device: Device the batches are moved to.

    Returns:
        Always ``0``. The original returned a counter that was never
        incremented; the value is kept for backward compatibility.
    """
    accs = []
    losses = []
    for x, y in tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)

        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        preds = logits.argmax(-1)
        n_correct = float(preds.eq(y).sum())
        batch_accuracy = n_correct / len(y)

        accs.append(batch_accuracy)
        losses.append(float(loss))

    printstr = f"\t Epoch {epoch}. Accuracy: {mean(accs):.6f} | Loss: {mean(losses):.6f}"

    # Only optimizers with an attached privacy engine can report a budget. Check
    # for the attribute explicitly rather than wrapping the block in a bare
    # `except`, so real failures in the privacy accounting are not hidden.
    privacy_engine = getattr(optimizer, "privacy_engine", None)
    if privacy_engine is not None:
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" | (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"

    print(printstr)
    return 0


def test(model, test_loader, privacy_engine, device="cuda:0"):
    """Evaluate ``model`` on ``test_loader`` and print the test accuracy.

    Accuracy is the number of correct predictions divided by the number of
    samples seen. Averaging per-batch accuracies instead would over-weight a
    final batch that is smaller than the others.

    Args:
        model: Module mapping a batch of inputs to logits.
        test_loader: Iterable yielding ``(x, y)`` batches.
        privacy_engine: Optional engine; when truthy, the privacy budget spent
            so far is appended to the printed summary.
        device: Device the batches are moved to.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    n_correct = 0
    n_total = 0
    with torch.no_grad():
        for x, y in tqdm(test_loader):
            x = x.to(device)
            y = y.to(device)
            preds = model(x).argmax(-1)
            n_correct += int(preds.eq(y).sum())
            n_total += len(y)

    if n_total == 0:
        raise ValueError("test_loader yielded no samples; cannot compute accuracy")

    accuracy = n_correct / n_total
    printstr = "\n----------------------------\n" f"Test Accuracy: {accuracy:.6f}"
    if privacy_engine:
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"
    print(printstr + "\n----------------------------\n")
    return

## Training without privacy (non-DP baseline)

In [None]:
# The classifier needs one logit per class. len(ds.labels) is the number of
# *samples*, not classes. The labels are used directly as CrossEntropyLoss
# targets, which must lie in [0, C - 1], so the smallest valid output size is
# the largest label value plus one.
num_classes = int(ds.labels.max()) + 1

# Non-private baseline with the same architecture as the DP model.
model_nodp = CharNNClassifier(
    embedding_size,
    hidden_size,
    num_classes,
    n_lstm_layers,
    bidirectional_lstm,
).to(device)


# No privacy engine is attached to this optimizer.
optimizer_nodp = torch.optim.SGD(model_nodp.parameters(), lr=0.5)

In [None]:
# Non-private baseline: train for `epochs` epochs, evaluating every `test_every`
# epochs (a falsy test_every disables the periodic evaluation).
for epoch in tqdm(range(epochs)):
    train(model_nodp, criterion, optimizer_nodp, train_loader, epoch, device=device)
    if test_every and epoch % test_every == 0:
        test(model_nodp, test_loader, None, device=device)

# Final evaluation once training is done.
test(model_nodp, test_loader, None, device=device)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=275.0), HTML(value='')))


	 Epoch 0. Accuracy: 0.258356 | Loss: 1.674157


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))



----------------------------
Test Accuracy: 0.225595
----------------------------



HBox(children=(FloatProgress(value=0.0, max=275.0), HTML(value='')))


	 Epoch 1. Accuracy: 0.256947 | Loss: 1.436169


HBox(children=(FloatProgress(value=0.0, max=275.0), HTML(value='')))


	 Epoch 2. Accuracy: 0.250519 | Loss: 1.425256



HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))



----------------------------
Test Accuracy: 0.257589
----------------------------



## Training with privacy (the bug: this run currently fails with a RuntimeError)

In [None]:
# Differentially private run: same loop as the baseline, but with the model and
# optimizer that have the privacy engine attached.
for epoch in tqdm(range(epochs)):
    train(model, criterion, optimizer, train_loader, epoch, device=device)
    if test_every and epoch % test_every == 0:
        test(model, test_loader, privacy_engine, device=device)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=275.0), HTML(value='')))

RuntimeError: ignored