<a href="https://colab.research.google.com/github/harshitadd/DP-NLP/blob/main/LSTMNewsClassificationDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install opacus[dev] --quiet
!pip install transformers --quiet

In [2]:
import zipfile
import urllib.request
import os
import torch
import numpy as np
import torch.nn as nn
from tqdm.notebook import tqdm
import pandas as pd
from torch.utils.data import Dataset
from pathlib import Path
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler
from opacus import PrivacyEngine
from torch.utils.data import TensorDataset
from transformers.data.processors.utils import InputExample
from transformers.data.processors.glue import glue_convert_examples_to_features

In [3]:
DATA_DIR = "/content/"

In [7]:
def download_and_extract(data_dir):
    print("Extracting Train zip...")
    filename = "train.csv.zip"
    with zipfile.ZipFile(filename) as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(filename)
    print("Completed!")

    print("Extracting Test zip...")
    filename = "test.csv.zip"
    with zipfile.ZipFile(filename) as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(filename)
    print("Completed!")

download_and_extract(DATA_DIR)

Extracting Train zip...
Completed!
Extracting Test zip...
Completed!


In [4]:
train_path =  '/content/train.csv'
dev_path = '/content/test.csv'

df_train = pd.read_csv(train_path)[:2000]
df_test = pd.read_csv(dev_path)[:400]
df_train = df_train.drop('Title', axis = 1)
df_test = df_test.drop('Title', axis = 1)

In [5]:
df = pd.concat([df_train, df_test])

In [7]:
class CharByteEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.start_token = "<s>"
        self.end_token = "</s>"
        self.pad_token = "<pad>"
        self.start_idx = 256
        self.end_idx = 257
        self.pad_idx = 258

    def forward(self, s: str, pad_to=0) -> torch.LongTensor:

        encoded = s.encode()
        n_pad = pad_to - len(encoded) if pad_to > len(encoded) else 0
        return torch.LongTensor(
            [self.start_idx]
            + [c for c in encoded]  # noqa
            + [self.end_idx]
            + [self.pad_idx for _ in range(n_pad)]
        )

    def decode(self, char_ids_tensor: torch.LongTensor) -> str:
        char_ids = char_ids_tensor.cpu().detach().tolist()

        out = []
        buf = []
        for c in char_ids:
            if c < 256:
                buf.append(c)
            else:
                if buf:
                    out.append(bytes(buf).decode())
                    buf = []
                if c == self.start_idx:
                    out.append(self.start_token)
                elif c == self.end_idx:
                    out.append(self.end_token)
                elif c == self.pad_idx:
                    out.append(self.pad_token)

        if buf:  # in case some are left
            out.append(bytes(buf).decode())
        return "".join(out)

    def __len__(self):
        return 259

In [8]:
class NewsClassification(Dataset):
    def __init__(self , df): # df with the news description and label 
        self.labels = df['Class Index']
        self.data = df['Description']
        self.encoder = CharByteEncoder()
        self.processed = self.process_samples()

    def __getitem__(self, i):
      return self.processed[i]
    
    def process_samples(self):
      processed = []
      for d, l in zip(self.data, self.labels):
        processed.append((self.encoder(d.strip()), torch.tensor(l).long()))
      return processed

    def __len__(self):
        return len(self.data)
      

VOCAB_SIZE = 256 + 3  # 256 alternatives in one byte, plus 3 special characters.

In [108]:
from torch.nn.utils.rnn import pad_sequence

def padded_collate(batch, padding_idx=0):

    # (xx, yy) = zip(*batch)
    # x_lens = [len(x) for x in xx]
    # y_lens = [len(y) for y in yy]

    # xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    # yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)

    # return xx_pad, yy_pad, x_lens, y_lens
    x = pad_sequence([elem[0] for elem in batch], batch_first=True, padding_value=padding_idx)
    y = torch.stack([elem[1] for elem in batch]).long()
    return x, y

In [109]:
secure_rng = False
generator = None 
train_split = 0.8
test_every = 5
batch_size = 64

In [110]:
ds = NewsClassification(df)
train_len = int(train_split * len(ds))
test_len = len(ds) - train_len

In [111]:
print(f"{train_len} samples for training, {test_len} for testing")

1920 samples for training, 480 for testing


In [112]:
train_ds, test_ds = torch.utils.data.random_split(ds, [train_len, test_len], generator=None)

In [113]:
from torch.nn.utils.rnn import pack_padded_sequence
sample_rate = batch_size / len(train_ds)

train_loader = DataLoader(
    train_ds,
    num_workers=1,
    pin_memory=True,
    generator=generator,
    batch_sampler=UniformWithReplacementSampler(
        num_samples=len(train_ds),
        sample_rate=sample_rate,
        generator=generator,
    ),
    collate_fn=padded_collate,
)

test_loader = DataLoader(
    test_ds,
    batch_size=2 * batch_size,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
    collate_fn=padded_collate,
)

In [114]:
from opacus.layers import DPLSTM

class CharNNClassifier(nn.Module):
    def __init__(
        self,
        embedding_size,
        hidden_size,
        output_size,
        num_lstm_layers=1,
        bidirectional=False,
        vocab_size=VOCAB_SIZE,
    ):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = DPLSTM(
            embedding_size,
            hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.out_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)  # -> [B, T, D]
        x, _ = self.lstm(x, hidden)  # -> [B, T, H]
        x = x[:, -1, :]  # -> [B, H]
        x = self.out_layer(x)  # -> [B, C]
        return x

In [115]:
# Set the device to run on a GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define classifier parameters
embedding_size = 64
hidden_size = 128  # Number of neurons in hidden layer after LSTM
n_lstm_layers = 1
bidirectional_lstm = False

model = CharNNClassifier(
    embedding_size,
    hidden_size,
    len(ds.labels),
    n_lstm_layers,
    bidirectional_lstm,
).to(device)

In [116]:
# Training hyper-parameters
epochs = 20
learning_rate = 2.0

# Privacy engine hyper-parameters
max_per_sample_grad_norm = 1.5
delta = 8e-5
epsilon = 12.0

In [117]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [118]:
from opacus import PrivacyEngine

privacy_engine = PrivacyEngine(
    model,
    sample_rate=sample_rate,
    max_grad_norm=max_per_sample_grad_norm,
    target_delta=delta,
    target_epsilon=epsilon,
    epochs=epochs,
    secure_rng=secure_rng,
)
privacy_engine.attach(optimizer)

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


In [119]:
from statistics import mean

def train(model, criterion, optimizer, train_loader, epoch, device="cuda:0"):
    accs = []
    losses = []
    counter = 0 
    for x, y in tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)


        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        preds = logits.argmax(-1)
        n_correct = float(preds.eq(y).sum())
        batch_accuracy = n_correct / len(y)

        accs.append(batch_accuracy)
        losses.append(float(loss))

        print('Tensor Shape', (np.shape(x)[0], np.shape(y)[0]))
        # counter += 1 
        # printstr = (f"\t Epoch {epoch}. Batch Failed")
    
    printstr = (f"\t Epoch {epoch}. Accuracy: {mean(accs):.6f} | Loss: {mean(losses):.6f}")
        
    try:
        privacy_engine = optimizer.privacy_engine
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" | (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"
    except :
        pass

    print(printstr)
    return counter 


def test(model, test_loader, privacy_engine, device="cuda:0"):
    accs = []
    with torch.no_grad():
        for x, y in tqdm(test_loader):
            x = x.to(device)
            y = y.to(device)
            preds = model(x).argmax(-1)
            n_correct = float(preds.eq(y).sum())
            batch_accuracy = n_correct / len(y)

            accs.append(batch_accuracy)
    printstr = "\n----------------------------\n" f"Test Accuracy: {mean(accs):.6f}"
    if privacy_engine:
        epsilon, best_alpha = privacy_engine.get_privacy_spent()
        printstr += f" (ε = {epsilon:.2f}, δ = {privacy_engine.target_delta}) for α = {best_alpha}"
    print(printstr + "\n----------------------------\n")
    return

In [120]:
from tqdm import tqdm

print("Train stats: \n")
for epoch in tqdm(range(epochs)):
    train(model, criterion, optimizer, train_loader, epoch, device=device)
    if test_every:
        if epoch % test_every == 0:  test(model, test_loader, privacy_engine, device=device)







  0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|          | 0/30 [00:00<?, ?it/s][A[A[A[A[A[A[A

Train stats: 










  3%|▎         | 1/30 [00:01<00:39,  1.35s/it][A[A[A[A[A[A[A

Tensor Shape (65, 65)









  7%|▋         | 2/30 [00:02<00:38,  1.38s/it][A[A[A[A[A[A[A

Tensor Shape (59, 59)









 10%|█         | 3/30 [00:04<00:40,  1.49s/it][A[A[A[A[A[A[A

Tensor Shape (70, 70)









 13%|█▎        | 4/30 [00:06<00:40,  1.56s/it][A[A[A[A[A[A[A

Tensor Shape (87, 87)









 17%|█▋        | 5/30 [00:07<00:33,  1.34s/it][A[A[A[A[A[A[A

Tensor Shape (63, 63)









 20%|██        | 6/30 [00:08<00:33,  1.39s/it][A[A[A[A[A[A[A

Tensor Shape (62, 62)









 23%|██▎       | 7/30 [00:09<00:31,  1.36s/it][A[A[A[A[A[A[A

Tensor Shape (63, 63)









 27%|██▋       | 8/30 [00:10<00:25,  1.18s/it][A[A[A[A[A[A[A

Tensor Shape (63, 63)









 30%|███       | 9/30 [00:11<00:21,  1.03s/it][A[A[A[A[A[A[A

Tensor Shape (41, 41)









 33%|███▎      | 10/30 [00:13<00:24,  1.24s/it][A[A[A[A[A[A[A

Tensor Shape (76, 76)









 37%|███▋      | 11/30 [00:14<00:25,  1.32s/it][A[A[A[A[A[A[A

Tensor Shape (78, 78)









 40%|████      | 12/30 [00:16<00:26,  1.46s/it][A[A[A[A[A[A[A

Tensor Shape (64, 64)









 43%|████▎     | 13/30 [00:17<00:23,  1.40s/it][A[A[A[A[A[A[A

Tensor Shape (64, 64)









 47%|████▋     | 14/30 [00:18<00:21,  1.37s/it][A[A[A[A[A[A[A

Tensor Shape (55, 55)









 50%|█████     | 15/30 [00:20<00:21,  1.45s/it][A[A[A[A[A[A[A

Tensor Shape (66, 66)









 53%|█████▎    | 16/30 [00:21<00:19,  1.43s/it][A[A[A[A[A[A[A

Tensor Shape (61, 61)









 57%|█████▋    | 17/30 [00:23<00:17,  1.36s/it][A[A[A[A[A[A[A

Tensor Shape (48, 48)









 60%|██████    | 18/30 [00:23<00:14,  1.19s/it][A[A[A[A[A[A[A

Tensor Shape (58, 58)









 63%|██████▎   | 19/30 [00:25<00:14,  1.28s/it][A[A[A[A[A[A[A

Tensor Shape (78, 78)









 67%|██████▋   | 20/30 [00:26<00:13,  1.32s/it][A[A[A[A[A[A[A

Tensor Shape (66, 66)









 70%|███████   | 21/30 [00:27<00:10,  1.20s/it][A[A[A[A[A[A[A

Tensor Shape (57, 57)









 73%|███████▎  | 22/30 [00:29<00:10,  1.29s/it][A[A[A[A[A[A[A

Tensor Shape (77, 77)









 77%|███████▋  | 23/30 [00:30<00:08,  1.16s/it][A[A[A[A[A[A[A

Tensor Shape (50, 50)









 80%|████████  | 24/30 [00:30<00:06,  1.07s/it][A[A[A[A[A[A[A

Tensor Shape (68, 68)









 83%|████████▎ | 25/30 [00:31<00:05,  1.01s/it][A[A[A[A[A[A[A

Tensor Shape (71, 71)









 87%|████████▋ | 26/30 [00:33<00:04,  1.18s/it][A[A[A[A[A[A[A

Tensor Shape (67, 67)









 90%|█████████ | 27/30 [00:34<00:03,  1.23s/it][A[A[A[A[A[A[A

Tensor Shape (59, 59)









 93%|█████████▎| 28/30 [00:35<00:02,  1.09s/it][A[A[A[A[A[A[A

Tensor Shape (69, 69)









 97%|█████████▋| 29/30 [00:37<00:01,  1.25s/it][A[A[A[A[A[A[A

Tensor Shape (68, 68)









100%|██████████| 30/30 [00:38<00:00,  1.29s/it]

Tensor Shape (65, 65)










  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A[A[A[A

	 Epoch 0. Accuracy: 0.310868 | Loss: 2.615948 | (ε = 4.54, δ = 8e-05) for α = 3.6









 25%|██▌       | 1/4 [00:00<00:00,  3.86it/s][A[A[A[A[A[A[A






 50%|█████     | 2/4 [00:00<00:00,  3.73it/s][A[A[A[A[A[A[A






 75%|███████▌  | 3/4 [00:00<00:00,  3.66it/s][A[A[A[A[A[A[A






100%|██████████| 4/4 [00:01<00:00,  3.82it/s]






  5%|▌         | 1/20 [00:40<12:42, 40.14s/it][A[A[A[A[A[A






  0%|          | 0/30 [00:00<?, ?it/s][A[A[A[A[A[A[A


----------------------------
Test Accuracy: 0.263021 (ε = 4.54, δ = 8e-05) for α = 3.6
----------------------------



  0%|          | 0/30 [00:00<?, ?it/s]


RuntimeError: ignored

In [None]:
for idx, tup in enumerate(train_loader):
  print(tup)
  

In [None]:
test(model, test_loader, privacy_engine, device=device)

In [91]:
print(counter)

15


In [None]:
model_nodp = CharNNClassifier(
    embedding_size,
    hidden_size,
    len(ds.labels),
    n_lstm_layers,
    bidirectional_lstm,
).to(device)


optimizer_nodp = torch.optim.SGD(model_nodp.parameters(), lr=0.5)

In [None]:
for epoch in tqdm(range(epochs)):
    train(model_nodp, criterion, optimizer_nodp, train_loader, epoch, device=device)
    if test_every:
        if epoch % test_every == 0:
            test(model_nodp, test_loader, None, device=device)

test(model_nodp, test_loader, None, device=device)