In [2]:
import torch
import torch.nn as nn
from torchtext.data.functional import generate_sp_model, load_sp_model, sentencepiece_tokenizer, sentencepiece_numericalizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import io
import math
from torch.utils.data import Dataset, DataLoader
import os
import re
import pandas as pd



In [4]:

# Data preprocessing
root = "data"
with open(os.path.join(root, "train.csv")) as f:
    with open(os.path.join(root, "data.txt"), "w") as f2:
        for line in f:
            text_only = "".join(line.split(",")[1:])
            filtered = re.sub(r'\\|\\n|;', ' ', text_only.replace('"', ' ').replace('\n', ' '))
            filtered = filtered.replace(' #39;', "'")
            filtered = filtered.replace(' #38;', "&")
            filtered = filtered.replace(' #36;', "$")
            filtered = filtered.replace(' #151;', "-")
            f2.write(filtered.lower() + "\n")

generate_sp_model(os.path.join(root, "data.txt"), vocab_size=20000, model_prefix='SentencePiece/transformer')

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=data/data.txt --model_prefix=SentencePiece/transformer --vocab_size=20000 --model_type=unigram
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/data.txt
  input_format: 
  model_prefix: SentencePiece/transformer
  model_type: UNIGRAM
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piec

In [16]:
# Custom Dataset
class AGNews(Dataset):
    def __init__(self, num_datapoints, set="train"):
        self.df = pd.read_csv(os.path.join(root, set + ".csv"), names=["Class", "Title", "Content"])
        self.df.fillna('', inplace=True)
        self.df['Article'] = self.df['Title'] + " : " + self.df['Content']
        self.df.drop(['Title', 'Content'], axis=1, inplace=True)
        self.df['Article'] = self.df['Article'].str.replace(r'\\n|\\|\\r|\\r\\n|\n|"', ' ', regex=True)
        self.df['Article'] = self.df['Article'].replace({' #39;': "'", ' #38;': "&", ' #36;': "$", ' #151;': "-"}, regex=True)
        if num_datapoints is not None:
            self.df = self.df.head(num_datapoints)

    def __getitem__(self, index):
        text = self.df.loc[index]["Article"].lower()
        class_index = int(self.df.loc[index]["Class"]) - 1
        return class_index, text

    def __len__(self):
        return len(self.df)

train_dataset = AGNews(num_datapoints=12000, set="train")
test_dataset = AGNews(num_datapoints=10000, set="test")

In [17]:
batch_size = 128

dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

In [20]:
def yield_tokens(file_path):
    with io.open(file_path, encoding='utf-8') as f:
        for line in f:
            yield [line.split("\t")[0]]

vocab = build_vocab_from_iterator(yield_tokens("SentencePiece/transformer.vocab"), specials=['<pad>', '<sos>', '<eos>', '<unk>'], special_first=True)
vocab.set_default_index(vocab['<unk>'])


In [26]:
text_transform = T.Sequential(
    T.SentencePieceTokenizer("SentencePiece/transformer.model"),
    T.VocabTransform(vocab=vocab),
    T.AddToken(vocab['<sos>'], begin=True),
    T.Truncate(max_seq_len=max_len),
    T.AddToken(vocab['<eos>'], begin=False),
    T.ToTensor(padding_value=vocab['<pad>'])
)

In [52]:
class TokenDrop(nn.Module):
    def __init__(self, prob=0.1, pad_token=0, num_special=4):
        super().__init__()
        self.prob = prob
        self.num_special = num_special
        self.pad_token = pad_token

    def __call__(self, sample):
        mask = torch.bernoulli(self.prob * torch.ones_like(sample)).long()
        can_drop = (sample >= self.num_special).long()
        mask = mask * can_drop
        replace_with = (self.pad_token * torch.ones_like(sample)).long()
        sample_out = (1 - mask) * sample + mask * replace_with
        return sample_out

In [50]:
class SinusoidalPosEmb(nn.Module):
    def __init__(self, hidden_size):
        super(SinusoidalPosEmb, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, x):
        device = x.device
        half_dim = self.hidden_size // 2
        emb_scale = math.log(10000) / (half_dim - 1)
        emb_frequencies = torch.exp(torch.arange(half_dim, device=device) * -emb_scale)
        emb = x[:, None] * emb_frequencies[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

In [32]:
class NanoTransformer(nn.Module):
    def __init__(self, num_emb, output_size, hidden_size=128, num_heads=4):
        super(NanoTransformer, self).__init__()
        self.embedding = nn.Embedding(num_emb, hidden_size)
        self.embedding.weight.data = 0.001 * self.embedding.weight.data
        self.pos_emb = SinusoidalPosEmb(hidden_size)
        self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads=num_heads, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                 nn.LayerNorm(hidden_size),
                                 nn.ELU(),
                                 nn.Linear(hidden_size, hidden_size))
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        batch_size, l = input_seq.shape
        input_embs = self.embedding(input_seq)
        seq_indx = torch.arange(l, device=input_seq.device)
        pos_emb = self.pos_emb(seq_indx).reshape(1, l, -1).expand(batch_size, l, -1)
        embs = input_embs + pos_emb
        output, attn_map = self.multihead_attn(embs, embs, embs)
        output = self.mlp(output)
        return self.fc_out(output), attn_map

In [56]:
device = torch.device(0 if torch.cuda.is_available() else 'cpu')

hidden_size = 256
learning_rate = 1e-4
nepochs = 20
batch_size = 128
max_len = 128
output_size = 4
num_heads = 4

tf_classifier = NanoTransformer(num_emb=len(vocab), output_size=4, hidden_size=hidden_size, num_heads=num_heads).to(device)
optimizer = optim.Adam(tf_classifier.parameters(), lr=learning_rate, weight_decay=1e-4)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=nepochs, eta_min=0)
loss_fn = nn.CrossEntropyLoss()
td = TokenDrop(prob=0.5)

training_loss_list = []
test_loss_list = []
training_acc_list = []
test_acc_list = []

num_model_params = sum(p.numel() for p in tf_classifier.parameters())
print(f"This Model Has {num_model_params} (Approximately {num_model_params // 1e6} Million) Parameters!")



This Model Has 5517060 (Approximately 5.0 Million) Parameters!


In [65]:
# Training Loop
for epoch in range(nepochs):
    train_acc = 0
    test_acc = 0
    tf_classifier.train()
    train_steps = 0
    for batch_idx, (labels, texts) in enumerate(dataloader_train):
        batch_size = labels.shape[0]
        batch_tensor = text_transform(list(texts)).to(device)
        labels = labels.to(device)
        batch_tensor = td(batch_tensor)
        pred, _ = tf_classifier(batch_tensor)
        loss = loss_fn(pred[:, 0, :], labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        training_loss_list.append(loss.item())
        train_acc += (pred[:, 0, :].argmax(1) == labels).sum().item()
        train_steps += batch_size

        if (batch_idx % 40 == 0):
            print("hello")
            
    train_acc /= train_steps
    training_acc_list.append(train_acc)
    lr_scheduler.step()
    tf_classifier.eval()
    test_steps = 0
    with torch.no_grad():
        for batch_idx, (labels, texts) in enumerate(dataloader_test):
            batch_size = labels.shape[0]
            batch_tensor = text_transform(list(texts)).to(device)
            labels = labels.to(device)
            pred, _ = tf_classifier(batch_tensor)
            loss = loss_fn(pred[:, 0, :], labels)
            test_loss_list.append(loss.item())
            test_acc += (pred[:, 0, :].argmax(1) == labels).sum().item()
            test_steps += batch_size
    test_acc /= test_steps
    test_acc_list.append(test_acc)
    
    # Print out the results for this epoch
    print(f'Epoch {epoch+1}/{nepochs}')
    print(f'Training Accuracy: {train_acc*100:.2f}%')
    print(f'Testing Accuracy: {test_acc*100:.2f}%')

hello
hello
hello
Epoch 1/20
Training Accuracy: 25.87%
Testing Accuracy: 24.38%
hello
hello
hello


KeyboardInterrupt: 

In [None]:
_ = plt.figure(figsize=(10, 5))
_ = plt.plot(np.linspace(0, nepochs, len(training_loss_list)), training_loss_list)
_ = plt.plot(np.linspace(0, nepochs, len(test_loss_list)), test_loss_list)
_ = plt.legend(["Train", "Test"])
_ = plt.title("Training Vs Test Loss")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("Loss")

_ = plt.figure(figsize=(10, 5))
_ = plt.plot(np.linspace(0, nepochs, len(training_acc_list)), training_acc_list)
_ = plt.plot(np.linspace(0, nepochs, len(test_acc_list)), test_acc_list)
_ = plt.legend(["Train", "Test"])
_ = plt.title("Training Vs Test Accuracy")
_ = plt.xlabel("Epochs")
_ = plt.ylabel("Accuracy")
print("Max Test Accuracy %.2f%%" % (np.max(test_acc_list) * 100))