In [46]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

In [47]:
DEVICE = "cpu"

In [48]:
csv_file = 'dataset.csv'

# read only 1000 reviews for speed
df = pd.read_csv(csv_file, nrows=1_000, index_col=0)

df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,6119,13074,19176,15352,15816,6625,8335,479,1636,6347,...,13620,13620,13620,13620,13620,13620,13620,13620,1,0
1,6119,13869,10751,18110,8296,19019,17036,17036,6119,15352,...,13620,13620,13620,13620,13620,13620,13620,13620,1,0
2,6119,14532,16234,1933,5839,13869,10751,2378,17997,6255,...,13620,13620,13620,13620,13620,13620,13620,13620,1,0
3,6119,13457,11703,18341,13869,14317,17787,13869,18110,7915,...,13620,13620,13620,13620,13620,13620,13620,13620,0,1
4,6119,11973,14650,18341,9152,18717,17360,15352,10268,19176,...,13620,13620,13620,13620,13620,13620,13620,13620,1,0


In [49]:
vocabulary_size = df.max().max() + 1
vocabulary_size

19504

In [50]:
positive_mask = df.y0 == 1
negative_mask = df.y0 == 0

positive_reviews = df[positive_mask]
negative_reviews = df[negative_mask]

## lets do 80:20 instead of 40:10
train_count = 80 // 2
test_count = 20 // 2

train = pd.concat([positive_reviews.iloc[:train_count,:], negative_reviews.iloc[:train_count,:]])
test = pd.concat([positive_reviews.iloc[train_count:train_count + test_count,:], negative_reviews.iloc[train_count:train_count + test_count,:]])
train.shape

(80, 1524)

In [51]:
test.shape

(20, 1524)

In [52]:
train_x, train_y = train.iloc[:,:-2], train.iloc[:,-2:]
test_x, test_y = test.iloc[:,:-2], test.iloc[:,-2:]
test_y.shape

(20, 2)

In [53]:
class TransformerModule(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model

        # apparently nn.Embedding should take vocabulary size as first argument
        # if we pass a number greater than the first argument we get an IndexError
        # in our dataset, the number of words greatly exceeds the length of the max sentence
        # so max length is unfit to be first argument here
        self.embedding = nn.Embedding(vocabulary_size, d_model)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        global DEVICE
        self.device = 'cpu'
        if 'DEVICE' in globals():
            self.device = DEVICE

        self.V = nn.Parameter(torch.rand(max_length, d_model).to(self.device))

    def gen_pe(self, n):
        pe = np.zeros((self.max_length, self.d_model), dtype=np.float32)
        for k in np.arange(self.max_length):
            for i in np.arange(self.d_model):
                theta = k / (n ** (i/self.d_model))
                if i % 2 == 0:
                    pe[k,i] = math.sin(theta)
                else:
                    pe[k,i] = math.cos(theta)
        return pe
    
    def forward(self, x: torch.FloatTensor):
        # x is a lookup tensor
        initial_embeddings = self.relu(self.embedding(x))
        positional_embeddings = torch.tensor(self.gen_pe(1000)).to(self.device)
        
        input_embeddings = torch.add(initial_embeddings, positional_embeddings)

        _ = torch.matmul(input_embeddings, torch.transpose(input_embeddings, 0 , 1))
        _ = torch.divide(_, math.sqrt(self.d_model))
        _ = self.softmax(_)

        return self.relu(torch.matmul(_,self.V))

In [70]:
class ClassificationHead(nn.Module):
    def __init__(self, embedding_length, head_length):
        super().__init__()
        self.embedding_length = embedding_length
        self.head_length = head_length

        self.output_layer = nn.Linear(self.embedding_length, self.head_length)
    
    def forward(self, x):
        y = self.output_layer(x)

        # no more sigmoid; we will use BCEWithLogitsLoss which uses a starting sigmoid layer and is more numerically stable
        return y

In [71]:
class Classifier(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model, head_length):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.max_length = max_length
        self.d_model = d_model
        self.head_length = head_length
        self.expected_unravel_length = max_length * d_model
        
        self.body = TransformerModule(self.vocabulary_size, self.max_length, self.d_model)
        self.head = ClassificationHead(self.expected_unravel_length, self.head_length)

    def forward(self, x):
        x = self.body(x)
        x = x.view(-1)
        x = self.head(x)
        # softmax removed; inherently applied by CE, and not applied by BCE
        return x

In [79]:
class MyCustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.Tensor(x)
        self.y = torch.Tensor(y)
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

custom_dataset = MyCustomDataset(train_x.values, train_y.values)

loader = DataLoader(
    custom_dataset,
    batch_size = 5,
    shuffle = True
)

In [80]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    loop = tqdm(loader)

    average_loss = 0
    count = 0

    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device).long()
        targets = targets.to(device=device)
        # print(data.shape)
        # print(targets.shape)

        for inst_idx in range(data.shape[0]):
            # Forward
            predictions = model.forward(data[inst_idx])
            loss = loss_fn(predictions, targets[inst_idx])
            # Backward
            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            # Update tqdm
            loop.set_postfix(loss=loss.item())

            average_loss += loss.item()
            count += 1

    average_loss = average_loss / count

    return average_loss

In [81]:
NUM_EPOCHS = 15
EMBEDDING_LENGTH = 100
model = Classifier(vocabulary_size, train_x.shape[1], EMBEDDING_LENGTH, train_y.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

for i in range(NUM_EPOCHS):
    ave_loss = train_fn(loader, model, optimizer, criterion, device=DEVICE)
    print(f'Epoch {i+1}: {ave_loss}')

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:21<00:00,  1.32s/it, loss=3.53]   


Epoch 1: 21.07241003876461


100%|██████████| 16/16 [00:22<00:00,  1.39s/it, loss=14.8]   


Epoch 2: 12.100481945992577


100%|██████████| 16/16 [00:20<00:00,  1.30s/it, loss=35.9]    


Epoch 3: 7.222106258306807


100%|██████████| 16/16 [00:20<00:00,  1.27s/it, loss=9.16]   


Epoch 4: 11.27949922601083


100%|██████████| 16/16 [00:20<00:00,  1.27s/it, loss=0]      


Epoch 5: 5.424119432189877


100%|██████████| 16/16 [00:20<00:00,  1.26s/it, loss=0]       


Epoch 6: 4.821735120186232


100%|██████████| 16/16 [00:20<00:00,  1.26s/it, loss=0.222]   


Epoch 7: 3.887200743908042


100%|██████████| 16/16 [00:20<00:00,  1.28s/it, loss=3.29]    


Epoch 8: 3.9672189953813275


100%|██████████| 16/16 [00:20<00:00,  1.27s/it, loss=3.08]    


Epoch 9: 3.3134499155608497


100%|██████████| 16/16 [00:20<00:00,  1.27s/it, loss=1.13e-5] 


Epoch 10: 1.8181079905977786


100%|██████████| 16/16 [00:20<00:00,  1.30s/it, loss=5.96e-8] 


Epoch 11: 1.4996442721872285


100%|██████████| 16/16 [00:21<00:00,  1.31s/it, loss=0.000801]


Epoch 12: 1.777118430981775


100%|██████████| 16/16 [00:20<00:00,  1.29s/it, loss=0]       


Epoch 13: 0.26687145277465235


100%|██████████| 16/16 [00:21<00:00,  1.34s/it, loss=0]       


Epoch 14: 0.45976361441714636


100%|██████████| 16/16 [00:20<00:00,  1.26s/it, loss=1.18]    

Epoch 15: 0.8203867801319445





In [84]:
test_loader = DataLoader(
    MyCustomDataset(test_x.values, test_y.values),
    batch_size = 5,
    shuffle = True
)

y_pred = []
y_true = []

model.eval()

for batch_idx, (data, targets) in enumerate(test_loader):
    data = data.to(DEVICE).long()
    targets = targets.to(DEVICE)

    for inst_idx in range(data.shape[0]):
        with torch.no_grad():
            pred = model(data[inst_idx])

        # print(pred)
        # print(targets[inst_idx])
        # print(nn.BCEWithLogitsLoss()(pred, targets[inst_idx]))
        pred = pred.cpu().numpy()
        pred = np.argmax(pred)
        y_pred.append(pred)

        truth = np.argmax(targets[inst_idx].cpu().numpy())
        y_true.append(truth)

tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
print(f'True positives: {tp}')
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print()
print(f'Accuracy: {(tn + tp) / (tn + tp + fn + fp)}')

True positives: 2
True negatives: 6
False positives: 4
False negatives: 8

Accuracy: 0.4


In [83]:
print(y_pred)
print(y_true)

[1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]
