In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import confusion_matrix

In [2]:
DEVICE = "cpu"

In [3]:
csv_file = 'dataset.csv'

# read only 1000 reviews for speed
df = pd.read_csv(csv_file, nrows=1_000, index_col=0)

df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,16873,4461,1358,16459,17069,15270,17018,17850,2645,11639,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
1,16873,6836,13769,14426,10141,17968,10774,10774,16873,16459,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
2,16873,4575,16785,14295,1042,6836,13769,1191,6913,8524,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0
3,16873,5934,11631,19305,6836,2485,7785,6836,14426,6151,...,6951,6951,6951,6951,6951,6951,6951,6951,0,1
4,16873,17219,10203,19305,6372,66,4279,16459,19112,1358,...,6951,6951,6951,6951,6951,6951,6951,6951,1,0


In [4]:
vocabulary_size = df.max().max() + 1
vocabulary_size

19504

In [5]:
positive_mask = df.y0 == 1
negative_mask = df.y0 == 0

positive_reviews = df[positive_mask]
negative_reviews = df[negative_mask]

## lets do 80:20 instead of 40:10
train_count = 80 // 2
test_count = 20 // 2

train = pd.concat([positive_reviews.iloc[:train_count,:], negative_reviews.iloc[:train_count,:]])
test = pd.concat([positive_reviews.iloc[train_count:train_count + test_count,:], negative_reviews.iloc[train_count:train_count + test_count,:]])
train.shape

(80, 1524)

In [6]:
test.shape

(20, 1524)

In [7]:
train_x, train_y = train.iloc[:,:-2], train.iloc[:,-2:]
test_x, test_y = test.iloc[:,:-2], test.iloc[:,-2:]
test_y.shape

(20, 2)

In [8]:
class TransformerModule(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model

        # apparently nn.Embedding should take vocabulary size as first argument
        # if we pass a number greater than the first argument we get an IndexError
        # in our dataset, the number of words greatly exceeds the length of the max sentence
        # so max length is unfit to be first argument here
        self.embedding = nn.Embedding(vocabulary_size, d_model)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        global DEVICE
        self.device = 'cpu'
        if 'DEVICE' in globals():
            self.device = DEVICE

        self.V = nn.Parameter(torch.rand(max_length, d_model).to(self.device))

    def gen_pe(self, n):
        pe = np.zeros((self.max_length, self.d_model), dtype=np.float32)
        for k in np.arange(self.max_length):
            for i in np.arange(self.d_model):
                theta = k / (n ** (i/self.d_model))
                if i % 2 == 0:
                    pe[k,i] = math.sin(theta)
                else:
                    pe[k,i] = math.cos(theta)
        return pe
    
    def forward(self, x: torch.FloatTensor):
        # x is a lookup tensor
        initial_embeddings = self.relu(self.embedding(x))
        positional_embeddings = torch.tensor(self.gen_pe(1000)).to(self.device)
        
        input_embeddings = torch.add(initial_embeddings, positional_embeddings)

        _ = torch.matmul(input_embeddings, torch.transpose(input_embeddings, 0 , 1))
        _ = torch.divide(_, math.sqrt(self.d_model))
        _ = self.softmax(_)

        return self.relu(torch.matmul(_,self.V))

In [9]:
class ClassificationHead(nn.Module):
    def __init__(self, embedding_length, head_length):
        super().__init__()
        self.embedding_length = embedding_length
        self.head_length = head_length

        self.output_layer = nn.Linear(self.embedding_length, self.head_length)
    
    def forward(self, x):
        y = self.output_layer(x)

        # no more sigmoid; we will use BCEWithLogitsLoss which uses a starting sigmoid layer and is more numerically stable
        return y

In [10]:
class Classifier(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model, head_length):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.max_length = max_length
        self.d_model = d_model
        self.head_length = head_length
        self.expected_unravel_length = max_length * d_model
        
        self.body = TransformerModule(self.vocabulary_size, self.max_length, self.d_model)
        self.head = ClassificationHead(self.expected_unravel_length, self.head_length)

    def forward(self, x):
        x = self.body(x)
        x = x.view(-1)
        x = self.head(x)
        # softmax removed; inherently applied by CE, and not applied by BCE
        return x

In [11]:
class MyCustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.Tensor(x)
        self.y = torch.Tensor(y)
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

custom_dataset = MyCustomDataset(train_x.values, train_y.values)

loader = DataLoader(
    custom_dataset,
    batch_size = 5,
    shuffle = True
)

In [12]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    loop = tqdm(loader)

    average_loss = 0
    count = 0

    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device).long()
        targets = targets.to(device=device)
        # print(data.shape)
        # print(targets.shape)

        for inst_idx in range(data.shape[0]):
            # Forward
            predictions = model.forward(data[inst_idx])
            loss = loss_fn(predictions, targets[inst_idx])
            # Backward
            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            # Update tqdm
            loop.set_postfix(loss=loss.item())

            average_loss += loss.item()
            count += 1

    average_loss = average_loss / count

    return average_loss

In [13]:
NUM_EPOCHS = 100
EMBEDDING_LENGTH = 100
model = Classifier(vocabulary_size, train_x.shape[1], EMBEDDING_LENGTH, train_y.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

for i in range(NUM_EPOCHS):
    ave_loss = train_fn(loader, model, optimizer, criterion, device=DEVICE)
    print(f'Epoch {i+1}: {ave_loss}')

100%|██████████| 16/16 [00:12<00:00,  1.32it/s, loss=0.0152] 


Epoch 1: 15.76221891522664


100%|██████████| 16/16 [00:07<00:00,  2.17it/s, loss=0]      


Epoch 2: 11.433651211790664


100%|██████████| 16/16 [00:08<00:00,  1.97it/s, loss=21.4]    


Epoch 3: 7.610777846023479


100%|██████████| 16/16 [00:07<00:00,  2.11it/s, loss=33.4]   


Epoch 4: 8.526924401578336


100%|██████████| 16/16 [00:07<00:00,  2.11it/s, loss=3.1]     


Epoch 5: 5.843084724024884


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=30.8]   


Epoch 6: 7.479396602158654


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=24]     


Epoch 7: 5.32001615417961


100%|██████████| 16/16 [00:07<00:00,  2.14it/s, loss=1.19e-7] 


Epoch 8: 2.762518312776533


100%|██████████| 16/16 [00:07<00:00,  2.11it/s, loss=4.19]   


Epoch 9: 1.2727260898626382


100%|██████████| 16/16 [00:07<00:00,  2.04it/s, loss=29.3]    


Epoch 10: 0.9734423692100392


100%|██████████| 16/16 [00:07<00:00,  2.04it/s, loss=0.000519]


Epoch 11: 2.439997638114928


100%|██████████| 16/16 [00:07<00:00,  2.07it/s, loss=1.22]    


Epoch 12: 0.5129541198632506


100%|██████████| 16/16 [00:07<00:00,  2.13it/s, loss=0.603]   


Epoch 13: 0.4664283319188261


100%|██████████| 16/16 [00:07<00:00,  2.00it/s, loss=0.00954] 


Epoch 14: 0.688538336862343


100%|██████████| 16/16 [00:07<00:00,  2.17it/s, loss=2.25e-5] 


Epoch 15: 0.30419731925681825


100%|██████████| 16/16 [00:07<00:00,  2.13it/s, loss=0.000454]


Epoch 16: 0.06217310401452321


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=0.000169]


Epoch 17: 0.006411477454173697


100%|██████████| 16/16 [00:07<00:00,  2.12it/s, loss=8.7e-6]  


Epoch 18: 0.012661197854044293


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=0.000112]


Epoch 19: 0.0025193303443585614


100%|██████████| 16/16 [00:07<00:00,  2.07it/s, loss=0.000551]


Epoch 20: 0.002104015391718184


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=4.35e-6] 


Epoch 21: 0.0017124292793190055


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=0.000689]


Epoch 22: 0.0015439327783657753


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=8.94e-7] 


Epoch 23: 0.0013508947303058072


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=3.96e-5] 


Epoch 24: 0.0014748759848079106


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=0.00227] 


Epoch 25: 0.0013843523029795969


100%|██████████| 16/16 [00:07<00:00,  2.06it/s, loss=0.00207] 


Epoch 26: 0.001105033254689225


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=3.56e-5] 


Epoch 27: 0.0017795526678621165


100%|██████████| 16/16 [00:07<00:00,  2.05it/s, loss=5.96e-8] 


Epoch 28: 0.001043025556116728


100%|██████████| 16/16 [00:09<00:00,  1.73it/s, loss=0.000333]


Epoch 29: 0.0011511756672253348


100%|██████████| 16/16 [00:07<00:00,  2.01it/s, loss=8.67e-5] 


Epoch 30: 0.0009538605158382563


100%|██████████| 16/16 [00:07<00:00,  2.07it/s, loss=0.00043] 


Epoch 31: 0.0008977984670095474


100%|██████████| 16/16 [00:07<00:00,  2.06it/s, loss=2.88e-5] 


Epoch 32: 0.0008672983454423467


100%|██████████| 16/16 [00:08<00:00,  1.94it/s, loss=2.35e-5] 


Epoch 33: 0.0008287449242977196


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=0.00323] 


Epoch 34: 0.0006670853241948383


100%|██████████| 16/16 [00:07<00:00,  2.04it/s, loss=0.000178]


Epoch 35: 0.0008211313092462369


100%|██████████| 16/16 [00:08<00:00,  1.95it/s, loss=0.000495]


Epoch 36: 0.0010157208420423559


100%|██████████| 16/16 [00:08<00:00,  1.82it/s, loss=5.96e-8] 


Epoch 37: 0.0009777171755519288


100%|██████████| 16/16 [00:08<00:00,  1.93it/s, loss=0.000165]


Epoch 38: 0.0006759520546174791


100%|██████████| 16/16 [00:08<00:00,  1.99it/s, loss=3.46e-5] 


Epoch 39: 0.0006905232138991746


100%|██████████| 16/16 [00:08<00:00,  1.99it/s, loss=0.000137]


Epoch 40: 0.0007887605619202098


100%|██████████| 16/16 [00:07<00:00,  2.14it/s, loss=0.00111] 


Epoch 41: 0.0005448418726294868


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=8.59e-5] 


Epoch 42: 0.0005377363099736243


100%|██████████| 16/16 [00:07<00:00,  2.00it/s, loss=0]       


Epoch 43: 0.0006732875835124385


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=3.19e-5] 


Epoch 44: 0.0005617791881533218


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=0.000295]


Epoch 45: 0.0005093613363086113


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=5.19e-5] 


Epoch 46: 0.00044422175336751303


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=0.000202]


Epoch 47: 0.00044942321710044196


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=0]       


Epoch 48: 0.0004599723644435283


100%|██████████| 16/16 [00:07<00:00,  2.17it/s, loss=0.000433]


Epoch 49: 0.0005494904825389835


100%|██████████| 16/16 [00:07<00:00,  2.17it/s, loss=1.26e-5] 


Epoch 50: 0.000548509921579976


100%|██████████| 16/16 [00:07<00:00,  2.10it/s, loss=0.00251] 


Epoch 51: 0.0004077265634342631


100%|██████████| 16/16 [00:07<00:00,  2.15it/s, loss=7.45e-6] 


Epoch 52: 0.0003926825443148374


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=2.59e-5] 


Epoch 53: 0.00037528798914183704


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=0.00273] 


Epoch 54: 0.0003601851430996117


100%|██████████| 16/16 [00:07<00:00,  2.17it/s, loss=2.92e-6] 


Epoch 55: 0.0003702554513940637


100%|██████████| 16/16 [00:07<00:00,  2.12it/s, loss=6.56e-7] 


Epoch 56: 0.00035629281059277316


100%|██████████| 16/16 [00:07<00:00,  2.13it/s, loss=3.46e-5] 


Epoch 57: 0.00038855962491157747


100%|██████████| 16/16 [00:07<00:00,  2.13it/s, loss=8.76e-6] 


Epoch 58: 0.00033470504477342013


100%|██████████| 16/16 [00:07<00:00,  2.10it/s, loss=0.000106]


Epoch 59: 0.0002937368828106468


100%|██████████| 16/16 [00:07<00:00,  2.12it/s, loss=0]       


Epoch 60: 0.00031883161524031144


100%|██████████| 16/16 [00:07<00:00,  2.11it/s, loss=0.000274]


Epoch 61: 0.000287093017804807


100%|██████████| 16/16 [00:07<00:00,  2.21it/s, loss=0.000175]


Epoch 62: 0.0002770016110218432


100%|██████████| 16/16 [00:07<00:00,  2.13it/s, loss=1.35e-5] 


Epoch 63: 0.00027699420879794267


100%|██████████| 16/16 [00:07<00:00,  2.14it/s, loss=6.2e-6]  


Epoch 64: 0.0002468603799833602


100%|██████████| 16/16 [00:07<00:00,  2.07it/s, loss=0.000122]


Epoch 65: 0.00022184536548053302


100%|██████████| 16/16 [00:07<00:00,  2.16it/s, loss=0.000406]


Epoch 66: 0.00029950729427219436


100%|██████████| 16/16 [00:07<00:00,  2.11it/s, loss=0]       


Epoch 67: 0.00023061920367584143


100%|██████████| 16/16 [00:08<00:00,  1.94it/s, loss=9.72e-5] 


Epoch 68: 0.0002215765296323724


100%|██████████| 16/16 [00:08<00:00,  1.97it/s, loss=0.000427]


Epoch 69: 0.00024818587560417884


100%|██████████| 16/16 [00:08<00:00,  1.87it/s, loss=0.000162]


Epoch 70: 0.0002415882405216063


100%|██████████| 16/16 [00:07<00:00,  2.00it/s, loss=6.85e-6] 


Epoch 71: 0.0002184061065058085


100%|██████████| 16/16 [00:07<00:00,  2.22it/s, loss=0.00307] 


Epoch 72: 0.00021782975561914774


100%|██████████| 16/16 [00:08<00:00,  2.00it/s, loss=0]       


Epoch 73: 0.00019838147884811974


100%|██████████| 16/16 [00:07<00:00,  2.06it/s, loss=1.13e-6] 


Epoch 74: 0.0001804651642369759


100%|██████████| 16/16 [00:07<00:00,  2.07it/s, loss=0]       


Epoch 75: 0.00017734449351287652


100%|██████████| 16/16 [00:08<00:00,  1.98it/s, loss=8.17e-6] 


Epoch 76: 0.00017485195333826197


100%|██████████| 16/16 [00:07<00:00,  2.05it/s, loss=7.64e-5] 


Epoch 77: 0.00018314033213147064


100%|██████████| 16/16 [00:08<00:00,  1.95it/s, loss=2.02e-5] 


Epoch 78: 0.00017568574083632883


100%|██████████| 16/16 [00:08<00:00,  1.97it/s, loss=4.17e-6] 


Epoch 79: 0.00017870776169601755


100%|██████████| 16/16 [00:08<00:00,  1.91it/s, loss=0]       


Epoch 80: 0.00015370810218051645


100%|██████████| 16/16 [00:07<00:00,  2.09it/s, loss=3.58e-7] 


Epoch 81: 0.00015054092685531372


100%|██████████| 16/16 [00:07<00:00,  2.05it/s, loss=0.000243]


Epoch 82: 0.00014488368365839221


100%|██████████| 16/16 [00:08<00:00,  1.94it/s, loss=8.2e-5]  


Epoch 83: 0.00014170543732516805


100%|██████████| 16/16 [00:07<00:00,  2.02it/s, loss=0.000119]


Epoch 84: 0.0001444929841902365


100%|██████████| 16/16 [00:07<00:00,  2.08it/s, loss=1.66e-5] 


Epoch 85: 0.00012804451488204193


100%|██████████| 16/16 [00:07<00:00,  2.02it/s, loss=0.000131]


Epoch 86: 0.00013238510362931777


100%|██████████| 16/16 [00:07<00:00,  2.03it/s, loss=1.61e-6] 


Epoch 87: 0.0001297180111154983


100%|██████████| 16/16 [00:07<00:00,  2.18it/s, loss=6.56e-7] 


Epoch 88: 0.00011898494284814375


100%|██████████| 16/16 [00:08<00:00,  1.98it/s, loss=0.00033] 


Epoch 89: 0.00011726104787541658


100%|██████████| 16/16 [00:07<00:00,  2.03it/s, loss=3.07e-5] 


Epoch 90: 0.00010708591434855918


100%|██████████| 16/16 [00:07<00:00,  2.10it/s, loss=1.55e-6] 


Epoch 91: 0.00010561135052462589


100%|██████████| 16/16 [00:08<00:00,  1.98it/s, loss=0.000194]


Epoch 92: 0.00011584344333464003


100%|██████████| 16/16 [00:08<00:00,  1.90it/s, loss=5.96e-6] 


Epoch 93: 9.994802436135153e-05


100%|██████████| 16/16 [00:08<00:00,  1.97it/s, loss=0]       


Epoch 94: 0.00010044231395305303


100%|██████████| 16/16 [00:08<00:00,  1.90it/s, loss=0.000369]


Epoch 95: 9.238814632901437e-05


100%|██████████| 16/16 [00:08<00:00,  1.92it/s, loss=6.74e-6] 


Epoch 96: 8.856727062394043e-05


100%|██████████| 16/16 [00:08<00:00,  1.96it/s, loss=3.85e-5] 


Epoch 97: 8.570545065245128e-05


100%|██████████| 16/16 [00:08<00:00,  1.97it/s, loss=3.58e-7] 


Epoch 98: 7.508381398735331e-05


100%|██████████| 16/16 [00:08<00:00,  1.93it/s, loss=0]       


Epoch 99: 9.483420338050053e-05


100%|██████████| 16/16 [00:07<00:00,  2.01it/s, loss=0.000236]

Epoch 100: 8.681200673765765e-05





In [14]:
test_loader = DataLoader(
    MyCustomDataset(test_x.values, test_y.values),
    batch_size = 5,
    shuffle = True
)

y_pred = []
y_true = []

model.eval()

for batch_idx, (data, targets) in enumerate(test_loader):
    data = data.to(DEVICE).long()
    targets = targets.to(DEVICE)

    for inst_idx in range(data.shape[0]):
        with torch.no_grad():
            pred = model(data[inst_idx])

        # print(pred)
        # print(targets[inst_idx])
        # print(nn.BCEWithLogitsLoss()(pred, targets[inst_idx]))
        pred = pred.cpu().numpy()
        pred = np.argmax(pred)
        y_pred.append(pred)

        truth = np.argmax(targets[inst_idx].cpu().numpy())
        y_true.append(truth)

tp, fn, fp, tn = confusion_matrix(y_true, y_pred).ravel()
print(f'True positives: {tp}')
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')
print()
print(f'Accuracy: {(tn + tp) / (tn + tp + fn + fp)}')

True positives: 2
True negatives: 7
False positives: 3
False negatives: 8

Accuracy: 0.45


In [15]:
print(y_pred)
print(y_true)

[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1]
[0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0]
