In [158]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [177]:
DEVICE = "cuda"

In [159]:
csv_file = 'dataset.csv'

# read only 1000 reviews for speed
df = pd.read_csv(csv_file, nrows=1_000, index_col=0)

df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,7544,2013,18331,1572,6857,13995,8065,7425,3024,5327,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
1,7544,2893,11047,11363,13776,17485,3678,3678,7544,1572,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
2,7544,5561,12464,9056,8625,2893,11047,18359,10540,4291,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
3,7544,12216,9631,13595,2893,7049,6854,2893,11363,18847,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
4,7544,296,9673,13595,14109,18180,12414,1572,4655,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,7544,16999,15682,7030,17485,7544,6650,18810,15392,13890,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
996,7544,5561,15464,3693,17485,7544,5561,4503,18292,1791,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
997,7544,5561,8592,10020,10540,15698,13793,3510,9560,9994,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
998,7544,17943,4322,83,11522,15682,370,10540,11531,9056,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1


In [160]:
vocabulary_size = df.max().max() + 1
vocabulary_size

19504

In [161]:
# train - 20 positive, 20 negative
# test - 5 positive, 5 negative

positive_mask = df.y0 == 1
negative_mask = df.y0 == 0

positive_reviews = df[positive_mask]
negative_reviews = df[negative_mask]

train = pd.concat([positive_reviews.iloc[:20,:], negative_reviews.iloc[:20,:]])
test = pd.concat([positive_reviews.iloc[:5,:], negative_reviews.iloc[:5,:]])
train

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,7544,2013,18331,1572,6857,13995,8065,7425,3024,5327,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
1,7544,2893,11047,11363,13776,17485,3678,3678,7544,1572,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
2,7544,5561,12464,9056,8625,2893,11047,18359,10540,4291,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
4,7544,296,9673,13595,14109,18180,12414,1572,4655,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
5,7544,1785,3107,16262,18750,10845,11861,2893,3172,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
6,7544,5561,10855,2139,83,10540,11531,2893,16037,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
9,7544,17943,4322,83,8966,2987,247,11112,4322,15839,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
14,7544,9056,2893,18635,10845,18331,9592,2700,16492,9873,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
16,7544,17686,4307,6650,16559,14403,14224,15698,8897,17485,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
18,7544,5561,1770,9056,12404,11861,3693,8625,1572,13946,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0


In [162]:
test

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,7544,2013,18331,1572,6857,13995,8065,7425,3024,5327,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
1,7544,2893,11047,11363,13776,17485,3678,3678,7544,1572,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
2,7544,5561,12464,9056,8625,2893,11047,18359,10540,4291,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
4,7544,296,9673,13595,14109,18180,12414,1572,4655,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
5,7544,1785,3107,16262,18750,10845,11861,2893,3172,18331,...,3377,3377,3377,3377,3377,3377,3377,3377,1,0
3,7544,12216,9631,13595,2893,7049,6854,2893,11363,18847,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
7,7544,9056,2417,8625,6691,3647,11861,13632,17822,5038,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
8,7544,10477,6322,1572,7674,709,7281,9056,12404,12805,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
10,7544,3508,1572,13382,15682,2013,18331,9449,3646,4307,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1
11,7544,5561,18147,9056,10845,9994,5561,8625,7281,12335,...,3377,3377,3377,3377,3377,3377,3377,3377,0,1


In [163]:
train_x, train_y = train.iloc[:,:-2], train.iloc[:,-2:]
test_x, test_y = test.iloc[:,:-2], test.iloc[:,-2:]
test_y

Unnamed: 0,y0,y1
0,1,0
1,1,0
2,1,0
4,1,0
5,1,0
3,0,1
7,0,1
8,0,1
10,0,1
11,0,1


In [238]:
class TransformerModule(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model

        # apparently nn.Embedding should take vocabulary size as first argument
        # if we pass a number greater than the first argument we get an IndexError
        # in our dataset, the number of words greatly exceeds the length of the max sentence
        # so max length is unfit to be first argument here
        self.embedding = nn.Embedding(vocabulary_size, d_model)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        global DEVICE
        self.device = 'cpu'
        if 'DEVICE' in globals():
            self.device = DEVICE

        self.V = nn.Parameter(torch.rand(max_length, d_model).to(self.device))

    def gen_pe(self, n):
        pe = np.zeros((self.max_length, self.d_model), dtype=np.float32)
        for k in np.arange(self.max_length):
            for i in np.arange(self.d_model):
                theta = k / (n ** (i/self.d_model))
                if i % 2 == 0:
                    pe[k,i] = math.sin(theta)
                else:
                    pe[k,i] = math.cos(theta)
        return pe
    
    def forward(self, x: torch.FloatTensor):
        # x is a lookup tensor
        initial_embeddings = self.relu(self.embedding(x))
        positional_embeddings = torch.tensor(self.gen_pe(1000)).to(self.device)
        
        input_embeddings = torch.add(initial_embeddings, positional_embeddings)

        _ = torch.matmul(input_embeddings, torch.transpose(input_embeddings, 0 , 1))
        _ = torch.divide(_, math.sqrt(self.d_model))
        _ = self.softmax(_)

        return self.relu(torch.matmul(_,self.V))

In [239]:
class ClassificationHead(nn.Module):
    def __init__(self, embedding_length, head_length):
        super().__init__()
        self.embedding_length = embedding_length
        self.head_length = head_length

        self.output_layer = nn.Linear(self.embedding_length, self.head_length)
    
    def forward(self, x):
        y = self.output_layer(x)

        # no more sigmoid; we will use BCEWithLogitsLoss which uses a starting sigmoid layer and is more numerically stable
        return y

In [240]:
class Classifier(nn.Module):
    def __init__(self, vocabulary_size, max_length, d_model, head_length):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.max_length = max_length
        self.d_model = d_model
        self.head_length = head_length
        self.expected_unravel_length = max_length * d_model
        
        self.body = TransformerModule(self.vocabulary_size, self.max_length, self.d_model)
        self.head = ClassificationHead(self.expected_unravel_length, self.head_length)

    def forward(self, x):
        x = self.body(x)
        x = x.view(-1)
        x = self.head(x)
        # softmax removed; inherently applied by CE, and not applied by BCE
        return x

In [241]:
class MyCustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.Tensor(x)
        self.y = torch.Tensor(y)
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

custom_dataset = MyCustomDataset(train_x.values, train_y.values)

loader = DataLoader(
    custom_dataset,
    batch_size = 5,
    shuffle = False
)

In [242]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    loop = tqdm(loader)

    average_loss = 0
    count = 0

    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device).long()
        targets = targets.to(device=device)
        # print(data.shape)
        # print(targets.shape)

        for inst_idx in range(data.shape[0]):
            # Forward
            predictions = model.forward(data[inst_idx])
            loss = loss_fn(predictions, targets[inst_idx])
            # Backward
            optimizer.zero_grad()

            loss.backward()

            optimizer.step()

            # Update tqdm
            loop.set_postfix(loss=loss.item())

            average_loss += loss.item()
            count += 1

    average_loss = average_loss / count

    return average_loss

In [243]:
NUM_EPOCHS = 10
EMBEDDING_LENGTH = 100
model = Classifier(vocabulary_size, train_x.shape[1], EMBEDDING_LENGTH, train_y.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

for i in range(NUM_EPOCHS):
    ave_loss = train_fn(loader, model, optimizer, criterion, device=DEVICE)
    print(f'Epoch {i+1}: {ave_loss}')

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:08<00:00,  1.03s/it, loss=0]   


Epoch 1: 43.08872239589691


100%|██████████| 8/8 [00:08<00:00,  1.05s/it, loss=0]       


Epoch 2: 223.04023152872543


100%|██████████| 8/8 [00:08<00:00,  1.03s/it, loss=0]   


Epoch 3: 161.7241340637207


100%|██████████| 8/8 [00:08<00:00,  1.06s/it, loss=0]   


Epoch 4: 138.45208115577697


100%|██████████| 8/8 [00:08<00:00,  1.08s/it, loss=0]   


Epoch 5: 121.35619025230407


100%|██████████| 8/8 [00:08<00:00,  1.05s/it, loss=0]   


Epoch 6: 106.9187525510788


100%|██████████| 8/8 [00:08<00:00,  1.05s/it, loss=0]      


Epoch 7: 94.0045701161027


100%|██████████| 8/8 [00:08<00:00,  1.06s/it, loss=0]      


Epoch 8: 82.21853457242213


100%|██████████| 8/8 [00:08<00:00,  1.03s/it, loss=0]      


Epoch 9: 70.46523972302498


100%|██████████| 8/8 [00:08<00:00,  1.04s/it, loss=0]      

Epoch 10: 51.19357512101645





In [None]:
### TODO
# change NUM_EPOCHS to 100 and run it! 
# preferably if you have a cuda gpu and have a cuda installation of pytorch so it will be fast
# i will probably just do this myself 
# but if you do and you get a bug from running change DEVICE = "cpu" at cell 2

# do number 4!