In [26]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import math
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [27]:
csv_file = 'dataset.csv'

# read only 1000 reviews for speed
df = pd.read_csv(csv_file, nrows=1_000, index_col=0)

df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,1060,5668,12479,12117,8228,11179,18076,454,9417,7365,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
1,1060,1093,17690,5866,1616,11418,13144,13144,1060,12117,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
2,1060,16587,15347,13189,3605,1093,17690,11664,14419,10056,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
3,1060,16903,11012,3953,1093,10685,6225,1093,5866,11616,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
4,1060,8627,17469,3953,2885,18817,16336,12117,19260,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1060,5047,723,16813,11418,1060,3686,9990,6390,2520,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
996,1060,16587,11236,12440,11418,1060,16587,2645,5010,13216,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
997,1060,16587,10091,17109,14419,18333,5945,19237,8110,16466,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
998,1060,15241,15700,8651,2255,723,13047,14419,10447,13189,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1


In [28]:
# train - 20 positive, 20 negative
# test - 5 positive, 5 negative

positive_mask = df.y0 == 1
negative_mask = df.y0 == 0

positive_reviews = df[positive_mask]
negative_reviews = df[negative_mask]

train = pd.concat([positive_reviews.iloc[:20,:], negative_reviews.iloc[:20,:]])
test = pd.concat([positive_reviews.iloc[:5,:], negative_reviews.iloc[:5,:]])
train

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,1060,5668,12479,12117,8228,11179,18076,454,9417,7365,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
1,1060,1093,17690,5866,1616,11418,13144,13144,1060,12117,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
2,1060,16587,15347,13189,3605,1093,17690,11664,14419,10056,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
4,1060,8627,17469,3953,2885,18817,16336,12117,19260,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
5,1060,4133,17685,11455,980,12620,9471,1093,13675,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
6,1060,16587,19446,2475,8651,14419,10447,1093,4292,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
9,1060,15241,15700,8651,14803,16919,14820,17519,15700,16080,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
14,1060,13189,1093,19477,12620,12479,1507,7901,17995,5954,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
16,1060,3885,12821,3686,2525,12329,7358,18333,11937,11418,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
18,1060,16587,13325,13189,8937,9471,12440,3605,12117,3893,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0


In [29]:
test

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x1514,x1515,x1516,x1517,x1518,x1519,x1520,x1521,y0,y1
0,1060,5668,12479,12117,8228,11179,18076,454,9417,7365,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
1,1060,1093,17690,5866,1616,11418,13144,13144,1060,12117,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
2,1060,16587,15347,13189,3605,1093,17690,11664,14419,10056,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
4,1060,8627,17469,3953,2885,18817,16336,12117,19260,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
5,1060,4133,17685,11455,980,12620,9471,1093,13675,12479,...,8902,8902,8902,8902,8902,8902,8902,8902,1,0
3,1060,16903,11012,3953,1093,10685,6225,1093,5866,11616,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
7,1060,13189,19012,3605,12004,1994,9471,12773,14288,4960,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
8,1060,2075,8132,12117,13676,18665,13982,13189,8937,3388,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
10,1060,16130,12117,17537,723,5668,12479,9470,16226,12821,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1
11,1060,16587,10584,13189,12620,16466,16587,3605,13982,8289,...,8902,8902,8902,8902,8902,8902,8902,8902,0,1


In [30]:
train_x, train_y = train.iloc[:,:-2], train.iloc[:,-2:]
test_x, test_y = test.iloc[:,:-2], test.iloc[:,-2:]
test_y

Unnamed: 0,y0,y1
0,1,0
1,1,0
2,1,0
4,1,0
5,1,0
3,0,1
7,0,1
8,0,1
10,0,1
11,0,1


In [37]:
class TransformerModule(torch.nn.Module):
    def __init__(self, max_length, d_model):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model

        self.embedding = torch.nn.Embedding(max_length, d_model)
        self.relu = nn.ReLU()

    def gen_pe(self, n):
        pe = np.zeros(self.max_length*self.d_model).reshape(self.max_length, self.d_model)
        for k in np.arange(self.max_length):
            for i in np.arange(self.d_model):
                theta = k / (n ** (i/self.d_model))
                if i % 2 == 0:
                    pe[k,i] = math.sin(theta)
                else:
                    pe[k,i] = math.cos(theta)
        
        return pe
    
    def forward(self, x):
        # x is a lookup tensor
        initial_embeddings = self.relu(self.embedding(x))
        positional_embeddings = torch.tensor(self.gen_pe(1000))

        input_embeddings = initial_embeddings + positional_embeddings

        _ = torch.matmul(x, torch.transpose(input_embeddings, 0 , 1))
        _ = torch.divide(_, math.sqrt(self.d_model))
        _ = nn.functional.softmax(_, dim=1)

        V = self.relu(nn.Parameter(torch.rand(x.shape)))

        return torch.matmul(_,V)

In [32]:
class ClassificationHead(nn.Module):
    def __init__(self, embedding_length, head_length):
        super().__init__()
        self.embedding_length = embedding_length
        self.head_length = head_length

        self.sigmoid = nn.Sigmoid()

        self.output_layer = nn.Linear(self.embedding_length, self.head_length)
    
    def forward(self, x):
        y = self.output_layer(x)

        return self.sigmoid(y)

In [33]:
class Classifier(nn.Module):
    def __init__(self, max_length, d_model, head_length):
        super().__init__()
        self.max_length = max_length
        self.d_model = d_model
        self.head_length = head_length
        self.expected_unravel_length = max_length * d_model

        self.softmax = nn.Softmax()
        
        self.body = TransformerModule(self.max_length, self.d_model)
        self.head = ClassificationHead(self.expected_unravel_length, self.head_length)

    def forward(self, x):
        x = self.body(x)
        x = x.view(-1)
        x = self.head(x)
        x = self.softmax(x)
        return x

In [34]:
class MyCustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.Tensor(x)
        self.y = torch.Tensor(y)
        self.n_samples = len(x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

custom_dataset = MyCustomDataset(train_x.values, train_y.values)

loader = DataLoader(
    custom_dataset,
    batch_size = 5,
    shuffle = False
)

In [35]:
def train_fn(loader, model, optimizer, loss_fn, device="cpu"):
    loop = tqdm(loader)

    average_loss = 0
    count = 0

    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Forward
        predictions = model.forward(data)

        loss = loss_fn(predictions, targets)

        # Backward
        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

        # Update tqdm
        loop.set_postfix(loss=loss.item())

        average_loss += loss.item()
        count += 1

    average_loss = average_loss / count

    return average_loss

In [38]:
NUM_EPOCHS = 100
EMBEDDING_LENGTH = 100
model = Classifier(train_x.shape[1], EMBEDDING_LENGTH, train_y.shape[1])
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()

for i in range(NUM_EPOCHS):
    ave_loss = train_fn(loader, model, optimizer, criterion)
    print(f'Epoch {i}: {ave_loss}')

  0%|          | 0/8 [00:00<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)