## 1. Import dependencies

In [1]:
import os
import re
import string
import functools
import operator 
import json
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from dataclasses import dataclass

# Load the small English spaCy pipeline with the 'ner' and 'parser' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
# Default stop-word collection of the loaded pipeline's language.
stopwords = nlp.Defaults.stop_words

## 2. Data preparation

In [2]:
class Vocabulary:
    """Two-way mapping between tokens and integer ids.

    Ids 0-3 are reserved for the special tokens ``<unk>``, ``<pad>``,
    ``<sos>`` and ``<eos>``. Every distinct token in ``data`` then receives
    the next free id, in sorted order.

    Args:
        data: iterable of hashable, sortable tokens (e.g. a flat list of words).

    Attributes:
        data: the iterable passed to the constructor.
        vocab: dict mapping token -> id.
    """

    def __init__(self, data):
        self.data = data

        self.vocab = {
            '<unk>': 0,
            '<pad>': 1,
            '<sos>': 2,
            '<eos>': 3
        }

        # id -> token mirror of ``self.vocab`` so integer lookups are a single
        # dict access instead of rebuilding and scanning key/value lists.
        self._index_to_word = {idx: word for word, idx in self.vocab.items()}

        self.build_vocab()

    def __getitem__(self, index):
        """Translate a token to its id, or an id back to its token.

        Args:
            index: ``str`` token or ``int`` id (exact types only).

        Returns:
            For a ``str``: its id, or the id of ``<unk>`` when the token is unknown.
            For an ``int``: its token, or ``'<unk>'`` when no token has that id.

        Raises:
            AssertionError: if ``index`` is neither ``str`` nor ``int``.
        """
        assert type(index) in [str, int], 'Index type must be string or int'

        if isinstance(index, str):
            return self.vocab.get(index, self.vocab['<unk>'])

        return self._reverse_lookup().get(index, '<unk>')

    def __len__(self):
        """Number of tokens in the vocabulary, special tokens included."""
        return len(self.vocab)

    def _reverse_lookup(self):
        """Return the id -> token map, resyncing it if ``self.vocab`` was edited directly."""
        if len(self._index_to_word) != len(self.vocab):
            rebuilt = {}
            for word, idx in self.vocab.items():
                # Keep the first token seen for an id (insertion order).
                rebuilt.setdefault(idx, word)
            self._index_to_word = rebuilt
        return self._index_to_word

    def append_word(self, word):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word not in self.vocab:
            idx = len(self)
            self.vocab[word] = idx
            self._index_to_word.setdefault(idx, word)

    def build_vocab(self):
        """Add every distinct token of ``self.data`` in sorted order (safe to call repeatedly)."""
        for word in sorted(set(self.data)):
            self.append_word(word)

In [3]:
class RevDataset(Dataset):
    """Review dataset yielding ``(token ids, rating)`` pairs.

    Reads the JSON-lines file ``data/Video_Games_5.json``, keeps verified
    reviews, cleans and lemmatises ``reviewText`` and caches the result in
    ``data/preprocessed.csv``. When the cache file already exists it is
    loaded instead. A ``Vocabulary`` over all review tokens is stored in
    ``self.Voc``.
    """

    def __init__(self):
        self.path = 'data/Video_Games_5.json'
        self.prep_path = 'data/preprocessed.csv'

        # if preprocessed data already exists - load it
        if os.path.isfile(self.prep_path):
            # keep_default_na=False: a review that became empty (or literally
            # reads "nan", "null", "na", ...) must stay a string instead of
            # turning into NaN, which would later be tokenised as 'nan'.
            self.data = pd.read_csv(
                self.prep_path,
                dtype={'reviewText': str},
                keep_default_na=False,
            )[['reviewText', 'overall']]

        # else preprocess and save
        else:
            with open(self.path, 'r', encoding='utf-8') as f:
                lines = [json.loads(line.rstrip()) for line in f]

            self.data = pd.DataFrame(lines)[['verified', 'reviewText', 'overall']]
            self.data = self.data[self.data['verified']]

            self.data = self.data.dropna()
            self.data = self.data.reset_index(drop=True)
            self.data = self.data.drop('verified', axis=1)

            self.data['reviewText'] = self.data['reviewText'].apply(self.clean_data)
            self.tok_lemma()

            # The positional index carries no information, so it is not written.
            self.data.to_csv(self.prep_path, index=False)

        self.build_vocab()

    def __len__(self):
        """Number of reviews."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(list of token ids, rating)`` for the review at ``index``.

        Raises:
            AssertionError: if ``index`` is not exactly an ``int``.
        """
        assert type(index) == int, 'Index must be int'

        item = self.data.iloc[index]
        token_ids = [self.Voc[word] for word in str(item['reviewText']).split()]

        return token_ids, item['overall']

    @staticmethod
    def clean_data(text):
        """Lower-case ``text`` and strip URLs, newlines and punctuation.

        Non-string input yields the placeholder ``'   '``.
        """
        if not isinstance(text, str):
            return '   '

        # lowercase
        text = text.lower()

        # remove URLs: match only up to the next whitespace so the rest of the
        # review survives (a greedy '.*' would swallow everything after the URL)
        text = re.sub(r'https?://\S+', '', text)

        # replace every run of newlines with a single space
        text = re.sub(r'\n+', ' ', text)

        # remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))

        return text

    def tok_lemma(self):
        """Replace each review by its space-joined lemmas.

        Stop words, tokens with an empty lemma and non-alphabetic tokens are
        dropped.
        """
        docs = nlp.pipe(self.data['reviewText'].values, batch_size=128, n_process=3)

        # Compare the token *text* with the stop-word set; a Token object is
        # never a member of a set of strings, so testing the token itself
        # filters nothing.
        reviews = [
            ' '.join(
                token.lemma_
                for token in doc
                if token.text not in stopwords and token.lemma_ and token.text.isalpha()
            )
            for doc in docs
        ]

        # Positional assignment: one processed review per row.
        self.data['reviewText'] = reviews

    def build_vocab(self):
        """Build ``self.Voc`` from every whitespace-separated token of every review."""
        tokenized = self.data['reviewText'].apply(lambda x: str(x).split()).tolist()

        bag_of_words = functools.reduce(operator.iconcat, tokenized, [])

        # Vocabulary.__init__ already builds the mapping, so no second
        # build_vocab() call is needed.
        self.Voc = Vocabulary(bag_of_words)

In [4]:
# Instantiate the dataset: the constructor loads the cached CSV (or preprocesses
# the raw JSON and writes the cache) and then builds the vocabulary.
data = RevDataset()

In [5]:
@dataclass
class cfg:
    """Hyper-parameters shared by the data pipeline and the model.

    The attributes carry type annotations so that ``@dataclass`` actually
    registers them as fields (un-annotated attributes are ignored by the
    decorator). The defaults remain readable as class attributes, e.g.
    ``cfg.max_length``.
    """
    max_length: int = 310   # every review is padded/truncated to this many tokens
    embed_size: int = 120   # embedding width handed to the model
    hidden_size: int = 512  # not referenced in the visible cells
    num_layers: int = 3     # number of transformer blocks
    heads: int = 8          # attention heads; must divide embed_size
    batch_size: int = 32
    lr: float = 3e-4
    vocab_size: int = len(data.Voc)  # evaluated once, when this cell runs

In [6]:
# lens = [len(data[i][0]) for i in range(len(data)) if len(data[i][0]) < 500]
# plt.figure(figsize=(11, 8))
# sns.histplot(data=lens, bins=15, kde=True)
# plt.show()

In [7]:
def pad_seq(batch, max_length=None, pad_idx=1, num_classes=5):
    """Collate ``(token ids, rating)`` samples into fixed-length tensors.

    Each sequence is truncated to ``max_length`` or right-padded with
    ``pad_idx`` up to ``max_length``. Ratings are shifted to start at 0 and
    one-hot encoded.

    Args:
        batch: iterable of ``(token id sequence, rating)`` pairs.
        max_length: target sequence length; defaults to ``cfg.max_length``.
        pad_idx: id used for padding (1 is ``<pad>`` in ``Vocabulary``).
        num_classes: width of the one-hot rating vector.

    Returns:
        ``(reviews, overalls)``: an int64 tensor of shape
        ``(len(batch), max_length)`` and an int64 one-hot tensor of shape
        ``(len(batch), num_classes)``.
    """
    if max_length is None:
        max_length = cfg.max_length

    reviews = []
    overalls = []
    for text, overall in batch:
        # Slice into a new list so the caller's sequence is never modified.
        tokens = list(text[:max_length])
        tokens.extend([pad_idx] * (max_length - len(tokens)))

        # Build the integer tensor directly; a float32 round-trip would
        # corrupt ids above 2**24.
        reviews.append(torch.tensor(tokens, dtype=torch.int64))
        overalls.append(int(overall) - 1)

    overalls = torch.tensor(overalls, dtype=torch.int64)
    overalls = F.one_hot(overalls, num_classes=num_classes)

    return torch.stack(reviews), overalls

In [8]:
# Batches are assembled by pad_seq; the batch size is read from cfg so it is
# defined in exactly one place.
loader = DataLoader(data, batch_size=cfg.batch_size, collate_fn=pad_seq)

## 3. Model

Model implementation inspired by Aladdin Persson's [video](https://www.youtube.com/watch?v=U0s0f995w14). [GitHub repo](https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/more_advanced/transformer_from_scratch/transformer_from_scratch.py).

In [9]:
class SelfAttention(nn.Module):
    """Multi-head dot-product attention.

    The embedding is split into ``heads`` chunks of ``head_dim`` features.
    The same bias-free ``head_dim -> head_dim`` projection is applied to each
    chunk of the values, keys and queries. The concatenated head outputs are
    passed through the ``fc_out`` projection.

    Args:
        embed_size: width of the input/output embeddings.
        heads: number of attention heads; must divide ``embed_size``.

    Raises:
        AssertionError: if ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, 'Embed size needs to be divisible by heads'

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, queries, mask):
        """Attend from ``queries`` over ``keys``/``values``.

        Args:
            values: tensor reshaped to ``(N, value_len, heads, head_dim)``.
            keys: tensor reshaped to ``(N, key_len, heads, head_dim)``.
            queries: tensor reshaped to ``(N, query_len, heads, head_dim)``.
            mask: ``None`` or a tensor broadcastable to
                ``(N, heads, query_len, key_len)``; positions equal to 0 are
                excluded from attention.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        N = queries.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]

        # Split the embedding dimension into heads.
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # energy shape: N, heads, query_len, key_len
        energy = torch.einsum('nqhd,nkhd->nhqk', queries, keys)

        if mask is not None:
            # Where the mask is 0, shut the position down: a very large
            # negative score becomes ~0 after the softmax.
            energy = energy.masked_fill(mask == 0, -1e20)

        # NOTE: scores are scaled by sqrt(embed_size), i.e. the full embedding
        # width rather than the per-head width.
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # attention shape: N, heads, query_len, key_len
        # values shape: N, value_len, heads, head_dim
        # out shape: N, query_len, heads, head_dim
        out = torch.einsum('nhql,nlhd->nqhd', attention, values)

        # Concatenate the heads, then mix them with the output projection
        # (fc_out is registered in __init__ and must take part in the forward pass).
        out = out.reshape(N, query_len, self.heads * self.head_dim)

        return self.fc_out(out)

In [10]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer's result is added to its input, layer-normalised and then
    passed through dropout.

    Args:
        embed_size: width of the embeddings.
        heads: number of attention heads.
        dropout: dropout probability applied after each normalisation.
        forward_expansion: the hidden feed-forward width is
            ``forward_expansion * embed_size``.
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_width = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        # Position-wise feed-forward: expand, ReLU, project back.
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, queries, mask):
        """Run the block; the result has the same shape as ``queries``."""
        attended = self.attention(value, key, queries, mask)

        # Residual connection around the attention sub-layer.
        residual = self.dropout(self.norm1(attended + queries))

        # Residual connection around the feed-forward sub-layer.
        return self.dropout(self.norm2(self.feed_forward(residual) + residual))

In [11]:
class Model(nn.Module):
    """Transformer-encoder classifier producing 5 output scores per sequence.

    Token and position embeddings are summed and passed through
    ``num_layers`` ``TransformerBlock`` layers. The result is flattened and
    reduced by three linear layers to 5 outputs.

    Args:
        vocab_size: number of rows of the word embedding.
        embed_size: embedding width.
        num_layers: number of transformer blocks.
        max_length: sequence length the model is built for.
        heads: attention heads per block.
        device: stored as ``self.device``; ``forward`` does not use it and
            follows the input tensor's device instead.
        forward_expansion: width multiplier for the feed-forward layers.
        dropout: dropout probability.
    """

    def __init__(self, vocab_size, embed_size, num_layers, max_length, heads, device, forward_expansion, dropout):
        super().__init__()

        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        # Classification head over the flattened (max_length * embed_size) encoding.
        self.fc1 = nn.Linear(max_length * embed_size, max_length * forward_expansion)
        self.fc2 = nn.Linear(max_length * forward_expansion, max_length)

        self.fc_out = nn.Linear(max_length, 5)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Compute the output scores.

        Args:
            x: integer tensor of shape ``(N, seq_len)``. ``seq_len`` has to
                equal ``max_length`` because the encoding is flattened into
                ``fc1``.
            mask: optional attention mask forwarded to every block.

        Returns:
            Tensor of shape ``(N, 5)``.
        """
        N, seq_len = x.shape

        # Create the position indices directly on the input's device, so the
        # forward pass keeps working after the module or input is moved.
        positions = torch.arange(seq_len, device=x.device).expand(N, seq_len)

        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            x = layer(x, x, x, mask)

        # x shape: N, max_length, embed_size
        # flatten x
        x = x.reshape(N, -1)

        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))

        # out shape: N, 5
        out = self.fc_out(x)

        return out

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
# Constructing the module does not move its parameters anywhere, so
# .to(device) places them on the selected device (it returns the same module,
# and repeating the call later is harmless).
# forward_expansion and dropout are not part of cfg, so they are set here.
model = Model(
    vocab_size=cfg.vocab_size,
    embed_size=cfg.embed_size,
    num_layers=cfg.num_layers,
    max_length=cfg.max_length,
    heads=cfg.heads,
    device=device,
    forward_expansion=4,
    dropout=0.25,
).to(device)