## Vectorization one hot encoded

In [1]:
import numpy as np

In [2]:
thor_review = 'the action scenes were top notch in this movie. thor has never been this epic in the mcu.'

In [3]:
class Dictionary:
    def __init__(self):
        self.word2idx = dict()
        self.idx2word = list()
        self.length = 0
        
    def add_word(self, word):
        if word not in self.idx2word:
            self.idx2word.append(word)
            self.word2idx[word] = self.length + 1
            self.length += 1
        return self.word2idx[word]
    
    def __len__(self):
        return len(self.idx2word)
    
    def onehot_encoded(self, word):
        vec = np.zeros(self.length)
        vec[self.word2idx[word]] = 1
        return vec

In [4]:
dic = Dictionary()
for tok in thor_review.split():
    dic.add_word(tok)

In [5]:
dic.word2idx

{'the': 1,
 'action': 2,
 'scenes': 3,
 'were': 4,
 'top': 5,
 'notch': 6,
 'in': 7,
 'this': 8,
 'movie.': 9,
 'thor': 10,
 'has': 11,
 'never': 12,
 'been': 13,
 'epic': 14,
 'mcu.': 15}

In [6]:
dic.onehot_encoded('the')

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Training word embedding by building a sentiment classifier

In [29]:
from torchtext import data, datasets
from torchtext.vocab import GloVe
import torch.nn as nn
import torch.functional as F

In [8]:
TEXT = data.Field(lower = True, batch_first = True, fix_length = 20)
LABEL = data.Field(sequential = False)

In [9]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [10]:
train.fields

{'text': <torchtext.data.field.Field at 0x7fa3c94b7668>,
 'label': <torchtext.data.field.Field at 0x7fa3c94b75f8>}

In [11]:
vars(train[0])

{'text': ['the',
  'premise',
  'of',
  'this',
  'movie',
  'has',
  'been',
  'tickling',
  'my',
  'imagination',
  'for',
  'quite',
  'some',
  'time',
  'now.',
  "we've",
  'all',
  'heard',
  'or',
  'read',
  'about',
  'it',
  'in',
  'some',
  'kind',
  'of',
  'con-text.',
  'what',
  'would',
  'you',
  'do',
  'if',
  'you',
  'were',
  'all',
  'alone',
  'in',
  'the',
  'world?',
  'what',
  'would',
  'you',
  'do',
  'if',
  'the',
  'entire',
  'world',
  'suddenly',
  'disappeared',
  'in',
  'front',
  'of',
  'your',
  'eyes?',
  'in',
  'fact,',
  'the',
  'last',
  'part',
  'is',
  'actually',
  'what',
  'happens',
  'to',
  'dave',
  'and',
  'andrew,',
  'two',
  'room-mates',
  'living',
  'in',
  'a',
  'run-down',
  'house',
  'in',
  'the',
  'middle',
  'of',
  'a',
  'freeway',
  'system.',
  'andrew',
  'is',
  'a',
  'nervous',
  'wreck',
  'to',
  'say',
  'the',
  'least',
  'and',
  'dave',
  'is',
  'considered',
  'being',
  'one',
  'of',
  't

In [12]:
TEXT.build_vocab(train, vectors = GloVe(name = '6B', dim = 300), max_size = 10000, min_freq = 10)
LABEL.build_vocab(train,)

In [13]:
TEXT.vocab.freqs

Counter({'the': 322198,
         'premise': 541,
         'of': 144462,
         'this': 69714,
         'movie': 30887,
         'has': 16570,
         'been': 9074,
         'tickling': 4,
         'my': 11766,
         'imagination': 187,
         'for': 42843,
         'quite': 3662,
         'some': 15280,
         'time': 7945,
         'now.': 292,
         "we've": 249,
         'all': 19740,
         'heard': 1019,
         'or': 16769,
         'read': 1774,
         'about': 16486,
         'it': 65505,
         'in': 90527,
         'kind': 2545,
         'con-text.': 1,
         'what': 14055,
         'would': 12027,
         'you': 27564,
         'do': 7904,
         'if': 15189,
         'were': 10528,
         'alone': 688,
         'world?': 24,
         'entire': 1456,
         'world': 2507,
         'suddenly': 466,
         'disappeared': 69,
         'front': 524,
         'your': 5600,
         'eyes?': 10,
         'fact,': 832,
         'last': 2699,
        

In [14]:
type(TEXT.vocab.freqs)

collections.Counter

In [15]:
TEXT.vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7724, -0.1800,  0.2072,  ...,  0.6736,  0.2263, -0.2919],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [16]:
TEXT.vocab.vectors.shape

torch.Size([10002, 300])

In [17]:
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fa3bed71748>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             'a': 3,
             'and': 4,
             'of': 5,
             'to': 6,
             'is': 7,
             'in': 8,
             'i': 9,
             'this': 10,
             'that': 11,
             'it': 12,
             '/><br': 13,
             'was': 14,
             'as': 15,
             'for': 16,
             'with': 17,
             'but': 18,
             'on': 19,
             'movie': 20,
             'his': 21,
             'are': 22,
             'not': 23,
             'film': 24,
             'you': 25,
             'have': 26,
             'he': 27,
             'be': 28,
             'at': 29,
             'one': 30,
             'by': 31,
             'an': 32,
             'they': 33,
             'from': 34,
             'all': 35,
             'who': 36,
             'like

In [18]:
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size = 128, shuffle = True)

In [19]:
batch = next(iter(train_iter))
batch


[torchtext.data.batch.Batch of size 128 from IMDB]
	[.text]:[torch.LongTensor of size 128x20]
	[.label]:[torch.LongTensor of size 128]

In [20]:
batch.text

tensor([[  10,   20,    0,  ..., 5092,    2,  131],
        [ 785,  180,    3,  ...,   58,    2,  333],
        [   9,  365,  148,  ..., 1641,   41,  555],
        ...,
        [ 382,    2, 1031,  ...,  277, 3567,   25],
        [  12,  186,  874,  ...,   20,    8,    2],
        [  10,   24,    7,  ...,    8,    0,   13]])

In [21]:
batch.text.shape

torch.Size([128, 20])

In [22]:
batch.label

tensor([1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2,
        1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2,
        1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2,
        2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
        1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1,
        1, 2, 2, 1, 2, 1, 2, 1])

In [30]:
class EmbNet(nn.Module):
    def __init__(self, emb_size, hidden_size1, hidden_size2 = 400):
        super(EmbNet, self).__init__()
        self.embedding = nn.Embedding(emb_size, hidden_size1)
        self.fc = nn.Linear(hidden_size2, 3)
        
    def forward(self, x):
        x = self.embedding(x).view(x.size(0), -1)
        x = self.fc(x)        
        return F.log_softmax(x, dim = -1)

In [31]:
model = EmbNet(emb_size = 20, hidden_size1 = 1024)

In [32]:
model(batch.text)

IndexError: index out of range in self