In [1]:
import dill
import math

import spacy
import torch
from torchtext import data

from mllib import mllib

In [2]:
txt_field = torch.load("./data/txt_field.pt", pickle_module=dill)
label_field = torch.load("./data/label_field.pt", pickle_module=dill)

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = len(txt_field.vocab)
embedding_dim = 100
n_hidden = 64
n_out = 2

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
tokenizer = mllib.create_tokenizer(nlp)

In [None]:
a = txt_field.numericalize(("keep voting for the goonies", 1))
a[0]

In [None]:
a[1].size()

In [None]:
train_val_fields = [
    ('ItemID', None),  # we dont need this, so no processing
    ('Sentiment', label_field),  # process it as label
    ('SentimentSource', None),  # we dont need this, so no processing
    ('SentimentText', txt_field)  # process it as text
]

In [None]:
trainds, valds = data.TabularDataset.splits(path='./data',
                                            format='csv',
                                            train='traindf.csv',
                                            validation='valdf.csv',
                                            fields=train_val_fields,
                                            skip_header=True)


In [None]:
traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds),  # specify train and validation Tabulardataset
                                            # batch size of train and validation
                                            batch_sizes=(3, 3),
                                            # on what attribute the text should be sorted
                                            sort_key=lambda x: len(
                                                x.SentimentText),
                                            device=None,  # -1 mean cpu and 0 or None mean gpu
                                            sort_within_batch=True,
                                            repeat=False)


In [None]:
testdl = data.BucketIterator.splits(datasets=[trainds],  # specify train and validation Tabulardataset
                                            # batch size of train and validation
                                            batch_sizes=(1, 1),
                                            # on what attribute the text should be sorted
                                            sort_key=lambda x: len(
                                                x.SentimentText),
                                            device=None,  # -1 mean cpu and 0 or None mean gpu
                                            repeat=False)


In [None]:
b = next(iter(testdl))
vars(b)

In [None]:
a = next(iter(valds))
vars(a)

In [5]:
m = mllib.SimpleGRU(vocab_size, embedding_dim, n_hidden, n_out,
                    txt_field.vocab.vectors, device=device).to(device)


In [6]:
m.load_state_dict(torch.load('./data/model.pt'))

In [None]:
X.transpose(0,1).size()

In [89]:
def process_text(text_in, pp_field, model):
    tok = [[t.text.lower() for t in nlp(text_in)]]
    print(tok)
    X, lengths = pp_field.numericalize((tok, [1]))
    r = model(X, lengths)

    pos_certainty = (1 - math.exp(r.tolist()[0][0]))
    if pos_certainty < 0.45:
        print(f'{pos_certainty*100:2.4}% NO SENTIMENT for     {text_in}')
    elif pos_certainty > 0.55:
        print(f'{pos_certainty*100:2.4}% SENTIMENT    for     {text_in}')
    else:
        print(f'{pos_certainty*100:2.4}% UNKNOWN      for     {text_in}')


In [92]:
textos = [
    "I'm so happy tonight",
    "Is slightly hungover after @namesnorris's party last night. Great night!",
    "SUMMER is FINNALY here!!!!",
    "has a massive headache.",
    "no food on the table.",
    "@josiew2012 thanks",
    "Bloody damn damness!",
    "@Taylor510CE I love you!  Hope you have a better day today"
    ]

for t in textos:
    process_text(t, txt_field, m)

[['i', "'m", 'so', 'happy', 'tonight']]
38.04% NO SENTIMENT for     I'm so happy tonight
[['is', 'slightly', 'hungover', 'after', '@namesnorris', "'s", 'party', 'last', 'night', '.', 'great', 'night', '!']]
59.47% SENTIMENT    for     Is slightly hungover after @namesnorris's party last night. Great night!
[['summer', 'is', 'finnaly', 'here', '!', '!', '!', '!']]
78.63% SENTIMENT    for     SUMMER is FINNALY here!!!!
[['has', 'a', 'massive', 'headache', '.']]
43.57% NO SENTIMENT for     has a massive headache.
[['no', 'food', 'on', 'the', 'table', '.']]
6.558% NO SENTIMENT for     no food on the table.
[['@josiew2012', 'thanks']]
59.69% SENTIMENT    for     @josiew2012 thanks
[['bloody', 'damn', 'damness', '!']]
10.39% NO SENTIMENT for     Bloody damn damness!
[['@taylor510ce', 'i', 'love', 'you', '!', ' ', 'hope', 'you', 'have', 'a', 'better', 'day', 'today']]
59.69% SENTIMENT    for     @Taylor510CE I love you!  Hope you have a better day today


In [61]:
for t in textos:
    tok = [t2.text.lower() for t2 in nlp(t)]
    print(tok)
    tmp = txt_field.numericalize(([tok], 1))
    display(f'{tmp[0].size()} from {len(t)} chars / {len(tok)} words')
    display(tmp[0])

['i', "'m", 'so', 'happy', 'tonight']


'torch.Size([5, 1]) from 20 chars / 5 words'

tensor([[  2],
        [  0],
        [ 20],
        [127],
        [129]])

['is', 'slightly', 'hungover', 'after', '@namesnorris', "'s", 'party', 'last', 'night', '.', 'great', 'night', '!']


'torch.Size([13, 1]) from 72 chars / 13 words'

tensor([[  10],
        [1859],
        [2597],
        [ 161],
        [   0],
        [   0],
        [ 297],
        [  98],
        [  78],
        [   0],
        [ 105],
        [  78],
        [   0]])

['summer', 'is', 'finnaly', 'here', '!', '!', '!', '!']


'torch.Size([8, 1]) from 26 chars / 8 words'

tensor([[  242],
        [   10],
        [23393],
        [   92],
        [    0],
        [    0],
        [    0],
        [    0]])

['has', 'a', 'massive', 'headache', '.']


'torch.Size([5, 1]) from 23 chars / 5 words'

tensor([[ 107],
        [   5],
        [1918],
        [ 497],
        [   0]])

['no', 'food', 'on', 'the', 'table', '.']


'torch.Size([6, 1]) from 21 chars / 6 words'

tensor([[  41],
        [ 375],
        [  18],
        [   4],
        [1874],
        [   0]])

In [43]:
txt_field.vocab.itos[10]

'is'

In [57]:
type(list(tok)[0])

spacy.tokens.token.Token

In [50]:
type(tok)

list

In [68]:
txt_field.numericalize??