In [1]:
from util.tokenization import *
from util.processor import *
import collections

In [2]:
tokenizer = BasicTokenizer()

In [3]:
tokenizer.tokenize("hello world! this is supper!")

['hello', 'world', '!', 'this', 'is', 'supper', '!']

In [4]:
sst5_p = SST5_Processor() # t, d, t
imdb_p = IMDb_Processor() # t, t
yelp5_p = Yelp5_Processor() # t, t
semeval_p = SemEval_Processor() # t, t

In [5]:
all = [sst5_p.get_train_examples("../datasets/SST5")] + \
      [sst5_p.get_dev_examples("../datasets/SST5")] + \
      [sst5_p.get_test_examples("../datasets/SST5")] + \
      [imdb_p.get_train_examples("../datasets/IMDb")] + \
      [imdb_p.get_test_examples("../datasets/IMDb")] + \
      [yelp5_p.get_train_examples("../datasets/Yelp5", sentence_limit=25000)] + \
      [yelp5_p.get_test_examples("../datasets/Yelp5")] + \
      [semeval_p.get_train_examples("../datasets/SemEval")] + \
      [semeval_p.get_test_examples("../datasets/SemEval")]
      

sentence limit= None
0
guid= train-0
text_a= the
text_b= None
label= 2
sentence limit= None
0
guid= dev-0
text_a= in his first stab at the form , jacquot takes a slightly anarchic approach that works only sporadically .
text_b= None
label= 2
sentence limit= None
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1
sentence limit= None
0
guid= train-0
text_a= Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple 

In [6]:
flat_list = [item for sublist in all for item in sublist]

In [7]:
all_sentence = [e.text_a for e in flat_list]

In [9]:
len(all_sentence)

334694

In [8]:
ts_all = set([])
for s in all_sentence:
    ts = tokenizer.tokenize(s)
    for t in ts:
        ts_all.add(t)

In [44]:
len(ts_all)

337119

In [10]:
import numpy as np

def glove2dict(src_filename):
    """
    GloVe reader.
    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.
    Returns
    -------
    dict
        Mapping words to their GloVe vectors as `np.array`.
    """
    # This distribution has some words with spaces, so we have to
    # assume its dimensionality and parse out the lines specially:
    if '840B.300d' in src_filename:
        line_parser = lambda line: line.rsplit(" ", 300)
    else:
        line_parser = lambda line: line.strip().split()
    data = {}
    with open(src_filename, encoding='utf8') as f:
        while True:
            try:
                line = next(f)
                line = line_parser(line)
                data[line[0]] = np.array(line[1: ], dtype=np.float)
            except StopIteration:
                break
            except UnicodeDecodeError:
                pass
    return data

In [11]:
glove = glove2dict("../../GloVe/glove.840B/glove.840B.300d.txt")

In [12]:
len(glove)

2196016

In [13]:
vocab_dict = collections.OrderedDict()
non_exist_ts = set([])
for t in ts_all:
    if t in glove.keys():
        vocab_dict[t] = glove[t]
    else:
        non_exist_ts.add(t)

In [14]:
len(vocab_dict)

108835

In [76]:
vocab_dict

OrderedDict([('kiddush',
              array([-4.2522e-01,  2.5471e-01,  6.1219e-01,  4.0665e-01, -5.1317e-02,
                     -2.6050e-01,  4.8517e-01, -2.9007e-02,  5.4613e-01, -5.1288e-01,
                     -5.1953e-02,  4.4210e-01,  1.0873e+00,  1.6649e-01,  1.6339e-01,
                      1.0735e-01, -2.7459e-01, -4.3407e-01,  3.3759e-01, -1.5466e-01,
                     -9.6561e-02, -7.5467e-02,  1.1108e-01,  2.4608e-01,  4.6201e-01,
                      2.1942e-01, -3.9768e-01,  2.9705e-01, -5.0405e-01,  9.6622e-01,
                     -1.6095e-02, -4.8307e-01, -2.7175e-01,  4.3973e-02, -1.8183e-01,
                      2.0997e-01,  5.1578e-01, -3.8951e-01,  5.2156e-01,  8.9314e-01,
                      1.8892e-01, -5.2435e-01,  2.0873e-01,  1.6428e-01,  4.1112e-01,
                      1.0539e+00,  1.6706e-01, -4.2245e-01, -3.0783e-01,  6.2439e-01,
                     -9.4917e-01,  5.8393e-01, -9.6142e-02, -7.1007e-01,  1.1620e-01,
                     -5.7776e

In [15]:
vocab_new = []
embeddings = []
for word, e in vocab_dict.items():
    vocab_new.append(word)
    embeddings.append(e)

In [22]:
vocab_new = ["[UNK]", "[PAD]"] + vocab_new

In [24]:
import torch
embeddings = torch.tensor(embeddings)

In [33]:
unk_embeddings = embeddings.mean(dim=0, keepdim=True)

In [35]:
pad_embeddings = torch.tensor([[0.]*300], dtype=torch.float64)
pad_embeddings

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [37]:
embeddings = torch.cat([unk_embeddings, pad_embeddings, embeddings], dim=0)

In [38]:
embeddings.shape

torch.Size([108837, 300])

In [45]:
import torch.nn as nn
embedding_layer = nn.Embedding.from_pretrained(embeddings, freeze=True)

In [47]:
torch.save(embedding_layer, "../models/Transformer/pytorch_word_embeddings_model.bin")

In [43]:
with open("../models/Transformer/vocab.txt", "w") as f:
    for w in vocab_new:
        f.write(w)
        f.write("\n")

In [41]:
torch.save()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [1]:
import torch

In [2]:
embedding_layer = torch.load("../models/Transformer/pytorch_word_embeddings_model.bin")

In [4]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.1385, -0.0748,  0.0950,  ...,  0.0825, -0.0840, -0.0796],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3692,  0.5183, -0.2845,  ..., -0.2362, -0.0815, -0.6116],
        ...,
        [ 0.7157, -0.3444, -0.4759,  ..., -0.4876, -0.0403,  0.5718],
        [ 0.1683,  0.6757, -0.0449,  ...,  0.3907, -0.6180,  0.0778],
        [-0.5694,  0.0029, -0.2453,  ..., -0.0532, -0.4128,  0.0395]],
       dtype=torch.float64)

In [6]:
unk = embedding_layer.weight[0].clone()

In [7]:
pad = embedding_layer.weight[1].clone()

In [8]:
embedding_layer.weight[0] = pad
embedding_layer.weight[1] = unk

In [9]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1385, -0.0748,  0.0950,  ...,  0.0825, -0.0840, -0.0796],
        [-0.3692,  0.5183, -0.2845,  ..., -0.2362, -0.0815, -0.6116],
        ...,
        [ 0.7157, -0.3444, -0.4759,  ..., -0.4876, -0.0403,  0.5718],
        [ 0.1683,  0.6757, -0.0449,  ...,  0.3907, -0.6180,  0.0778],
        [-0.5694,  0.0029, -0.2453,  ..., -0.0532, -0.4128,  0.0395]],
       dtype=torch.float64)

In [10]:
torch.save(embedding_layer, "../models/Transformer/pytorch_word_embeddings_model.bin")

In [None]:
imdb_p = IMDb_Processor() # t, t