In [4]:
from gensim.models import Word2Vec
import re
import sentencepiece as spm
import smart_open as sm
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from tqdm import tqdm_notebook

In [8]:
# Train a 5000-piece SentencePiece model on the corpus (model_prefix "spm").
# --pad_id=3 reserves a real <pad> piece. SentencePiece disables padding by
# default (pad_id=-1); sp.pad_id() would then return -1, which is not a valid
# row index for the padding done in prepare_text nor a meaningful padding_idx
# for nn.Embedding (a negative padding_idx is counted from the end).
spm.SentencePieceTrainer.Train(
    '--input=python50k.json '
    '--model_prefix=spm '
    '--vocab_size=5000 '
    '--pad_id=3'
)

True

In [9]:
# Load the SentencePiece model file 'spm.model' (the training cell uses
# model_prefix=spm) into a processor shared by the rest of the notebook.
sp=spm.SentencePieceProcessor()
sp.load('spm.model')
# Sanity check: display how the Python keyword 'from' is split into pieces.
sp.EncodeAsPieces('from')

['▁', 'from']

In [27]:
# Tokenise every corpus line into SentencePiece pieces; each line is one
# Word2Vec "sentence". The `with` block closes the corpus file as soon as the
# list is built instead of leaving the handle to the garbage collector.
with open('python50k.json', encoding='utf-8') as corpus:
    sents = [sp.EncodeAsPieces(line.strip()) for line in corpus]

# Train piece-level embeddings with gensim's default hyper-parameters.
w2v = Word2Vec(sents)

# NOTE(review): `binary` is not passed, so gensim's default word2vec format is
# written even though the file name ends in '.bin' — confirm what readers of
# this file expect before changing it.
w2v.wv.save_word2vec_format('w2v_vectors.bin')
emb_size = w2v.wv.vector_size

def _piece_id_to_vect(piece_id):
    """Return the Word2Vec vector for a SentencePiece id.

    Args:
        piece_id: Integer id in the SentencePiece vocabulary of ``sp``.

    Returns:
        A 1-D array of length ``emb_size``: the trained vector when ``w2v``
        has one for the piece, otherwise a float32 vector of zeros.
    """
    piece = sp.id_to_piece(piece_id)
    if piece in w2v.wv:
        return w2v.wv[piece]
    # gensim stores its vectors as float32. A default (float64) zero vector
    # here would promote the whole stacked embedding matrix to float64, which
    # then mismatches float32 torch layers built on top of it.
    return np.zeros(emb_size, dtype=np.float32)

# One row per SentencePiece id, in id order, so that row i of `emb` is the
# vector for piece id i.
piece_vectors = [_piece_id_to_vect(idx) for idx in range(len(sp))]
emb = np.array(piece_vectors)
print(emb)
np.save('vectors.npy', emb)

# Nearest-neighbour probe. EncodeAsPieces may return more than one piece for
# 'from'; every returned piece is passed to most_similar as the positive set.
query_pieces = sp.EncodeAsPieces('from')
w2v.wv.most_similar(query_pieces)


[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.03050349 -0.04217246  0.01144386 ...  0.00616499 -0.02676181
  -0.02751306]
 [ 1.6454339  -1.40280998 -0.43510774 ...  0.25090417  0.91573763
   1.06865394]]


[('about', 0.8200346231460571),
 ('▁out', 0.8200070858001709),
 ('▁import', 0.781629741191864),
 ('some', 0.7804829478263855),
 ('▁get', 0.7751960158348083),
 ('▁mode', 0.7644646167755127),
 ('▁global', 0.7623931169509888),
 ('▁open', 0.7623334527015686),
 ('▁so', 0.7619320750236511),
 ('fixed', 0.7593595385551453)]

In [25]:
# Fixed length (in SentencePiece ids) of every encoded example.
max_seq_len = 120


def prepare_text(text):
    """Encode ``text`` as exactly ``max_seq_len`` SentencePiece ids.

    Longer encodings are cut off after ``max_seq_len`` ids; shorter ones are
    right-padded with whatever id ``sp.pad_id()`` reports.

    Args:
        text: String to encode with the module-level processor ``sp``.

    Returns:
        A 1-D NumPy array holding ``max_seq_len`` ids.
    """
    ids = sp.EncodeAsIds(text)[:max_seq_len]
    padding = [sp.pad_id()] * (max_seq_len - len(ids))
    return np.array(ids + padding)


def prepare_data(file):
    """Encode every line of a text file with ``prepare_text``.

    Args:
        file: Path of a UTF-8 text file; each line is one example.

    Returns:
        A NumPy array with one row per line of the file, each row being the
        array returned by ``prepare_text`` for that line.
    """
    # The `with` block closes the file deterministically. Lines are stripped
    # so they are tokenised from the same text as in the Word2Vec training
    # cell, which also strips each line before encoding.
    with open(file, encoding='utf-8') as lines:
        rows = [prepare_text(line.strip()) for line in lines]
    return np.array(rows)

# Encode the whole corpus: one row of max_seq_len piece ids per line of the file.
X = prepare_data('python50k.json')


In [36]:
# Embedding layer initialised from the Word2Vec matrix.
# * The weights are cast to float32: torch.tensor() keeps NumPy's dtype, and a
#   float64 table would produce float64 activations that do not match the
#   default float32 parameters of layers stacked on top.
# * padding_idx is only set when the tokenizer really defines a pad piece.
#   sp.pad_id() is negative when padding is disabled, and nn.Embedding counts a
#   negative padding_idx from the end, i.e. it would silently mark the last
#   vocabulary entry as padding.
_pad_id = sp.pad_id()
emb_layer = nn.Embedding.from_pretrained(
    torch.tensor(emb, dtype=torch.float32),
    padding_idx=_pad_id if _pad_id >= 0 else None,
)


In [37]:
# Piece ids as an int64 tensor, wrapped in a dataset of single-element tuples.
X = torch.LongTensor(X)
data = TensorDataset(X)

# 70 % train / 20 % test; the validation split receives the remainder so the
# three sizes always add up to the dataset length.
l = X.shape[0]
l_train = int(l * 0.7)
l_test = int(l * 0.2)
split_sizes = [l_train, l_test, l - l_train - l_test]

# NOTE(review): no seed is set in the visible cells, so the split presumably
# differs from run to run — confirm whether that is intended.
train_ds, test_ds, val_ds = random_split(data, split_sizes)

1000


In [33]:
# Bag-of-pieces classifier: pool the pretrained piece vectors of each example
# (EmbeddingBag with its default settings), project the pooled vector to 20
# outputs, and normalise them with a softmax over dim 1.
# NOTE(review): 20 is presumably the number of target classes — confirm.
# NOTE(review): if training uses nn.CrossEntropyLoss, that loss expects raw
# logits rather than softmax outputs — check against the training code, which
# is not visible here.
_layers = [
    nn.EmbeddingBag.from_pretrained(torch.FloatTensor(emb)),
    nn.Linear(emb.shape[1], 20),
    nn.Softmax(dim=1),
]
model = nn.Sequential(*_layers)