In [70]:
from gensim.models import Word2Vec
import re
import sentencepiece as spm
import smart_open as sm
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np
from tqdm import tqdm_notebook

In [71]:
spm.SentencePieceTrainer.Train('--input=union.txt \
                               --model_prefix=sp \
                               --vocab_size=200')

True

In [72]:
sp=spm.SentencePieceProcessor()
sp.load('sp.model')
sp.EncodeAsPieces('from')

['▁from']

In [79]:
sents = [sp.EncodeAsPieces(line.strip())
         for line in open('union.txt', encoding='utf-8')]
w2v = Word2Vec(sents)

w2v.wv.save_word2vec_format('w2v_vectors.bin')
emb_size = w2v.wv.vector_size

def _piece_id_to_vect(piece_id):
    piece = sp.id_to_piece(piece_id)
    if piece in w2v.wv:
        return w2v.wv[piece]
    return np.zeros((emb_size,))

emb = np.array([_piece_id_to_vect(piece_id) for piece_id in range(0, len(sp))])
print(emb)
np.save('vectors.npy', emb)
w2v.wv.most_similar(sp.EncodeAsPieces('def'))


[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.17007419 -0.16844232 -0.15023306 ...  0.04665544 -0.1033684
  -0.11344565]
 [-0.07886244 -0.18945538 -0.173351   ...  0.03017846 -0.07857458
  -0.03633285]
 [-0.13162169 -0.33412799 -0.3873907  ... -0.16333987 -0.06541425
   0.20294599]]


[('self', 0.9655777812004089),
 ('endpoint', 0.9486289024353027),
 ('):', 0.9476622939109802),
 ('key', 0.9018864631652832),
 ('(', 0.8716409206390381),
 ('▁self', 0.8660987615585327),
 ('time', 0.8484920263290405),
 ('▁return', 0.8398476839065552),
 ('pack', 0.8293720483779907),
 (',', 0.79929119348526)]

In [136]:
max_seq_len = 2 * emb.shape[1]

def prepare_text(text):
    pieces = sp.EncodeAsIds(text)
    if len(pieces) > max_seq_len:
        pieces = pieces[:max_seq_len]
    to_add = (max_seq_len - len(pieces))
    pieces = pieces + to_add * [sp.pad_id()]
    return np.array(pieces)


def prepare_data(file1, file2, met1, met2):
    X = []
    y = []
    for line in open(file1, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met1)
    for line in open(file2, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met2)
    return np.array(X), np.array(y)

X, y = prepare_data('pos.txt', 'neg.txt', 1, 0)


In [137]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(emb), padding_idx=sp.pad_id())


In [138]:
X = torch.LongTensor(X)
y = torch.LongTensor(y)
print(y)
print(X)
print(y.size())
l = X.size(0)
print(X.size())
l_train, l_test = int(l * 0.7), int(l * 0.2)

data = TensorDataset(X,y)
train_ds, test_ds, val_ds = random_split(data, [l_train, l_test, l - l_train - l_test])


tensor([1, 1, 1,  ..., 0, 0, 0])
tensor([[  3, 100, 192,  ...,  -1,  -1,  -1],
        [107,   3, 114,  ...,  -1,  -1,  -1],
        [  3, 114,   6,  ...,  -1,  -1,  -1],
        ...,
        [ 20,  87,  46,  ...,  -1,  -1,  -1],
        [  3,  13,  31,  ...,  -1,  -1,  -1],
        [ 87,  46,   6,  ...,  -1,  -1,  -1]])
torch.Size([9517])
torch.Size([9517, 200])


In [143]:
emb = np.array([_piece_id_to_vect(piece_id) for piece_id in range(0, len(sp))])
print(emb.shape)


class Lin(torch.nn.Module):
    def __init__(self, input):
        super(Lin, self).__init__()
        self.embedding = nn.Embedding(200, 100)
        self.linear = nn.Linear(200, 2)

    def forward(self, x):
        out = torch.sigmoid(self.linear(x))
        return out


input = emb.shape
model = Lin(input)


(200, 100)


In [145]:
def train_model(model, train_data, max_epochs=20):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss = nn.CrossEntropyLoss()
    train_loader = DataLoader(dataset=train_data, batch_size=1, shuffle=True)
    for epoch in range(max_epochs):
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            model = model.train()
            y_pred = model(X_batch.float())
            loss1 = loss(y_pred, y_batch)
            total += y_batch.size(0)
            correct += (y_pred.argmax(1) == y_batch).sum().item()
            loss1.backward()
            optimizer.step()
            optimizer.zero_grad()
        if epoch % 3 == 2:
            acc = correct / total
            print(f'Epoch = {epoch}, acc = {acc}, loss = {loss1}')
train_model(model, train_ds)


Epoch = 2, acc = 0.47740579492568685, loss = 0.6931471824645996


Epoch = 5, acc = 0.5060801681429214, loss = 0.6931471824645996


Epoch = 8, acc = 0.5056297853175199, loss = 0.31326165795326233
