In [3]:
from gensim.models import Word2Vec
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import numpy as np


In [4]:
spm.SentencePieceTrainer.Train('--input=union.txt \
                               --model_prefix=sp \
                               --vocab_size=200')

True

In [5]:
sp=spm.SentencePieceProcessor()
sp.load('sp.model')
sp.EncodeAsPieces('from')

['▁from']

In [6]:
sents = [sp.EncodeAsPieces(line.strip())
         for line in open('union.txt', encoding='utf-8')]
w2v = Word2Vec(sents)

w2v.wv.save_word2vec_format('w2v_vectors.bin')
emb_size = w2v.wv.vector_size

def _piece_id_to_vect(piece_id):
    piece = sp.id_to_piece(piece_id)
    if piece in w2v.wv:
        return w2v.wv[piece]
    return np.zeros((emb_size,))

emb = np.array([_piece_id_to_vect(piece_id) for piece_id in range(0, len(sp))])
print(emb)
np.save('vectors.npy', emb)
w2v.wv.most_similar(sp.EncodeAsPieces('def'))


[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.39919499  0.11708833 -0.0087068  ... -0.09696119 -0.03343872
   0.29463592]
 [ 0.36975488  0.06408616 -0.0073666  ... -0.04508707 -0.05605901
   0.27877277]
 [-0.18721144 -0.38798666  0.00871657 ... -0.14950565  0.06206815
  -0.10428812]]


[('self', 0.9435262680053711),
 ('):', 0.9322147369384766),
 ('key', 0.8546724915504456),
 ('(', 0.851710855960846),
 ('▁self', 0.8409487009048462),
 ('pack', 0.8400065898895264),
 ('time', 0.8267315626144409),
 ('▁return', 0.8126473426818848),
 ('connection', 0.811102032661438),
 ('endpoint', 0.8006761074066162)]

In [7]:
max_seq_len = 2 * emb.shape[1]

def prepare_text(text):
    pieces = sp.EncodeAsIds(text)
    if len(pieces) > max_seq_len:
        pieces = pieces[:max_seq_len]
    to_add = (max_seq_len - len(pieces))
    pieces = pieces + to_add * [sp.pad_id()]
    return np.array(pieces)


def prepare_data(file1, file2, met1, met2):
    X = []
    y = []
    for line in open(file1, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met1)
    for line in open(file2, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met2)
    return np.array(X), np.array(y)

X, y = prepare_data('pos.txt', 'neg.txt', 1, 0)


In [8]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(emb), padding_idx=sp.pad_id())


In [9]:
X = torch.LongTensor(X)
y = torch.LongTensor(y)
print(y)
print(X)
print(y.size())
l = X.size(0)
print(X.size())
l_train, l_test = int(l * 0.7), int(l * 0.2)

data = TensorDataset(X,y)
train_ds, test_ds, val_ds = random_split(data, [l_train, l_test, l - l_train - l_test])


tensor([1, 1, 1,  ..., 0, 0, 0])
tensor([[  3,  99, 190,  ...,  -1,  -1,  -1],
        [108,   3, 110,  ...,  -1,  -1,  -1],
        [  3, 110,   7,  ...,  -1,  -1,  -1],
        ...,
        [ 20,  85,  45,  ...,  -1,  -1,  -1],
        [  3,  12,  32,  ...,  -1,  -1,  -1],
        [ 85,  45,   7,  ...,  -1,  -1,  -1]])
torch.Size([9517])
torch.Size([9517, 200])


In [36]:
emb = np.array([_piece_id_to_vect(piece_id) for piece_id in range(0, len(sp))])


class Lin(torch.nn.Module):
    def __init__(self, input):
        super(Lin, self).__init__()
        self.embedding = nn.Embedding(200, 100)
        self.linear = nn.Linear(100, 200)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(200, 2)

    def forward(self, x):
        out = torch.sigmoid(self.linear(x))
        return out


input = emb.shape
model = Lin(input)


(200, 100)


In [39]:
def train_model(model, train_data, max_epochs=20):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss = nn.CrossEntropyLoss()
    train_loader = DataLoader(dataset=train_data, batch_size=10, shuffle=True)
    for epoch in range(max_epochs):
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            model = model.train()
            y_pred = model(X_batch.float())
            loss1 = loss(y_pred, y_batch)
            total += y_batch.size(0)
            correct += (y_pred.argmax(1) == y_batch).sum().item()
            loss1.backward()
            optimizer.step()
            optimizer.zero_grad()
        if epoch % 3 == 2:
            acc = correct / total
            print(f'Epoch = {epoch}, acc = {acc}, loss = {loss1}')
train_model(model, train_ds)
PATH = 'text'
torch.save(model.state_dict(), PATH)



Epoch = 2, acc = 0.4230596006605615, loss = 0.6931471824645996


Epoch = 5, acc = 0.4230596006605615, loss = 0.6931471824645996


Epoch = 8, acc = 0.4230596006605615, loss = 0.6931471824645996


Epoch = 11, acc = 0.4230596006605615, loss = 0.6931471824645996


Epoch = 14, acc = 0.4230596006605615, loss = 0.6931471824645996


Epoch = 17, acc = 0.4230596006605615, loss = 0.6931471824645996


In [45]:
text = 'import from'
enc = sp.EncodeAsIds(text)
outputs = model(torch.FloatTensor(enc))
_, predicted = torch.max(outputs, 1)
print(predicted)


RuntimeError: size mismatch, m1: [1 x 2], m2: [200 x 2] at ../aten/src/TH/generic/THTensorMath.cpp:752