In [39]:
from gensim.models import Word2Vec
from tokenizers import BertWordPieceTokenizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, random_split
import numpy as np


In [40]:
tokenizer = BertWordPieceTokenizer()


In [41]:
tokenizer.train(['union.txt'])
tokenizer.save('/Users/mariiaaksenova/PycharmProjects/python-autofill','tokenizer1')
encoded = tokenizer.encode('from')
encoded.tokens

['from']

In [47]:
sents = []
i = 0
line1 = ''
with open('union.txt', 'r') as f:
    for line in f:
        i += 1
        line1 = line1 + line
        if i % 31 == 0:
            sents.append(tokenizer.encode(line1).tokens)
            line1 = ''
w2v = Word2Vec(sents)

w2v.wv.save_word2vec_format('w2v_vectors.bin')
emb_size = w2v.wv.vector_size

def _piece_id_to_vect(piece_id):
    piece = piece_id.tokens
    for smallpiece in piece:
        if smallpiece in w2v.wv:
            return w2v.wv[smallpiece]
        return np.zeros((emb_size,))

emb = np.array([_piece_id_to_vect(tokenizer.encode(line)) for line in open('tokenizer1-vocab.txt', 'r')])
print(emb)
np.save('vectors.npy', emb)


[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.64661413 -0.12680596 -0.55052567 ... -0.65218377 -0.52517068
   0.62986505]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


In [48]:
def prepare_text(text):
    pieces = tokenizer.encode(text).ids
    if len(pieces) > 120:
        pieces = pieces[:120]
    else:
        for i in range(0, 120 - len(pieces)):
            pieces.append(0)
    return np.array(pieces)


def prepare_data(file1, file2, met1, met2):
    X = []
    y = []
    for line in open(file1, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met1)
    for line in open(file2, encoding='utf-8'):
        X.append(prepare_text(line))
        y.append(met2)
    return np.array(X), np.array(y)

X, y = prepare_data('pos.txt', 'neg.txt', 1, 0)
print(X)
print(y)


[[   7    5   18 ...    0    0    0]
 [   6    6    6 ...    0    0    0]
 [2179   41   41 ...    0    0    0]
 ...
 [  32  577   40 ...    0    0    0]
 [ 565   40  626 ...    0    0    0]
 [ 577   40  440 ...    0    0    0]]
[1 1 1 ... 0 0 0]


In [49]:
emb_layer = nn.Embedding.from_pretrained(torch.tensor(emb))


In [50]:
X = torch.LongTensor(X)
y = torch.LongTensor(y)
print(y)
print(X)
print(y.size())
l = X.size(0)
print(X.size())
l_train, l_test = int(l * 0.7), int(l * 0.2)

data = TensorDataset(X, y)
train_ds, test_ds, val_ds = random_split(data, [l_train, l_test, l - l_train - l_test])


tensor([1, 1, 1,  ..., 0, 0, 0])
tensor([[   7,    5,   18,  ...,    0,    0,    0],
        [   6,    6,    6,  ...,    0,    0,    0],
        [2179,   41,   41,  ...,    0,    0,    0],
        ...,
        [  32,  577,   40,  ...,    0,    0,    0],
        [ 565,   40,  626,  ...,    0,    0,    0],
        [ 577,   40,  440,  ...,    0,    0,    0]])
torch.Size([9517])
torch.Size([9517, 120])


In [64]:
class Lin(torch.nn.Module):
    def __init__(self, input):
        super(Lin, self).__init__()
        self.embedding = nn.Embedding(100,2638)
        self.linear = nn.Linear(100, 2)

    def forward(self, *args):
        result = 0
        for arg in args:
            result += self.embedding(arg)
        out = torch.sigmoid(self.linear(result))
        return out


input = emb.shape
print(input)
model = Lin(input)


(2638, 100)


In [67]:
model = nn.Sequential(nn.EmbeddingBag.from_pretrained(torch.FloatTensor(emb)),
                      nn.Linear(emb.shape[1], 20),
                      nn.Linear(20, 2),
                      nn.Softmax(dim=1))

In [69]:
def train_model(model, train_data, max_epochs=20):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss = nn.CrossEntropyLoss()
    train_loader = DataLoader(dataset=train_data, batch_size=10, shuffle=True)
    for epoch in range(max_epochs):
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            model = model.train()
            y_pred = model(X_batch)
            loss1 = loss(y_pred, y_batch)
            total += y_batch.size(0)
            correct += (y_pred.argmax(1) == y_batch).sum().item()
            loss1.backward()
            optimizer.step()
            optimizer.zero_grad()
        if epoch % 3 == 2:
            acc = correct / total
            print(f'Epoch = {epoch}, acc = {acc}, loss = {loss1}')
train_model(model, train_ds)
PATH = 'text'
torch.save(model.state_dict(), PATH)



Epoch = 2, acc = 0.5808437171595856, loss = 0.5428919792175293


Epoch = 5, acc = 0.5808437171595856, loss = 0.5368549227714539


Epoch = 8, acc = 0.5808437171595856, loss = 0.5479018092155457


Epoch = 11, acc = 0.5808437171595856, loss = 0.5291348099708557


Epoch = 14, acc = 0.5808437171595856, loss = 0.8719375133514404


Epoch = 17, acc = 0.5808437171595856, loss = 0.8611154556274414


RuntimeError: size mismatch, m1: [1 x 2], m2: [200 x 2] at ../aten/src/TH/generic/THTensorMath.cpp:752