In [1]:
import numpy as np
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
from gensim.models import KeyedVectors

uk_vectors_file = '../data/vectors/news.lowercased.tokenized.word2vec.300d'
uk_vectors = KeyedVectors.load_word2vec_format(uk_vectors_file, binary=False)

In [3]:
from repository import Document, Repository1551

In [4]:
# обраховано в попередньому завданні (відфільтрована рос. мова, також є поле з лематизованим текстом)
r1551 = Repository1551.load('../data/r1551.pickle')

In [5]:
x_train, x_test, y_train, y_test = train_test_split(r1551.lems, r1551.labels, test_size=0.2, stratify=r1551.labels)

In [6]:
import regex as re

words = set()
for l in r1551.lems:
    words.update([x for x in l.split(' ') if re.match(r'\p{L}+', x)])

In [7]:
vocab_size = 1000
embedding_dims = 300
maxlen = int(np.percentile([len(t) for t in r1551.texts], 95))

In [8]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_matrix(x_train)
x_test = tokenizer.texts_to_matrix(x_test)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

In [9]:
le = LabelEncoder()
le.fit(r1551.labels)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [10]:
embedding_matrix = np.zeros((vocab_size, embedding_dims))

for word, index in tokenizer.word_index.items():
    if index > vocab_size - 1:
        break
    else:
        try:
            embedding_vector = uk_vectors.get_vector(word)
        except KeyError:
            pass
        else:
            embedding_matrix[index] = embedding_vector

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, LSTM
from keras.layers import Conv1D, Flatten, MaxPooling1D


model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(177, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1316, 300)         300000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 394800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               101069056 
_________________________________________________________________
dense_2 (Dense)              (None, 177)               45489     
Total params: 101,414,545
Trainable params: 101,114,545
Non-trainable params: 300,000
_________________________________________________________________


In [12]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=1,
          validation_data=(x_test, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 46565 samples, validate on 11642 samples
Epoch 1/1


<keras.callbacks.History at 0x7f4087220b70>

### Max Pooling

In [15]:
from keras.layers import GlobalMaxPool1D

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(GlobalMaxPool1D())
model.add(Dense(256, activation='relu'))
model.add(Dense(177, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1316, 300)         300000    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               77056     
_________________________________________________________________
dense_4 (Dense)              (None, 177)               45489     
Total params: 422,545
Trainable params: 122,545
Non-trainable params: 300,000
_________________________________________________________________


In [18]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=5,
          validation_data=(x_test, y_test))

Train on 46565 samples, validate on 11642 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f406ab066a0>

### CNN

In [19]:
from keras.layers import Conv1D

In [20]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dims, input_length=maxlen, weights=[embedding_matrix], trainable=False))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(256, activation='relu'))
model.add(Dense(177, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1316, 300)         300000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1312, 64)          96064     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               16640     
_________________________________________________________________
dense_6 (Dense)              (None, 177)               45489     
Total params: 458,193
Trainable params: 158,193
Non-trainable params: 300,000
_________________________________________________________________


In [21]:
model.fit(x_train, y_train,
          batch_size=128,
          epochs=1,
          validation_data=(x_test, y_test))

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 46565 samples, validate on 11642 samples
Epoch 1/1


<keras.callbacks.History at 0x7f40586be160>