In [1]:
pip install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |███████                         | 10kB 26.9MB/s eta 0:00:01[K     |██████████████▏                 | 20kB 3.4MB/s eta 0:00:01[K     |█████████████████████▎          | 30kB 4.6MB/s eta 0:00:01[K     |████████████████████████████▍   | 40kB 4.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.0MB/s 
[?25hCollecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 10.3MB/s

In [2]:
pip install stop_words

Collecting stop_words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-cp36-none-any.whl size=32917 sha256=122caa22a32202c94e7d4ce7b9407f1fd0b3abc6f5699ca7dc22f3a0d4de811b
  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [29]:
import pandas as pd
import numpy as np

from string import punctuation
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import re
import nltk

from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, LSTM, GRU, Activation, Dropout, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [7]:
df = pd.read_excel('/content/отзывы за лето.xls')

df.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [58]:
punct = set(punctuation)
stop_words_set = set(get_stop_words('ru') + get_stop_words('en'))
ma = MorphAnalyzer()

def preprocess(txt):
  r = str(txt).lower()
  r = ''.join(t for t in r if t not in punct)
  
  r = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9]+', ' ', r)

  r = " ".join([ma.parse(word)[0].normal_form for word in r.split() if word not in stop_words_set])
  
  r = re.sub(r'не', 'не\s', r)
  r = re.sub(r'\snot', 'not', r)

  r = r.strip()

  return r

In [59]:
df['clear_content'] = df['Content'].apply(preprocess)
df.drop(df[df['clear_content'] == ''].index, inplace=True)

df['target'] = df['Rating'] > 3
df['target'] = df['target'].astype('int')

In [60]:
df.head()

Unnamed: 0,Rating,Content,Date,clear_content,target
0,4,It just works!,2017-08-14,just works,1
1,3,В целом удобноное приложение...из минусов хотя...,2017-08-14,целое удобноной приложениеиз минус хотеть боль...,0
2,4,Отлично все,2017-08-14,отлично,1
3,4,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14,зависать 1 работа антивирус ране\sе пользовать...,1
4,4,"Очень удобно, работает быстро.",2017-08-14,удобно работать быстро,1


In [61]:
df_train, df_val = train_test_split(df, train_size=0.7)

In [62]:
corpus_train = df_train['clear_content'].values
corpus_valid = df_val['clear_content'].values

In [63]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(corpus_train)

In [64]:
sequences_train = tokenizer.texts_to_sequences(corpus_train)
sequences_val = tokenizer.texts_to_sequences(corpus_valid)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [65]:
y_train = df_train['target']
y_valid = df_val['target']

In [66]:
batch_size = 512
epochs = 50

## Испытываем SimpleRNN

In [67]:
model = Sequential()

model.add(Embedding(input_dim=word_count, input_length=training_length, output_dim=30, trainable=True, mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [68]:
early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(
    X_train, 
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [69]:
score = model.evaluate(X_valid, y_valid, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.3390660285949707
Test accuracy: 0.8699581623077393


Неплохо (в моём случае, даже лучше CNN)

## Пробуем LSTM

In [70]:
model = Sequential()

model.add(Embedding(input_dim=word_count, input_length=training_length, output_dim=30, trainable=True, mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [71]:
early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(
    X_train, 
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [72]:
score = model.evaluate(X_valid, y_valid, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.3283085525035858
Test accuracy: 0.8794978857040405


Обучается медленнее, нежели SimpleRNN. Качество немного выше

## Пробуем GRU

In [76]:
model = Sequential()

model.add(Embedding(input_dim=word_count, input_length=training_length, output_dim=30, trainable=True, mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [74]:
early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(
    X_train, 
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [75]:
score = model.evaluate(X_valid, y_valid, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.34715941548347473
Test accuracy: 0.8716318011283875


Обучается чуть быстрее LSTM. Качество примерно то же, что у LSTM