## Рекуррентные нейронные сети для анализа текста

In [None]:
import pathlib
import os
import itertools
import re

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the aclImdb corpus and of its train/test splits.
base_path = pathlib.Path('../blobs/aclImdb')
train_path = base_path / 'train'
test_path = base_path / 'test'

# Two special tokens are placed in front of the corpus vocabulary, so that
# EOS gets index 0 and UNKNOWN gets index 1.
EOS = '<EOS>'
UNKNOWN = '<UNKNOWN>'

# One vocabulary entry per line. `na_filter=False` keeps entries such as
# "null", "nan" or "n/a" as plain strings instead of turning them into NaN,
# and `.iloc[:, 0]` replaces the `squeeze=True` option removed in pandas 2.0.
vocab = pd.read_csv(
    base_path / 'imdb.vocab',
    header=None,
    dtype=str,
    na_filter=False,
).iloc[:, 0]

# `Series.append` was removed in pandas 2.0; `pd.concat` is the equivalent
# that also works on older releases.
full_vocab = pd.concat([pd.Series([EOS, UNKNOWN]), vocab], ignore_index=True)

# Lookup tables in both directions (`Series.iteritems` was removed in
# pandas 2.0; `Series.items` yields the same (index, value) pairs).
word_to_index = {word: index for index, word in full_vocab.items()}
index_to_word = {index: word for index, word in full_vocab.items()}

In [3]:
# TODO: tf.keras.preprocessing.text.Tokenizer
# TODO: consider punctuation as vocabulary entries?

def get_words(sentence):
    """Split raw review text into a list of lowercase word tokens.

    The text is lowercased, HTML-like tags are removed, every non-word
    character becomes a separator and all digits are dropped.

    Args:
        sentence: The raw text of a review.

    Returns:
        A list of tokens in their original order. It is empty when the
        text contains no word characters.
    """
    contents = sentence.lower()
    # Replace tags such as "<br />" with a space rather than nothing, so the
    # words on either side of a tag are not glued into a single token.
    contents = re.sub(r'</? *\w+ */?>', ' ', contents)
    # Every remaining non-word character (punctuation, newlines, ...) acts as
    # a separator. Underscores count as word characters and are kept.
    contents = re.sub(r'\W', ' ', contents)
    # Digits are deleted in place, so "9th" becomes "th" and a purely numeric
    # token disappears.
    contents = re.sub(r'\d', '', contents)
    return contents.split()

def read_review(filename):
    """Read one review file and encode it as a list of vocabulary indices.

    Args:
        filename: Path of the review text file.

    Returns:
        A list with one integer per token produced by `get_words`. Tokens
        missing from `word_to_index` are mapped to the index of UNKNOWN.
    """
    # Decode explicitly instead of relying on the platform's default
    # encoding, which differs between systems.
    # NOTE(review): assumes the corpus files are UTF-8 - confirm.
    with open(filename, mode='r', encoding='utf-8') as f:
        contents = f.read()

    # Looked up once rather than once per token.
    unknown_index = word_to_index[UNKNOWN]
    return [word_to_index.get(word, unknown_index) for word in get_words(contents)]
    
def get_rating(filename):
    """Return the integer rating encoded in a review's file name.

    The file stem is split on underscores and its second field is parsed
    as an integer, e.g. a stem of "123_7" gives 7.

    Args:
        filename: A path object exposing a `stem` attribute.
    """
    stem_fields = filename.stem.split('_')
    rating_field = stem_fields[1]
    return int(rating_field)

In [4]:
# Fixed review length, in tokens. It is passed as `maxlen` to pad_sequences
# when the train and test sets are built, so longer reviews are truncated and
# shorter ones are padded to this size.
# The average train review size was reported as 256.75 tokens, so 250 sits
# close to the mean.
REVIEW_SIZE = 250

In [5]:
# Collect the positive and negative training reviews. Sorting fixes the file
# order (glob order is filesystem dependent) so that, together with the seeded
# shuffle below, the resulting arrays are identical on every run.
files = sorted((train_path / 'pos').glob('*')) + \
    sorted((train_path / 'neg').glob('*'))
# Fail early with a clear message instead of an IndexError further down.
assert files, 'no training reviews found under {}'.format(train_path)

train_sequences = [read_review(path) for path in files]
train_ratings = [get_rating(path) for path in files]

# Pad (with the EOS index) or truncate every review at its end so that all
# rows have exactly REVIEW_SIZE entries.
train_padded = keras.preprocessing.sequence.pad_sequences(
    train_sequences,
    value=word_to_index[EOS],
    maxlen=REVIEW_SIZE,
    padding='post',
    truncating='post'
)

# The files are listed as all positives followed by all negatives, so mix
# them up. A fixed seed keeps the order reproducible.
x_train, y_train = sklearn.utils.shuffle(
    train_padded, np.array(train_ratings), random_state=42
)

assert len(x_train) == len(y_train)
assert x_train.shape == (len(files), REVIEW_SIZE)
print(len(x_train), 'train samples, each padded to', len(x_train[0]), 'words')

25000 train samples, each padded to 250 words


In [6]:
# Collect the positive and negative test reviews. Sorting fixes the file
# order (glob order is filesystem dependent) so that, together with the seeded
# shuffle below, the resulting arrays are identical on every run.
files = sorted((test_path / 'pos').glob('*')) + \
    sorted((test_path / 'neg').glob('*'))
# Fail early with a clear message instead of an IndexError further down.
assert files, 'no test reviews found under {}'.format(test_path)

test_sequences = [read_review(path) for path in files]
test_ratings = [get_rating(path) for path in files]

# Pad (with the EOS index) or truncate every review at its end so that all
# rows have exactly REVIEW_SIZE entries.
test_padded = keras.preprocessing.sequence.pad_sequences(
    test_sequences,
    value=word_to_index[EOS],
    maxlen=REVIEW_SIZE,
    padding='post',
    truncating='post'
)

# The files are listed as all positives followed by all negatives, so mix
# them up. A fixed seed keeps the order reproducible.
x_test, y_test = sklearn.utils.shuffle(
    test_padded, np.array(test_ratings), random_state=42
)

assert len(x_test) == len(y_test)
assert x_test.shape == (len(files), REVIEW_SIZE)
print(len(x_test), 'test samples, each padded to', len(x_test[0]), 'words')

25000 test samples, each padded to 250 words


In [38]:
def get_processed_review(sentence):
    """Turn a sequence of vocabulary indices back into readable text.

    Each index is looked up in `index_to_word` and the resulting words are
    joined with single spaces.
    """
    decoded_words = (index_to_word[token_index] for token_index in sentence)
    separator = ' '
    return separator.join(decoded_words)

In [39]:
# Sanity check: decode one randomly chosen training review and show it next
# to its rating.
i = np.random.randint(len(x_train))
sample_rating, sample_tokens = y_train[i], x_train[i]
print('Rating:', sample_rating)
print(get_processed_review(sample_tokens))

Rating: 3
there s a lot of good that can be said for this cartoon the backgrounds are rich lushly colored and full of nicely done art deco details the animation is up to the usual studio standards of the time which are unquestionably higher than those of the present day however i find it tedious for a number of reasons the music it s definitely not up to scott bradley s usual standards although it s probably supposed to be evocative of a great gatsby setting it ends up being dreary sleepy repetitious and monotonous repetitious and monotonous are not the same as beethoven s th symphony attests since most people including me tend to close their eyes when they yawn there s a lot of the visual part of the cartoon that will be missed by the average viewer the storyline i m not giving away any secrets that <UNKNOWN> t already in the plot summary country good city bad this is a common theme in films both animated and live from this era it s a misplaced nostalgia for a nonexistent rural idyll 

I was going to try one-hot encoding first, but with a vocabulary of around 90k words the one-hot encoded training set would not fit on the disk drive. A generator that encodes one batch at a time might make this feasible.

In [33]:
# Rating regressor: word indices -> 50-dimensional embeddings -> a GRU whose
# final output (return_sequences=False) feeds a single linear unit.
embedding_layer = keras.layers.Embedding(len(full_vocab), 50, input_length=REVIEW_SIZE)
recurrent_layer = keras.layers.GRU(units=200, return_sequences=False)
output_layer = keras.layers.Dense(1)

model = keras.Sequential()
for layer in (embedding_layer, recurrent_layer, output_layer):
    model.add(layer)

# Adam optimizer with a mean squared error loss; no extra metrics are tracked.
model.compile('adam', 'mse')
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 50)          4476450   
_________________________________________________________________
gru_14 (GRU)                 (None, 200)               150600    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 4,627,251
Trainable params: 4,627,251
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for three epochs on the padded training reviews. The targets are the
# integer ratings parsed from the review file names.
model.fit(x_train, y_train, epochs=3)

In [None]:
# The model is compiled with a loss only and no metrics, so evaluate()
# returns a single scalar (the mean squared error on the test set). Unpacking
# it into two names would raise a TypeError, and this rating regressor has no
# accuracy to report.
score = model.evaluate(x_test, y_test)
print('Test MSE:', score)

In [36]:
# Predicted ratings for every test review; read by the spot-check loop in the
# next cell.
# NOTE(review): `foo` is an uninformative name - renaming it requires
# updating the next cell as well.
foo = model.predict(x_test)

In [60]:
# Spot-check three randomly chosen test reviews: the predicted rating, the
# actual rating and the decoded review text.
for _ in range(3):
    i = np.random.randint(len(x_test))
    predicted_rating = np.squeeze(foo[i])
    actual_rating = y_test[i]
    review_text = get_processed_review(x_test[i])
    print('Predicted Rating:', predicted_rating)
    print('Actual rating:', actual_rating)
    print('\t', review_text)

Predicted Rating: 3.856542
Actual rating: 8
	 something i think some people miss about great science fiction is that it predicts some part of the future no other theatrical movie that i can recall predicted that when the space shuttle went to study halley s comet that a disaster would occur some differences were the churchill the shuttle in the movie actually went to the comet the challenger was only in low earth orbit the churchill was merely burned out inside whereas the challenger exploded hey the vampires had to get back to earth one great similarity and this is always bad luck both had mixed male female crews the legend of halley s comet and disaster continues other than this there is not much more to be said about this movie that hasn t been said before as a outer space science fiction horror sex film individually it provided nothing really new but as in all great dishes it is the combination that counts and taken together this was a highly original and satisfying combination i j