## Рекуррентные нейронные сети для анализа текста

In [None]:
import itertools
import os
import pathlib
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Dataset location; the train/ and test/ directories each hold pos/ and neg/ reviews.
base_path = pathlib.Path('../blobs/aclImdb')
train_path = base_path / 'train'
test_path = base_path / 'test'

# Special tokens placed in front of the vocabulary: EOS gets index 0 and is also
# used as the padding value, UNKNOWN gets index 1 and stands in for any word that
# is not in the vocabulary.
EOS = '<EOS>'
UNKNOWN = '<UNKNOWN>'

# The vocabulary file is read as a single headerless column.
# dtype=str / na_filter=False keep entries such as "null" or "nan" (if present)
# as literal words instead of letting pandas convert them to missing values.
# DataFrame.squeeze('columns') replaces the `squeeze=True` argument, which was
# removed from read_csv in pandas 2.0.
vocab = pd.read_csv(
    base_path / 'imdb.vocab', header=None, dtype=str, na_filter=False
).squeeze('columns')
# pd.concat replaces Series.append, which was removed in pandas 2.0.
full_vocab = pd.concat([pd.Series([EOS, UNKNOWN]), vocab], ignore_index=True)

# Lookup tables in both directions (Series.items replaces the removed iteritems).
word_to_index = {word: index for index, word in full_vocab.items()}
index_to_word = dict(full_vocab.items())

In [3]:
# TODO: tf.keras.preprocessing.text.Tokenizer
# TODO: consider punctuation as vocabulary entries?

def get_words(sentence):
    """Split raw review text into lowercase word tokens.

    HTML tags and every non-word character (punctuation, quotes, ...) act as
    word separators; digits are dropped.

    Args:
        sentence: Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns:
        A list of lowercase tokens; empty if the text has no word characters.
    """
    contents = sentence.lower()
    # Replace tags with a space rather than nothing, so that text such as
    # "great<br />movie" does not collapse into the single token "greatmovie".
    contents = re.sub(r'</? *\w+ */?>', ' ', contents)
    # Any remaining non-word character becomes a separator.
    contents = re.sub(r'\W', ' ', contents)
    # Digits are removed outright (not replaced by a space).
    contents = re.sub(r'\d', '', contents)
    return contents.split()

def read_review(filename):
    """Read one review file and encode it as a list of vocabulary indices.

    Args:
        filename: Path to a plain-text review file.

    Returns:
        A list with one index per token produced by ``get_words``; tokens that
        are missing from ``word_to_index`` map to the index of ``UNKNOWN``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding (assumes the review files are UTF-8 — TODO confirm).
    with open(filename, mode='r', encoding='utf-8') as f:
        contents = f.read()
    # Look the fallback index up once instead of once per token.
    unknown_index = word_to_index[UNKNOWN]
    return [word_to_index.get(word, unknown_index) for word in get_words(contents)]
    
def get_rating(filename):
    """Extract the integer rating embedded in a review's file name.

    The second underscore-separated field of the path's stem is parsed as the
    rating, e.g. a file named ``123_8.txt`` yields ``8``.
    """
    fields = filename.stem.split('_')
    return int(fields[1])

In [4]:
# The average train review size is 256.75 words, so every review is truncated
# or padded to 250 words (this value is passed as `maxlen` when padding).
REVIEW_SIZE = 250

In [5]:
# Collect every training review file: positives first, then negatives.
files = [*(train_path / 'pos').glob('*'), *(train_path / 'neg').glob('*')]
# Encode each review as vocabulary indices; the rating comes from the file name.
x_train = list(map(read_review, files))
y_train = list(map(get_rating, files))

# Bring all reviews to exactly REVIEW_SIZE indices: longer ones lose their tail,
# shorter ones are filled at the end with the EOS index.
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train,
    maxlen=REVIEW_SIZE,
    padding='post',
    truncating='post',
    value=word_to_index[EOS],
)

# Shuffle samples and labels together so positives and negatives are interleaved.
x_train, y_train = sklearn.utils.shuffle(np.array(x_train), np.array(y_train))

# Sanity checks: one label per review, and every review has the same length.
assert len(x_train) == len(y_train)
assert all(len(row) == len(x_train[0]) for row in x_train)
print(len(x_train), 'train samples, each padded to', len(x_train[0]), 'words')

25000 train samples, each padded to 250 words


In [6]:
# Collect every test review file: positives first, then negatives.
files = [*(test_path / 'pos').glob('*'), *(test_path / 'neg').glob('*')]
# Encode each review as vocabulary indices; the rating comes from the file name.
x_test = list(map(read_review, files))
y_test = list(map(get_rating, files))

# Bring all reviews to exactly REVIEW_SIZE indices: longer ones lose their tail,
# shorter ones are filled at the end with the EOS index.
x_test = keras.preprocessing.sequence.pad_sequences(
    x_test,
    maxlen=REVIEW_SIZE,
    padding='post',
    truncating='post',
    value=word_to_index[EOS],
)

# Shuffle samples and labels together so positives and negatives are interleaved.
x_test, y_test = sklearn.utils.shuffle(np.array(x_test), np.array(y_test))

# Sanity checks: one label per review, and every review has the same length.
assert len(x_test) == len(y_test)
assert all(len(row) == len(x_test[0]) for row in x_test)
print(len(x_test), 'test samples, each padded to', len(x_test[0]), 'words')

25000 test samples, each padded to 250 words


In [7]:
def get_processed_review(sentence):
    """Turn a sequence of vocabulary indices back into space-separated text.

    Each index is looked up in ``index_to_word``, so padding entries show up
    as the literal ``<EOS>`` token.
    """
    words = [index_to_word[index] for index in sentence]
    return ' '.join(words)

In [8]:
# Spot-check the preprocessing on one randomly chosen training review.
sample_index = np.random.randint(len(x_train))
sample_review, sample_rating = x_train[sample_index], y_train[sample_index]
print('Rating:', sample_rating)
print(get_processed_review(sample_review))

Rating: 8
a cry in the dark is a masterful piece of cinema haunting and incredibly though provoking the true story of lindy chamberland who in witnessed a horrific sight seeing her month old baby being brutally taken from their family s tent while camping on the austrailian outback azaria the baby was never seen again and the result of her horrendous disappearance caused a true life frenzy all around the world meryl streep does immaculate justice to the role of lindy as she always does but the one thing that helps a cry in the dark never fall flat is the brilliant direction a truly inspired and accurate outlook on this baffeling case tears are brought to the eyes the concept is nothing less then terrifying and afterwards you are left haunted but also inspired <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <E

I was going to try one-hot encoding first, just for the sake of it,
but with a vocabulary of around 90k words the one-hot-encoded training set would not even fit on the disk drive. Would generators that encode batches on the fly help?

In [10]:
# Rating regressor: 50-dimensional learned word embeddings feed a bidirectional
# GRU (200 units per direction, final state only); a single ELU unit produces
# the predicted rating. Trained with Adam on mean squared error.
model = keras.Sequential()
model.add(keras.layers.Embedding(len(full_vocab), 50, input_length=REVIEW_SIZE))
model.add(keras.layers.Bidirectional(keras.layers.GRU(units=200, return_sequences=False)))
model.add(keras.layers.Dense(1, activation='elu'))
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 50)           4476450   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               301200    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 4,778,051
Trainable params: 4,778,051
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train on the full training set for three passes; the returned History object
# is left as the cell's output.
model.fit(x=x_train, y=y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc5b43f8fd0>

The kernel was interrupted during training; the log below is from that run, which was stopped early in the third epoch.

<pre>
Epoch 1/3
25000/25000 [==============================] - 439s 18ms/sample - loss: 9.6092
Epoch 2/3
25000/25000 [==============================] - 453s 18ms/sample - loss: 4.0583
Epoch 3/3
 1184/25000 [>.............................] - ETA: 7:27 - loss: 2.6545</pre>

In [40]:
# Predict a rating for every test review and drop the singleton output
# dimension of the final Dense(1) layer.
predictions = model.predict(x_test).squeeze()

In [41]:
# Exact-match accuracy: a prediction counts as correct only when it rounds to
# the true integer rating.
accuracy = accuracy_score(y_test, np.round(predictions))
print('Accuracy:', accuracy)

Accuracy: 0.20192


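Okay, that's about twice as good as a random guess. Still, this is not really a classification problem: a prediction that lands close to the actual rating is good enough, and exact-match accuracy does not capture that.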

In [43]:
# Show three randomly chosen test reviews alongside the predicted and true rating.
for _ in range(3):
    sample_index = np.random.randint(len(x_test))
    predicted, actual = predictions[sample_index], y_test[sample_index]
    print('Predicted Rating:', predicted)
    print('Actual rating:', actual)
    print('\t', get_processed_review(x_test[sample_index]))

Predicted Rating: 3.2253666
Actual rating: 1
	 bolo yeung is in the movie ten minutes altogether including when he s serving iced drinks to his boss a lot of street thugs looking like junkyard keepers get instantly overpowered by the asian superhero who talks like an illegal alien just out of the back of a manure truck thug let this to me shirt off gay model like muscles <UNKNOWN> <UNKNOWN> hee <UNKNOWN> hap hap he s dead on the floor with his neck elbow chin or balls broken cheap semi sex scenes where the white broad come out of nowhere digs the asian superhero norton former c action movies star does nothing but pose as an eccentric trendy weapon smuggler who traffics white slutty girls hand picked at a night club where they willingly follow some idiot posing as a millionaire snapping at them you reap what you sow yes the local police captain is involved and yes the first butchered cop is the former patrol teammate of the super hero <UNKNOWN> action scenes are fake like a hee haw chin