# Практическое задание к уроку 8 "Рекуррентные нейронные сети RNN LSTM GRU"

In [2]:

import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import os
import re

DATA_PATH = '../data'

TRAIN_FILENAME = 'train.csv'
TEST_FILENAME = 'train.csv'
VALIDATION_FILENAME = 'train.csv'

df_train = pd.read_csv(os.path.join(DATA_PATH, TRAIN_FILENAME))
df_test = pd.read_csv(os.path.join(DATA_PATH, TEST_FILENAME))
df_val = pd.read_csv(os.path.join(DATA_PATH, VALIDATION_FILENAME))

In [5]:
df_train.head()

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0
3,3,RT @epupybobv: Хочется котлету по-киевски. Зап...,1
4,4,@KarineKurganova @Yess__Boss босапопа есбоса н...,1


### Выполняем препроцессинг

In [7]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [8]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping

In [9]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [10]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [11]:
y_train = df_train['class'].values
y_val = df_val['class'].values

## Задание 1

Построить сверточные архитектуры

In [40]:
def build_convolutional_model(word_count, training_length=1000):
    model = Sequential()

    model = Sequential()
    model.add(Embedding(input_dim=word_count, output_dim=30, input_length=training_length))
    model.add(Conv1D(30, 3))
    model.add(Activation("relu"))
    model.add(GlobalMaxPool1D())
    model.add(Dense(10))
    model.add(Activation("relu"))
    model.add(Dense(2))
    model.add(Activation('relu'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [44]:
conv_model = build_convolutional_model(word_count, training_length)

In [45]:
early_stopping=EarlyStopping(monitor='val_loss')
conv_history = conv_model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10


In [None]:
conv_score = conv_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', conv_score[0])
print('Test accuracy:', conv_score[1])

In [26]:
def build_rnn_model():
    model = Sequential()

    model.add(
        Embedding(
            input_dim=word_count,
            input_length=training_length,
            output_dim=30,
            trainable=True,
            mask_zero=True
        )
    )

    model.add(Masking(mask_value=0.0))

    model.add(SimpleRNN(64))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [27]:
rnn_model = build_rnn_model()

In [34]:
early_stopping=EarlyStopping(monitor='val_loss')
rnn_history = rnn_model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
)



In [46]:
rnn_score = rnn_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', rnn_score[0])
print('Test accuracy:', rnn_score[1])



Test score: 0.21560560166835785
Test accuracy: 0.9695426821708679


In [47]:
def build_lstm_model():
  model = Sequential()

  model.add(
      Embedding(input_dim=word_count,
                input_length=training_length,
                output_dim=30,
                trainable=True,
                mask_zero=True))
  model.add(Masking(mask_value=0.0))
  model.add(LSTM(64, recurrent_dropout=0.2))
  model.add(Dense(64, activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(
      optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  return model

In [48]:
lstm_model = build_lstm_model()

In [49]:
early_stopping=EarlyStopping(monitor='val_loss')
lstm_history = lstm_model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=1,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
)



In [50]:
lstm_score = lstm_model.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('Test score:', lstm_score[0])
print('Test accuracy:', lstm_score[1])



Test score: 0.6933350563049316
Test accuracy: 0.4973245859146118


In [None]:
results = pd.Dataframe([conv_score, rnn_score, lstm_score])
results