## Задание

Данные берем отзывы за лето

На вебинаре мы говорили, что долгое время CNN и RNN архитектуры были конурируещими выяснить какая архитектура больше подходит для нашей задачи:

    1. построить свёрточные архитектуры
    2. построить различные архитектуры с RNN
    3. построить совместные архитектуры CNN -> RNN или (RNN -> CNN)

---

In [2]:
!pip install --upgrade xlrd
!pip install pymorphy2
!pip install stop-words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 3.4 MB/s 
[?25hInstalling collected packages: xlrd
  Attempting uninstall: xlrd
    Found existing installation: xlrd 1.1.0
    Uninstalling xlrd-1.1.0:
      Successfully uninstalled xlrd-1.1.0
Successfully installed xlrd-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.9 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 15.6 MB/s 
Installing collected packages: pymorph

In [3]:
import pandas as pd
import re

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer

from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.utils import np_utils

In [4]:
df = pd.read_excel('отзывы за лето.xls')

df.sample(15)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20659 entries, 0 to 20658
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20659 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20659 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [5]:
df.Content = df.Content.astype(str)

In [6]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

In [7]:
def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("не\s", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [8]:
df['Content'] = df['Content'].apply(preprocess_text)

df.sample(5)

Unnamed: 0,Rating,Content,Date
6454,5,отличный приложение пользоваться минус мелочь,2017-08-04
13477,5,мненравиться,2017-07-25
6809,2,установка последний обновление непогружаться п...,2017-08-04
3080,5,👍,2017-08-09
17506,5,сбербанк отличный банк приложение мненравиться...,2017-07-11


In [9]:
train_df, test_val_df = train_test_split(df, test_size = 0.3, random_state = 49)
test_df, val_df = train_test_split(test_val_df, test_size = 0.5, random_state = 49)

In [10]:
train_df.to_csv('train_df.csv')
test_df.to_csv('test_df.csv')
val_df.to_csv('val_df.csv')
test_val_df.to_csv('test_val_df.csv')

In [11]:
text_corpus_train = train_df['Content'].values
text_corpus_valid = val_df['Content'].values
text_corpus_test = test_df['Content'].values

In [12]:
num_classes = len(df['Rating'].unique())
num_classes

5

In [13]:
tokenizer = Tokenizer(num_words = None, 
                     filters = '#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

y_train = keras.utils.np_utils.to_categorical(train_df['Rating'], num_classes+1)
y_test = keras.utils.np_utils.to_categorical(test_df['Rating'], num_classes+1)
y_val = keras.utils.np_utils.to_categorical(val_df['Rating'], num_classes+1)

---

In [14]:
# RNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))

model.add(SimpleRNN(132))
model.add(Dense(132, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 132, 30)           294870    
                                                                 
 masking (Masking)           (None, 132, 30)           0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 132)               21516     
                                                                 
 dense (Dense)               (None, 132)               17556     
                                                                 
 dropout (Dropout)           (None, 132)               0         
                                                                 
 dense_1 (Dense)             (None, 6)                 798       
                                                                 
Total params: 334,740
Trainable params: 334,740
Non-trai

In [15]:
history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [17]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.6869747042655945
Test accuracy: 0.7715392112731934


In [18]:
results = []

results.append(['RNN', score[0], score[1]])

---

In [19]:
# LSTM

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))
model.add(LSTM(132, recurrent_dropout = 0.2))
model.add(Dense(132, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

In [20]:
model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 132, 30)           294870    
                                                                 
 masking_1 (Masking)         (None, 132, 30)           0         
                                                                 
 lstm (LSTM)                 (None, 132)               86064     
                                                                 
 dense_2 (Dense)             (None, 132)               17556     
                                                                 
 dropout_1 (Dropout)         (None, 132)               0         
                                                                 
 dense_3 (Dense)             (None, 6)                 798       
                                                                 
Total params: 399,288
Trainable params: 399,288
Non-tr

In [21]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  

history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [22]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['LSTM', score[0], score[1]])



Test score: 0.6737221479415894
Test accuracy: 0.7708938121795654


---

In [23]:
# GRU

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Masking(mask_value = 0.0))
model.add(GRU(64, recurrent_dropout = 0.2))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))
model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 132, 30)           294870    
                                                                 
 masking_2 (Masking)         (None, 132, 30)           0         
                                                                 
 gru (GRU)                   (None, 64)                18432     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 6)                 390       
                                                                 
Total params: 317,852
Trainable params: 317,852
Non-tr

In [24]:
early_stopping = EarlyStopping(monitor = 'val_loss',patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [25]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['GRU', score[0], score[1]])



Test score: 0.6928486227989197
Test accuracy: 0.7599225640296936


---

In [26]:
# CNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(num_classes+1, activation = 'softmax'))

model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 132, 30)           294870    
                                                                 
 conv1d (Conv1D)             (None, 130, 128)          11648     
                                                                 
 activation (Activation)     (None, 130, 128)          0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_6 (Dense)             (None, 10)                1290      
                                                                 
 activation_1 (Activation)   (None, 10)                0         
                                                      

In [27]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['CNN', score[0], score[1]])



Test score: 0.6996115446090698
Test accuracy: 0.7721845507621765


---

In [29]:
# RNN + CNN

model = Sequential()

model.add(
    Embedding(input_dim = word_count,
              input_length = training_length,
              output_dim = 30,
              trainable = True,
              mask_zero = True))
model.add(SimpleRNN(132, recurrent_dropout = 0.2, return_sequences = "True"))
model.add(Conv1D(132, 3, activation = "linear"))
model.add(Conv1D(64, 1, activation = "linear")) 
model.add(Flatten())                      
model.add(Dropout(0.5)) 
model.add(Dense(num_classes+1, activation = "softmax"))      


model.compile(
    optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 132, 30)           294870    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 132, 132)          21516     
                                                                 
 conv1d_1 (Conv1D)           (None, 130, 132)          52404     
                                                                 
 conv1d_2 (Conv1D)           (None, 130, 64)           8512      
                                                                 
 flatten (Flatten)           (None, 8320)              0         
                                                                 
 dropout_4 (Dropout)         (None, 8320)              0         
                                                                 
 dense_8 (Dense)             (None, 6)                

In [30]:
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 2, restore_best_weights = 1)  


history = model.fit(X_train, y_train,
                    batch_size = 512,
                    epochs = 10,
                    verbose = 1,
                    validation_split = 0.1,
                    callbacks = [early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [31]:
score = model.evaluate(X_valid, y_val, batch_size = 512, verbose = 1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])

results.append(['RNN + CNN', score[0], score[1]])



Test score: 0.7065978646278381
Test accuracy: 0.7554049491882324


---

In [32]:
results_df = pd.DataFrame(results, columns = ['Model', 'Test score', 'Test accuracy'])
results_df

Unnamed: 0,Model,Test score,Test accuracy
0,RNN,0.686975,0.771539
1,LSTM,0.673722,0.770894
2,GRU,0.692849,0.759923
3,CNN,0.699612,0.772185
4,RNN + CNN,0.706598,0.755405
