In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import numpy as np

C:\Users\snetkova\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\snetkova\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.probability import FreqDist

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\snetkova\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
gpu = gpus[0]
tf.config.experimental.set_memory_growth(gpu, True)

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D

In [4]:
data = pd.read_excel("D:/GeekBrains/nlp/les07/data.xls")
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [5]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(txt))

    txt = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•`'·&()]|[+=]|[[]|[]]|[/]|", '', txt)
    txt = re.sub("[+=]|[[]|[]]|[/]|", '', txt)
    txt = re.sub("\\'", "'", txt)
    txt = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", '', txt)
    txt = re.sub(r'[\xad]', '', txt.strip())

    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [6]:
data['Content'].iloc[3]

'Стал зависать на 1% работы антивируса. Дальше никуда. Ранее больше года пользовался нормально.'

In [7]:
preprocess_text(data['Content'].iloc[3])

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


'зависать работа антивирус ранее пользоваться нормально'

In [8]:
data['Content'] = data['Content'].apply(lambda x: preprocess_text(x))

In [9]:
data['Rating'] = data['Rating'] - 1
data.head()

Unnamed: 0,Rating,Content,Date
0,4,it just works,2017-08-14
1,3,целое удобноной приложениеиз минус хотеть боль...,2017-08-14
2,4,отлично,2017-08-14
3,4,зависать работа антивирус ранее пользоваться н...,2017-08-14
4,4,удобно работать быстро,2017-08-14


In [10]:
train_corpus = " ".join(data['Content'])
train_corpus = train_corpus.lower()

In [11]:
tokens = word_tokenize(train_corpus)
tokens_filtered = [word for word in tokens if word.isalnum()]

In [12]:
len(set(tokens_filtered))

10483

In [13]:
sentences_list = data['Content'].tolist()
max([len(s.split()) for s in sentences_list])

130

In [14]:
max_words = len(set(tokens_filtered))
max_len = max([len(s.split()) for s in sentences_list])
num_classes = 5

# Training
epochs = 5
batch_size = 128

In [15]:
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'телефон',
 'отличный',
 'супер']

In [16]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [17]:
len(vocabulary)

10482

In [18]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [19]:
full_data = np.asarray([text_to_sequence(text, max_len) for text in data['Content']], dtype=np.int32)

In [20]:
full_target = to_categorical(data['Rating'], num_classes)

In [21]:
X_train, X_val_test, y_train, y_val_test = train_test_split(full_data, full_target, test_size = 0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size = 0.3, random_state = 42)

In [22]:
full_target

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [23]:
X_train.shape, X_val.shape, X_test.shape

((14461, 130), (4338, 130), (1860, 130))

In [24]:
X_train[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   9,   1,   2, 220,  28])

#### Trainable embedding

In [25]:
model1 = Sequential()
model1.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len))
model1.add(Conv1D(300, 3))
model1.add(Activation("relu"))
model1.add(GlobalMaxPool1D())
model1.add(Dense(10))
model1.add(Activation("relu"))
model1.add(Dense(num_classes))
model1.add(Activation('softmax'))
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 130, 300)          3144900   
_________________________________________________________________
conv1d (Conv1D)              (None, 128, 300)          270300    
_________________________________________________________________
activation (Activation)      (None, 128, 300)          0         
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                3010      
_________________________________________________________________
activation_1 (Activation)    (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 5

In [26]:
model1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [27]:
model1.fit(X_train, y_train, validation_data = (X_val,y_val),
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
         )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1f32721c188>

In [28]:
model1.evaluate(X_test, y_test, batch_size = 32)[1]



0.773655891418457

In [29]:
# https://keras.io/examples/nlp/pretrained_word_embeddings/
# https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
# https://www.kaggle.com/danielwillgeorge/glove6b100dtxt

#### Pretrained embedding

In [30]:
# http://vectors.nlpl.eu/repository/20/180.zip
embeddings_index = dict()
f = open('D:/GeekBrains/nlp/les07/model.txt', encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    word = re.sub("_.+", "", word)
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 154617 word vectors.


In [31]:
len(embeddings_index['приложение'])

300

In [33]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(vocabulary) + 1, EMBEDDING_DIM))
for word, i in vocabulary.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [34]:
model2 = Sequential()
model2.add(Embedding(input_dim=max_words, output_dim=300, input_length=max_len, weights=[embedding_matrix], trainable=False))
model2.add(Conv1D(300, 3))
model2.add(Activation("relu"))
model2.add(GlobalMaxPool1D())
model2.add(Dense(10))
model2.add(Activation("relu"))
model2.add(Dense(num_classes))
model2.add(Activation('softmax'))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 300)          3144900   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 300)          270300    
_________________________________________________________________
activation_3 (Activation)    (None, 128, 300)          0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                3010      
_________________________________________________________________
activation_4 (Activation)    (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                

In [35]:
model2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [36]:
model2.fit(X_train, y_train, validation_data = (X_val,y_val),
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
         )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1f37e320648>

In [37]:
model2.evaluate(X_test, y_test, batch_size = 32)[1]



0.7532258033752441

Результат модели с пре-тренированным эмбеддингом оказался чуть ниже. Веорятно, это связано с безграмотностью отзывов - в претренированном эмбеддинге таких слов просто нет