## Практическое задание к уроку 7 (Сверточные нейронные сети для анализа текста)

In [1]:
!pip install stop_words
!pip install pymorphy2



### Задание

### Берем отызывы за лето (из архива с материалами или предыдущего занятия)

In [2]:
import pandas as pd

In [3]:
df = pd.read_excel('./reviews.xls')
df.head(10)

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14
5,5,Всё удобно норм 👍👍👍,2017-08-14
6,5,Очень удобное приложение.,2017-08-14
7,5,Все устраивает,2017-08-14
8,5,У меня работает все четко. В отличии от банком...,2017-08-14
9,5,Очень все хорошо👍,2017-08-14


### Препроцессинг

In [4]:
max_words = 200
max_len = 40
num_classes = 1

# Training
epochs = 20
batch_size = 512
print_batch_n = 100

In [5]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [6]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df['Content'] = df['Content'].apply(preprocess_text)
df['Rating'] = df['Rating'].apply(lambda x: f'{x}')

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Rating', 'Date']), df['Rating'], test_size=0.33, random_state=42)

In [8]:
train_corpus = " ".join(X_train["Content"])
train_corpus = train_corpus.lower()

In [9]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /Users/tombelov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
tokens_filtered = [word for word in tokens if word.isalnum()]
tokens_filtered[:10]

['наконецтый',
 'исправить',
 'чушь',
 'снеоргинальный',
 'прошивка',
 'приложение',
 'удобно',
 'пользоваться',
 'удобно',
 'использование']

In [11]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'хороший',
 'отличный',
 'телефон',
 'супер']

In [12]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}
vocabulary

{'приложение': 1,
 'удобно': 2,
 'работать': 3,
 'удобный': 4,
 'отлично': 5,
 'нравиться': 6,
 'хороший': 7,
 'отличный': 8,
 'телефон': 9,
 'супер': 10,
 'быстро': 11,
 'обновление': 12,
 'пароль': 13,
 'мочь': 14,
 'пользоваться': 15,
 'антивирус': 16,
 'банк': 17,
 'вход': 18,
 'устраивать': 19,
 'сбербанк': 20,
 'раз': 21,
 'прошивка': 22,
 'карта': 23,
 'проблема': 24,
 'рута': 25,
 'программа': 26,
 'ошибка': 27,
 'разработчик': 28,
 'сделать': 29,
 'приходиться': 30,
 'вводить': 31,
 'перевод': 32,
 'счёт': 33,
 'писать': 34,
 'норма': 35,
 'деньга': 36,
 'довольный': 37,
 'около': 38,
 'постоянно': 39,
 'нормально': 40,
 'код': 41,
 'исправить': 42,
 'смс': 43,
 'платёж': 44,
 'понятно': 45,
 'последний': 46,
 'функция': 47,
 'зайти': 48,
 'свой': 49,
 'вылетать': 50,
 'мобильный': 51,
 'стать': 52,
 'шаблон': 53,
 'приходить': 54,
 'возможность': 55,
 'право': 56,
 'делать': 57,
 'иня': 58,
 'проверка': 59,
 'класс': 60,
 'установить': 61,
 'root': 62,
 'заходить': 63,
 '5': 

In [13]:
import numpy as np

def text_to_sequence(text, maxlen):
    result = []

    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])

    padding = [0] * (maxlen - len(result))

    return padding + result[-maxlen:]

In [14]:
X_train_seq = np.asarray([text_to_sequence(text, max_len) for text in X_train["Content"]], dtype=np.int32)
X_test_seq = np.asarray([text_to_sequence(text, max_len) for text in X_test["Content"]], dtype=np.int32)

In [15]:
X_train_seq, X_test_seq

(array([[  0,   0,   0, ...,   1,   2,  15],
        [  0,   0,   0, ...,   0,   2, 181],
        [  0,   0,   0, ...,   0,   0,   5],
        ...,
        [  0,   0,   0, ..., 164,  27,  84],
        [  0,   0,   0, ...,   0,   0,  20],
        [  0,   0,   0, ...,   0, 113,   5]], dtype=int32),
 array([[ 0,  0,  0, ...,  0,  0,  5],
        [ 0,  0,  0, ...,  0,  6,  2],
        [ 0,  0,  0, ...,  1, 14, 48],
        ...,
        [ 0,  0,  0, ...,  1,  2, 11],
        [ 0,  0,  0, ..., 15,  1, 42],
        [ 0,  0,  0, ..., 32,  1, 91]], dtype=int32))

In [16]:
#!pip install keras
#!pip install tensorflow

In [17]:
import numpy as np
import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard 
from keras.objectives import categorical_crossentropy
from keras.callbacks import EarlyStopping

In [18]:
y_train.shape

(13841,)

In [19]:
num_classes = len(df['Rating'].value_counts()) + 1
num_classes

6

In [20]:
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

1. Создаем и учим модель с обычным Embedding

In [21]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [22]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')  

history = model.fit(X_train_seq, y_train_cat,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20


In [24]:
score = model.evaluate(X_test_seq, y_test_cat, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.0688270330429077
Test accuracy: 0.7125256657600403


In [25]:
results = model.predict(X_test_seq, batch_size=batch_size, verbose=1)



In [26]:
results

array([[0.1002029 , 0.03017209, 0.04071685, 0.04369523, 0.04680741,
        0.73840547],
       [0.09800393, 0.02920604, 0.03998629, 0.04268521, 0.04471024,
        0.7454083 ],
       [0.10008722, 0.05559361, 0.05772931, 0.06640156, 0.06564388,
        0.6545444 ],
       ...,
       [0.09835306, 0.03222388, 0.04173975, 0.04388751, 0.0472176 ,
        0.7365782 ],
       [0.10750321, 0.07144999, 0.06960043, 0.07972618, 0.07727428,
        0.59444594],
       [0.10361601, 0.05191559, 0.05654161, 0.06454043, 0.06424002,
        0.6591463 ]], dtype=float32)

Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

2. Создаем и учим модель с предзагруженным Embedding

грузим с сайта https://rusvectores.org/ru/  
модель ruwikiruscorpora_tokens_elmo_1024_2019

In [27]:
# !pip install --upgrade simple_elmo

In [28]:
from simple_elmo import ElmoModel
model = ElmoModel()
model.load('./model')


2021-07-28 21:10:44,082 : INFO : Loading model from ./model...
2021-07-28 21:10:44,083 : INFO : We will cache the vocabulary of 100 tokens.


'The model is now loaded.'

In [32]:
vectors = model.get_elmo_vectors(df['Content'])

2021-07-28 21:35:33,762 : INFO : Warming up ELMo on 32 sentences...
2021-07-28 21:35:37,175 : INFO : Warming up finished.
2021-07-28 21:35:37,185 : INFO : Texts in the current batch: 32
2021-07-28 21:35:40,399 : INFO : Texts in the current batch: 32
2021-07-28 21:35:45,129 : INFO : Texts in the current batch: 32
2021-07-28 21:35:48,319 : INFO : Texts in the current batch: 32
2021-07-28 21:35:52,342 : INFO : Texts in the current batch: 32
2021-07-28 21:35:57,771 : INFO : Texts in the current batch: 32
2021-07-28 21:36:03,968 : INFO : Texts in the current batch: 32
2021-07-28 21:36:05,966 : INFO : Texts in the current batch: 32
2021-07-28 21:36:09,703 : INFO : Texts in the current batch: 32
2021-07-28 21:36:17,120 : INFO : Texts in the current batch: 32
2021-07-28 21:36:27,999 : INFO : Texts in the current batch: 32
2021-07-28 21:36:36,156 : INFO : Texts in the current batch: 32
2021-07-28 21:36:53,836 : INFO : Texts in the current batch: 32
2021-07-28 21:37:03,809 : INFO : Texts in the 

In [None]:
import pickle
with open('embeddings.pickle', 'wb') as f:
    pickle.dump(vectors, f)


NameError: name 'vectors' is not defined

In [38]:
from keras.models import model_from_json

json_file = open('./model/options.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

ValueError: Improper config format: {'bidirectional': True, 'char_cnn': {'activation': 'relu', 'embedding': {'dim': 16}, 'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 'max_characters_per_token': 50, 'n_characters': 262, 'n_highway': 2}, 'dropout': 0.1, 'lstm': {'cell_clip': 3, 'dim': 2048, 'n_layers': 2, 'proj_clip': 3, 'projection_dim': 512, 'use_skip_connections': True}, 'all_clip_norm_val': 10.0, 'n_epochs': 3, 'n_train_tokens': 989077087, 'batch_size': 192, 'n_tokens_vocab': 250003, 'unroll_steps': 20, 'n_negative_samples_batch': 4096}

## ЗДЕСЬ ЗАКАНЧИВАЕТСЯ ПРАКТИЧЕСКОЕ ЗАДАНИЕ - ДАЛЬШЕ МАТЕРИАЛЫ ЛЕКЦИИИ

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score 

In [None]:
vect = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', lowercase=False)

In [None]:
train_ft = vect.fit_transform(X_train['Content'])
valid_ft = vect.transform(X_test['Content'])

IndexError: ignored

In [None]:
lgr = LogisticRegression()

In [None]:
lgr.fit(train_ft, X_train['Rating'].to_numpy())

NameError: ignored

In [None]:
y_pred = lgr.predict(valid_ft)

In [None]:
accuracy_score(df_val['class'].to_numpy(), y_pred)

0.7362341841908037

In [None]:
from gensim.models import Word2Vec

In [None]:
df_train['text']

0         alisachachkaн уезжаааааааать ❤ тожена хотеть у...
1         rt galyginvadim ребята девчата кино любовь зав...
2           rt artemklyushin ктоненавидеть пробка ретвит rt
3         rt epupybobv хотеться котлета покиевск запретн...
4         karinekurganova yessboss босапоп есбосан боять...
                                ...                        
181462                     классный новый httptcolejaguxnwu
181463                       видеть человек привет игнорита
181464    julia69styles длинный диагноз вкратце аллергич...
181465    technoslav ух серенький кнопочень телефон дост...
181466    kris1d07 собираться сделатьd навешать фотка ва...
Name: text, Length: 181467, dtype: object

In [None]:
modelW2V = Word2Vec(sentences=df_train['text'].apply(str.split), size=100, window=5, min_count=5, workers=8)

In [None]:
modelW2V.wv.vocab

{'❤': <gensim.models.keyedvectors.Vocab at 0x7f7d606bb1f0>,
 'тожена': <gensim.models.keyedvectors.Vocab at 0x7f7d606bb790>,
 'хотеть': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34f10>,
 'уезжать': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34f70>,
 'rt': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34880>,
 'ребята': <gensim.models.keyedvectors.Vocab at 0x7f7c62c346d0>,
 'девчата': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34ca0>,
 'кино': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34bb0>,
 'любовь': <gensim.models.keyedvectors.Vocab at 0x7f7c62c34ee0>,
 'завтра': <gensim.models.keyedvectors.Vocab at 0x7f7d7b8135e0>,
 'artemklyushin': <gensim.models.keyedvectors.Vocab at 0x7f7d7b813c40>,
 'ктоненавидеть': <gensim.models.keyedvectors.Vocab at 0x7f7d58143310>,
 'пробка': <gensim.models.keyedvectors.Vocab at 0x7f7d58143190>,
 'ретвит': <gensim.models.keyedvectors.Vocab at 0x7f7d58143340>,
 'хотеться': <gensim.models.keyedvectors.Vocab at 0x7f7c562c1040>,
 'котлета': <gensi

In [None]:
vect_idf = TfidfVectorizer()
vect_idf.fit_transform(df_train['text'])
tfidf = dict(zip(vect_idf.get_feature_names(), vect_idf.idf_))

In [None]:
tfidf

{'00': 9.32464497468466,
 '000': 8.71438545393048,
 '0000': 9.707637226940765,
 '00000': 12.01022231993481,
 '000000': 12.415687428042975,
 '0000000': 12.01022231993481,
 '00000001': 12.415687428042975,
 '000009': 12.01022231993481,
 '00000asrova': 12.415687428042975,
 '00008': 12.415687428042975,
 '0000ибо': 12.415687428042975,
 '0000нуль': 12.415687428042975,
 '0000пойти': 12.415687428042975,
 '0001': 12.415687428042975,
 '0005': 12.415687428042975,
 '0006': 12.415687428042975,
 '0009': 12.415687428042975,
 '000nana000': 11.49939669616882,
 '000к': 12.415687428042975,
 '000ноль': 12.415687428042975,
 '000ный': 12.415687428042975,
 '000рубль': 12.415687428042975,
 '000ть': 12.415687428042975,
 '001': 12.415687428042975,
 '0019': 12.415687428042975,
 '002': 12.415687428042975,
 '0024': 12.415687428042975,
 '003': 12.415687428042975,
 '0030': 11.49939669616882,
 '004anna': 12.415687428042975,
 '005': 12.415687428042975,
 '0050': 12.415687428042975,
 '0053': 12.415687428042975,
 '007': 1

In [None]:
rt = vect_idf.vocabulary_.items()

In [None]:
tfidf['alisachachka']

12.415687428042975

In [None]:
vect_idf.idf_[vect_idf.vocabulary_['alisachachka']]

12.415687428042975

In [None]:
len(tfidf)

256172

In [None]:
from collections import defaultdict

In [None]:
max_idf = max(vect_idf.idf_)

word2weight = defaultdict(
    lambda: max_idf,
    [(w, vect_idf.idf_[i]) for w, i in vect_idf.vocabulary_.items()])

In [None]:
def get_vect_mean(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in modelW2V:
            vector_w2v += modelW2V[wrd]
            n_w2v += 1
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

def get_vect_idf(txt):
    vector_w2v = np.zeros(100)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in modelW2V:
            iddf_ = tfidf.get(wrd, 1.)
            vector_w2v += modelW2V[wrd]*iddf_
            n_w2v += iddf_
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

In [None]:
from tqdm import tqdm_notebook

In [None]:
arr_vect = []
for txt in tqdm_notebook(df_train['text']):
    arr_vect.append(get_vect_mean(txt))
    
arr_vect_valid = []
for txt in tqdm_notebook(df_val['text']):
    arr_vect_valid.append(get_vect_mean(txt))
    
train_w2v = np.asarray(arr_vect)    
valid_w2v = np.asarray(arr_vect_valid)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(df_train['text']):


HBox(children=(FloatProgress(value=0.0, max=181467.0), HTML(value='')))

  if wrd in modelW2V:
  vector_w2v += modelW2V[wrd]





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(df_val['text']):


HBox(children=(FloatProgress(value=0.0, max=22683.0), HTML(value='')))




In [None]:
lgr_w2v = LogisticRegression()

In [None]:
lgr_w2v.fit(train_w2v, df_train['class'].to_numpy())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = lgr_w2v.predict(valid_w2v)

In [None]:
accuracy_score(df_val['class'].to_numpy(), y_pred)

0.6555129392055724

In [None]:
arr_vect = []
for txt in tqdm_notebook(df_train['text']):
    arr_vect.append(get_vect_idf(txt))
    
arr_vect_valid = []
for txt in tqdm_notebook(df_val['text']):
    arr_vect_valid.append(get_vect_idf(txt))
    
train_w2v = np.asarray(arr_vect)    
valid_w2v = np.asarray(arr_vect_valid)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(df_train['text']):


HBox(children=(FloatProgress(value=0.0, max=181467.0), HTML(value='')))

  if wrd in modelW2V:
  vector_w2v += modelW2V[wrd]*iddf_





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(df_val['text']):


HBox(children=(FloatProgress(value=0.0, max=22683.0), HTML(value='')))




In [None]:
lgr_w2v = LogisticRegression()
lgr_w2v.fit(train_w2v, df_train['class'].to_numpy())
y_pred = lgr_w2v.predict(valid_w2v)



In [None]:
accuracy_score(df_val['class'].to_numpy(), y_pred)

0.6500903760525504