# Классификация текстов

In [0]:
# Download the article corpus (lemmatized, stop words removed — judging by the file name).
# NOTE(review): this looks like a session-specific Dropbox direct-download URL rather than a
# stable share link — confirm it still resolves before relying on Restart & Run All.
!wget -O articles_lemmatized_noSW.csv "https://uca8a647930017bafae1a971bcb5.dl.dropboxusercontent.com/cd/0/get/AVPAVw3QgxWgZdgwbwTceIUPEuyKIrDghLBhCCpq0m6l3W14JtFIRGYiD7SePlTQtRQgtokZJgk_RKKqFUjIwL4WA3Uz8SApdX06JYfPpvURekcz_Vl5omWiyrCNDYqzgu2rHuV7gHrU6e04WCsXWm_Zo1TwCbp7C25CWpEHP-QL8iAwwkvsKvGGvR-6o5DuoT2bLYUY8I5BTxMqF5yBoGyZ/file?_download_id=49814860698607033332646797212727695979727785552338405713668967967&_notify_domain=www.dropbox.com&dl=1"

In [0]:
!wget -O ru.tar.gz "https://www.dropbox.com/s/0x7oxso6x93efzj/ru.tar.gz?dl=1"

In [0]:
!tar -xvzf ru.tar.gz

In [0]:
!ls

## Предобработка

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.optimizers import Adam

from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.models import Model, Sequential

import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

%matplotlib inline

In [0]:
# Reproducibility setup: seed NumPy, TensorFlow and Python's `random`, then
# hand Keras a TensorFlow session restricted to a single thread per op pool.
# NOTE(review): ConfigProto / Session / set_random_seed look like TensorFlow 1.x
# names — confirm the TensorFlow version this notebook is pinned to.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
import random
random.seed(3)
import tensorflow as tf
# One intra-op and one inter-op thread.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)

from keras import backend as K

# NOTE(review): the TensorFlow seed is set a second time here (2 above, 1234 now);
# presumably this later call is the one that takes effect — confirm and drop one.
tf.set_random_seed(1234)

# Build the session on the default graph with the config above and register it with Keras.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)


In [0]:
# Load the corpus and keep only the five topics used as classes.
data = pd.read_csv("articles_lemmatized_noSW.csv", index_col="Unnamed: 0")
data = data[data.topic.isin(("Культура", "Общество", "Политика", "Экономика", "Спорт"))]

# Balanced split: draw 110 articles per topic, the first 100 go to the training
# set and the remaining 10 to the test set.
# The per-topic slices are collected in lists and concatenated once at the end:
# DataFrame.append inside a loop copies the whole frame on every iteration and
# is no longer available in pandas >= 2.0.
train_parts = []
test_parts = []
for topic in data.topic.unique():
    corpus_sample = data[data.topic == topic].sample(110)
    train_parts.append(corpus_sample.iloc[:100, :])
    test_parts.append(corpus_sample.iloc[100:, :])

train_corpus = pd.concat(train_parts)
test_corpus = pd.concat(test_parts)

In [0]:
data.head()

In [0]:
# --- Text / vocabulary settings ---
MAX_FEATURES = 5000       # passed to the Tokenizer as num_words
TEXT_LENGTH = 1800        # maxlen used when padding token sequences
VOCABULARY_SIZE = 250000
EMBEDDING_DIM = 100
DIMS = 250

# --- Layer sizes ---
nb_filter = 250
filter_length = 3
hidden_dims = 250

# --- Training settings ---
batch_size = 32
nb_epoch = 10

## Сеть прямого распространения

In [0]:
# Fit the tokenizer on the training texts only (num_words=MAX_FEATURES).
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_corpus["body"])
tokenizer

#### Представим последовательность в виде вектора длины vocab_size, где на позиции i стоит число вхождений слова с номером i в данную последовательность.

In [0]:
# Bag-of-words features: one count vector per document.
X_train = tokenizer.sequences_to_matrix(
    tokenizer.texts_to_sequences(train_corpus.body),
    mode='count',
)

# `sequences` is left holding the test-set token ids, exactly as before.
sequences = tokenizer.texts_to_sequences(test_corpus.body)
X_test = tokenizer.sequences_to_matrix(sequences, mode='count')

In [0]:
# Show the first training document as a token-id sequence and as its count vector.
# The sequence is recomputed from train_corpus: at this point `sequences` holds the
# test-set sequences, so sequences[0] would not correspond to X_train[0].
first_train_seq = tokenizer.texts_to_sequences(train_corpus.body.iloc[:1])[0]
print('First seq:', first_train_seq)
print('First doc:', X_train[0])

In [0]:
# Map topic names to integer ids, then one-hot encode them for the networks.
le = LabelEncoder()
le.fit(train_corpus.topic.unique())

train_ids = le.transform(train_corpus.topic)
y_true = le.transform(test_corpus.topic)  # integer ids of the test labels

y_train = np_utils.to_categorical(train_ids, 5)
y_test = np_utils.to_categorical(y_true, 5)
print(y_train[0])

#### Создайте архитектуру полносвязной нейронной сети

In [0]:
# Exercise template: fully connected classifier; hidden layers go in the placeholder below.
model = Sequential()

<your code here>

# Output layer: 5 units with softmax.
model.add(Dense(5, activation = 'softmax'))
# NOTE(review): Adam receives positional arguments — presumably lr, beta_1, beta_2; confirm for the installed Keras.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005, 0.8, 0.99), metrics=['accuracy'])
model.summary()

In [0]:
model.fit(X_train, y_train, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

In [0]:
# Evaluate the feed-forward model on the test set.
# predict(...).argmax(-1) replaces Sequential.predict_classes, which newer Keras
# releases no longer provide; the later functional-API model is evaluated the same way.
y_pred_ids = model.predict(X_test).argmax(-1)

# Integer ids are recomputed from test_corpus rather than read from `y_true`,
# so re-running this cell works even after `y_true` has been turned into topic names.
y_true_ids = le.transform(test_corpus.topic)
print(classification_report(y_true_ids, y_pred_ids))

# Later cells compare topic names, so y_true / y_pred are left as string labels.
labels = test_corpus.topic.unique()
y_true = le.inverse_transform(y_true_ids)
y_pred = le.inverse_transform(y_pred_ids)
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

#### Сопоставим каждому слову его эмбеддинг, который будем обучать.

In [0]:
# Integer token sequences with maxlen=TEXT_LENGTH, used by the embedding-based models.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_corpus.body),
    maxlen=TEXT_LENGTH,
)

# `sequences` is left holding the test-set token ids, exactly as before.
sequences = tokenizer.texts_to_sequences(test_corpus.body)
X_test = pad_sequences(sequences, maxlen=TEXT_LENGTH)

In [0]:
X_train[0]

In [0]:
from keras.optimizers import Adam

#### Создайте архитектуру сети со слоем эмбеддингов

In [0]:
# Exercise template: classifier with an embedding layer; layers go in the placeholder below.
model = Sequential()

<your code here>

# Output layer: 5 units with softmax.
model.add(Dense(5, activation = 'softmax'))
# NOTE(review): Adam receives positional arguments — presumably lr, beta_1, beta_2; confirm for the installed Keras.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005, 0.8, 0.99), metrics=['accuracy'])
model.summary()

In [0]:
# Train for up to 50 epochs; a manual interrupt (KeyboardInterrupt) is caught
# and reported with a message instead of a traceback.
try:
  model.fit(X_train, y_train, epochs=50, batch_size=batch_size,  validation_split=0.1)
except KeyboardInterrupt:
  print("Interrupted...")

In [0]:
y_pred = model.predict_classes(X_test)
y_pred = le.inverse_transform(y_pred)

In [0]:
print(classification_report(y_true, y_pred))
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

#### Используем предобученные эмбеддинги в качестве инициализации

In [0]:
%%time

import numpy as np
emb_path = 'ru.vec'

words = []

# Maps each word to its pretrained vector (float32 array of 300 values).
embeddings_index = {}
# The context manager closes the file even if parsing raises. The encoding is
# explicit because the platform default is not guaranteed to be UTF-8
# (presumably the vectors file holds Cyrillic tokens — verify).
with open(emb_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Keep only rows with exactly 301 fields (1 token + 300 coefficients);
        # anything else is skipped.
        if len(values) == 301:
            word = values[0]
            words.append(word)
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

In [0]:
print(len(embeddings_index))

In [0]:
word_index = tokenizer.word_index
len(word_index)

In [0]:
# Row `row` holds the pretrained vector of the word whose tokenizer index is `row`.
# Rows that are never assigned (words missing from embeddings_index) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for token, row in word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

#### Создайте архитектуру сети со слоем предобученных эмбеддингов

In [0]:
# Exercise template: classifier on top of pretrained embeddings; layers go in the placeholder below.
model = Sequential()

<your code here>

# Output layer: 5 units with softmax.
model.add(Dense(5, activation = 'softmax'))
# NOTE(review): Adam receives positional arguments — presumably lr, beta_1, beta_2; confirm for the installed Keras.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005, 0.8, 0.99), metrics=['accuracy'])
model.summary()

In [0]:
model.fit(X_train, y_train, epochs=30, batch_size=batch_size,  validation_split=0.1)

In [0]:
y_pred = model.predict_classes(X_test)
y_pred = le.inverse_transform(y_pred)

In [0]:
print(classification_report(y_true, y_pred))
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

## Сверточные нейронные сети [Convolutional neural networks, CNN]

* Заимствованы из области компьютерного зрения
* Пик популярности пришёлся на 2014 год (до +10% точности (accuracy) в задачах классификации); со временем были вытеснены рекуррентными нейронными сетями
* Помогают справиться с проблемой переменной длины входов (CNN VS window-based NN)

In [0]:
from keras.optimizers import Adam

In [0]:
# Start the CNN with a frozen (trainable=False) embedding layer whose weights
# come from embedding_matrix; further layers are added in the next cell.
model = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=300,
        input_length=TEXT_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    ),
])

#### Добавьте к модели сверточный слой и пулинг

In [0]:
<your code here>

# Output layer: 5 units with softmax.
model.add(Dense(5, activation = 'softmax'))
# NOTE(review): Adam receives positional arguments — presumably lr, beta_1, beta_2; confirm for the installed Keras.
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005, 0.8, 0.99), metrics=['accuracy'])
model.summary()

In [0]:
model.fit(X_train, y_train, epochs=50, batch_size=batch_size,  validation_split=0.1)

In [0]:
y_pred = model.predict_classes(X_test)
y_pred = le.inverse_transform(y_pred)
# y_true = le.inverse_transform(y_true)

In [0]:
model.evaluate(X_test, y_test)

In [0]:
print(classification_report(y_true, y_pred))
labels = test_corpus.topic.unique()
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

### Усложним модель

In [0]:
from keras.models import Model
from keras.layers import Input, Concatenate
from keras.layers.normalization import BatchNormalization

# Settings for the multi-branch CNN.
# NOTE(review): neither name is referenced in the visible cells; presumably they are
# meant for the conv_block / branch-building exercises — confirm.
num_filters = 32
filter_sizes = [2, 3, 4, 5]

#### Реализуйте функцию ниже, она должна содержать сверточный слой и слой пулинга

In [0]:
def conv_block(out, flatten=False):
  """Exercise stub: apply a convolution layer followed by pooling to `out`.

  The body below is still a placeholder; whatever replaces it must bind the
  result to `conv`. When `flatten` is true, that result is passed through
  Flatten() before being returned.
  """
  <your code>
  if flatten:
    conv = Flatten()(conv)
  return conv

#### Сделайте несколько сверток с разным размером фильтра и объедините их

In [0]:
model_input = Input(shape=(TEXT_LENGTH, ))
out = Embedding(len(word_index) + 1,
                300,
                weights=[embedding_matrix],
                trainable=False)(model_input)
conv_blocks = []

<your code here>

out = BatchNormalization()(out)
out = Dropout(0.6)(out)
model_output = Dense(5, activation="sigmoid")(out)

model = Model(model_input, model_output)


model.compile(loss='categorical_crossentropy', optimizer=Adam(0.0005, 0.8, 0.99), metrics=['accuracy'])
model.summary()

In [0]:
model.fit(X_train, y_train, epochs=50, batch_size=16,  validation_split=0.1)

In [0]:
y_pred = model.predict(X_test)
y_pred = le.inverse_transform(y_pred.argmax(-1))
# y_true = le.inverse_transform(y_true)

In [0]:
print(classification_report(y_true, y_pred))
labels = test_corpus.topic.unique()
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

## Рекуррентные нейронные сети

#### Реализуйте рекуррентную сеть с одним рекуррентным слоем

In [0]:
from keras.layers import Activation, LSTM, GRU


model = Sequential()

<your code here>

model.add(Dense(5))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.01, 0.8, 0.9),
              metrics=['accuracy'])

model.summary()

In [0]:
hist_lstm = model.fit(X_train, y_train, epochs=100, batch_size=16,  validation_split=0.1)

In [0]:
# Predicted topic names for the test set.
# predict(...).argmax(-1) replaces Sequential.predict_classes, which newer Keras
# releases no longer provide; the functional-API model above is evaluated the same way.
y_pred = le.inverse_transform(model.predict(X_test).argmax(-1))

# Per-class metrics and confusion matrix (rows: true topic, columns: predicted topic).
print(classification_report(y_true, y_pred))
labels = test_corpus.topic.unique()
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()

### Реализуйте BI-LSTM CNN

In [0]:
from keras.layers import Activation, LSTM


model = Sequential()

<your code here>

model.add(Dense(5))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adam(0.0005, 0.8, 0.9),
              metrics=['accuracy'])

model.summary()

In [0]:
hist_lstm2 = model.fit(X_train, y_train, epochs=100, batch_size=16,  validation_split=0.1)

In [0]:
# Predicted topic names for the test set.
# predict(...).argmax(-1) replaces Sequential.predict_classes, which newer Keras
# releases no longer provide; the functional-API model above is evaluated the same way.
y_pred = le.inverse_transform(model.predict(X_test).argmax(-1))

# Per-class metrics and confusion matrix (rows: true topic, columns: predicted topic).
print(classification_report(y_true, y_pred))
labels = test_corpus.topic.unique()
sns.heatmap(data=confusion_matrix(y_true, y_pred, labels = labels), annot=True, fmt="d", cbar=False, xticklabels=labels, yticklabels=labels)
plt.title("Confusion matrix")
plt.show()