In [1]:
import sys
import os
import pandas as pd
import numpy as np
from razdel import sentenize
import zipfile
from gensim.models import KeyedVectors
from feature_extraction import gen_or_load_feats, cosine_features, polarity_features
from feature_extraction import hand_features, word_overlap_features, rouge_features

from tqdm._tqdm_notebook import tqdm_notebook, tqdm

from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.utils import to_categorical

from keras.layers import Bidirectional, Embedding, LSTM, Masking, Dense, Dropout
from keras.layers import GlobalMaxPooling1D, Input, TimeDistributed
from keras.layers import concatenate, multiply, subtract
from keras.callbacks import EarlyStopping

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.models import Model, load_model

import tensorflow as tf
gpu_options = tf.GPUOptions(allow_growth=True)
session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

import ipywidgets as widgets
from ipywidgets import interact

from text_preprocessing import Preprocessor
from plot_tools import plot_cm
from my_layers import GlobalMaxPooling1DMasked

Using TensorFlow backend.


In [2]:
with open("../data/external/stopwords.txt", mode="r") as f:
    stopwords = [s.replace("\n", "") for s in f.readlines()]
    
stopwords.remove("не") # несильно влияет на качество, но для отдельных примеров мб решающим
stopwords.remove("нет")

p = Preprocessor(stopwords=stopwords)

In [None]:
data = pd.read_json("../data/raw/final-raw-for-senior-thesis.json", orient='records', lines=True)

In [4]:
@interact
def show_by_source(source=data.source.unique()):
    df=data
    print("Total number of posts: {}".format(df.shape[0]))
    print("Number of posts from source {0}: {1}".format(source, df[df["source"]==source].shape[0]))
    return df[df["source"]==source].head(10)

interactive(children=(Dropdown(description='source', options=('interfax.ru', 'dni.ru', 'utro.ru', 'artificial'…

# Preprocessing

In [5]:
# эта штука разбивает текст на предложения
get_sentences = lambda body : [substring.text for substring in list(sentenize(body))]
# эта штука нормализует разбитый текст
normalize_sentences = lambda text : [p.beautify(sentence, normalize_text=True) for sentence in text]

In [6]:
tqdm_notebook.pandas(desc="Separating text into sentences")

data["body_sentences"] = data["text"].progress_apply(lambda x : get_sentences(x)) # resulting in list of sentences
data["lead"] = data["body_sentences"].apply(lambda x : " ".join(x[:2])) # resulting in two first sentences _s_t_r_i_n_g_

HBox(children=(IntProgress(value=0, description='Separating text into sentences', max=12575, style=ProgressSty…




In [7]:
tqdm_notebook.pandas(desc="Normalizing titles")
data["title_norm"] = data["title"].progress_apply(lambda x : p.beautify(x, normalize_text=True))

tqdm_notebook.pandas(desc="Normalizing leads")
data["lead_norm"] = data["lead"].progress_apply(lambda x : p.beautify(x, normalize_text=True))

tqdm_notebook.pandas(desc="Normalizing texts")
data["body_sentences_norm"] = data["body_sentences"].progress_apply(normalize_sentences)
data["text_norm"] = data["body_sentences_norm"].apply(lambda x : " ".join(x))

HBox(children=(IntProgress(value=0, description='Normalizing titles', max=12575, style=ProgressStyle(descripti…




HBox(children=(IntProgress(value=0, description='Normalizing leads', max=12575, style=ProgressStyle(descriptio…




HBox(children=(IntProgress(value=0, description='Normalizing texts', max=12575, style=ProgressStyle(descriptio…




In [34]:
data.to_json("../data/interim/data-for-fucking-lstm.json", force_ascii=False, orient='records', lines=True)

In [2]:
data = pd.read_json("../data/interim/data-for-fucking-lstm.json", orient='records', lines=True)

# Feature Extraction

In [3]:
if not os.path.exists("../data/interim/features/fucking-lstm"):
    os.mkdir("../data/interim/features/fucking-lstm")

### Headline-body pairwise features

In [4]:
overlap_body = gen_or_load_feats(word_overlap_features,
                                 data.title_norm.tolist(), 
                                 data.text_norm.tolist(), 
                                 "../data/interim/features/fucking-lstm/bodies.overlap.npy")

polarity_body = gen_or_load_feats(polarity_features,
                                  data.title_norm.tolist(), 
                                  data.text_norm.tolist(), 
                                  "../data/interim/features/fucking-lstm/bodies.polarity.npy")

hand_body = gen_or_load_feats(hand_features, 
                              data.title_norm.tolist(), 
                              data.text_norm.tolist(), 
                              "../data/interim/features/fucking-lstm/bodies.hand.npy")

cosine_body = gen_or_load_feats(cosine_features, 
                                data.title_norm.tolist(), 
                                data.text_norm.tolist(), 
                                "../data/interim/features/fucking-lstm/bodies.cosine.npy")

rouge_body = gen_or_load_feats(rouge_features, 
                               data.title_norm.tolist(), 
                               data.text_norm.tolist(), 
                               "../data/interim/features/fucking-lstm/bodies.rouge.npy")

### Headline-lead pairwise features

In [5]:
overlap_lead = gen_or_load_feats(word_overlap_features,
                                 data.title_norm.tolist(), 
                                 data.lead_norm.tolist(), 
                                 "../data/interim/features/fucking-lstm/leads.overlap.npy")

polarity_lead = gen_or_load_feats(polarity_features,
                                  data.title_norm.tolist(), 
                                  data.lead_norm.tolist(), 
                                  "../data/interim/features/fucking-lstm/leads.polarity.npy")

hand_lead = gen_or_load_feats(hand_features, 
                              data.title_norm.tolist(), 
                              data.lead_norm.tolist(), 
                              "../data/interim/features/fucking-lstm/leads.hand.npy")

cosine_lead = gen_or_load_feats(cosine_features, 
                                data.title_norm.tolist(), 
                                data.lead_norm.tolist(), 
                                "../data/interim/features/fucking-lstm/leads.cosine.npy")

rouge_lead = gen_or_load_feats(rouge_features, 
                               data.title_norm.tolist(),
                               data.lead_norm.tolist(), 
                               "../data/interim/features/fucking-lstm/leads.rouge.npy")

#### Let's take a look at the length range of sentences in texts and number of sentences

In [6]:
get_average_sentence_len = lambda text : np.array([len(sentence.split()) for sentence in text]).mean(dtype=int)

In [7]:
data["average_sentence"] = data["body_sentences_norm"].apply(get_average_sentence_len)
data["number_of_sentences"] = data["body_sentences_norm"].apply(lambda x : len(x))
data["average_title"] = data["title_norm"].apply(lambda x : len(x.split()))
data["average_lead"] = data["lead_norm"].apply(lambda x : len(x.split()))

Так как в среднем лид длиннее, чем предложение в тексте, будем брать его как ограничение одного вектора токенов по длине. Текст ограничим исходя из среднего числа предложений в тексте + 2 сигмы.

In [8]:
# get maximum sequence length
avg_lead = data.average_lead.mean()
std_lead = data.average_lead.std()
max_seq_len = int(round(avg_lead + 2*std_lead))

# get maximum number of sentences in text
avg_sents = data.number_of_sentences.mean()
std_sents = data.number_of_sentences.std()
max_sents = int(round(avg_sents + 2*std_sents))

print("Max padded sequence length: {}".format(max_seq_len))
print("Max number of sentences: {}".format(max_sents))

Max padded sequence length: 51
Max number of sentences: 29


### Texts tokenization and padding

Так как токенизируются уже очищенные и нормализованные тексты, токенайзер будет дефолтным.

In [9]:
t = Tokenizer()
t.fit_on_texts((data["title_norm"].astype(str) + data["text_norm"].astype(str)).tolist())

In [25]:
pad_texts = lambda x : pad_sequences(sequences=np.array([np.array(seq) for seq in t.texts_to_sequences(x)]), 
                                     maxlen=max_seq_len)

In [None]:
tqdm_notebook.pandas(desc="Tokenization & padding")

bodies = data["body_sentences_norm"].progress_apply(lambda x : pad_texts(x[:max_sents]))
leads = pad_texts(data["lead_norm"].tolist())
headlines = pad_texts(data["title_norm"].tolist())

HBox(children=(IntProgress(value=0, description='Tokenization & padding', max=12575, style=ProgressStyle(descr…




In [None]:
def pad_bodies(b):
    while b.shape[0] < max_sents:
        b = np.vstack([b, np.zeros_like(b[0])])
    return b

In [None]:
bodies = np.array([pad_bodies(b) for b in bodies])

### Target variable

In [None]:
lb = LabelBinarizer()
y = lb.fit_transform(data["label"].values)

# Model

In [None]:
with zipfile.ZipFile("../data/external/184.zip", 'r') as archive:
        stream = archive.open('model.bin')
        word2vec = KeyedVectors.load_word2vec_format(stream, binary=True)

In [None]:
hidden_units=100

In [None]:
input_headline = Input(shape=(max_seq_len,))
input_body = Input(shape=(max_sents, max_seq_len))
input_lead = Input(shape=(max_seq_len,))

input_overlap_body = Input(shape=(1,))
input_overlap_lead = Input(shape=(1,))

input_polarity_body = Input(shape=(2,))
input_polarity_lead = Input(shape=(2,))

input_hand_body = Input(shape=(26,))
input_hand_lead = Input(shape=(26,))

input_cos_body = Input(shape=(1,))
input_cos_lead = Input(shape=(1,))

input_rouge_body = Input(shape=(3,))
input_rouge_lead = Input(shape=(3,))

In [None]:
early_stop = EarlyStopping(monitor='loss', patience=2, verbose=1, restore_best_weights=True)
embedding_layer = word2vec.get_keras_embedding(train_embeddings=False)
lstm1 = LSTM(hidden_units, implementation=2, return_sequences=True, name='lstm1' )
lstm1 = Bidirectional(lstm1, name='bilstm1')
right_branch_lstm1 = LSTM(hidden_units, implementation=2, return_sequences=True )
right_branch_lstm1 = Bidirectional(right_branch_lstm1)

In [30]:
mask = Masking(mask_value=0, input_shape=(max_seq_len,))(input_headline)
embed = embedding_layer(mask)
l1 = lstm1(embed)
drop1 = Dropout(0.1)(l1)
maxim = GlobalMaxPooling1D()(drop1)
HeadlineEncoder = Model(input_headline, maxim, name='HeadlineEncoder')

In [31]:
body_sentence = TimeDistributed(HeadlineEncoder)(input_body)
body_g1 = right_branch_lstm1(body_sentence)
body_g1 = Dropout(0.1)(body_g1)
body_maxim = GlobalMaxPooling1D()(body_g1)
DocumentEncoder = Model(input_body, body_maxim, name='DocumentEncoder')

##############################

# Combining both representations #

headline_representation = HeadlineEncoder(input_headline)
document_representation = DocumentEncoder(input_body)

# Match between headline and first two sentences from body #

lead_representation = HeadlineEncoder(input_lead)
concat_lead = concatenate([headline_representation, lead_representation])
mul_lead = multiply([headline_representation, lead_representation])
dif_lead = subtract([headline_representation, lead_representation])
final_merge_lead = concatenate([concat_lead, mul_lead, dif_lead, input_overlap_lead, input_polarity_lead, 
                                input_hand_lead, input_cos_lead, input_rouge_lead])
drop3_lead = Dropout(0.1)(final_merge_lead)
dense1_lead = Dense(hidden_units*2, activation='relu')(drop3_lead)
drop4_lead = Dropout(0.1)(dense1_lead)
dense2_lead = Dense(hidden_units, activation='relu')(drop4_lead)
match = Dropout(0.1)(dense2_lead)

# layer_dict -- это, видимо, штука с весами предобученными на MultiNLI
#####################################################

concat = concatenate([headline_representation, document_representation])
mul = multiply([headline_representation, document_representation])
dif = subtract([headline_representation, document_representation])
final_merge = concatenate([concat, mul, dif, input_overlap_body, 
                           input_polarity_body, input_hand_body, 
                           input_cos_body, input_rouge_body])
drop3 = Dropout(0.1)(final_merge)
dense1 = Dense(hidden_units*2, activation='relu', name='dense1')(drop3)
drop4 = Dropout(0.1)(dense1)
dense2 = Dense(hidden_units, activation='relu', name='dense2')(drop4)
drop5 = Dropout(0.1)(dense2)
concat_final = concatenate([drop5, match])

drop6 = Dropout(0.1)(concat_final)
dense3 = Dense(3, activation='softmax')(drop6)
final_model = Model([input_headline, input_body, input_overlap_body, input_polarity_body, 
                     input_hand_body, input_cos_body, input_cos_lead, input_rouge_body, 
                     input_lead, input_overlap_lead, 
                     input_polarity_lead, input_hand_lead, input_rouge_lead], dense3)

In [32]:
final_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 51)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 29, 51)       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 51)           0                                            
__________________________________________________________________________________________________
HeadlineEncoder (Model)         (None, 200)          75116200    input_1[0][0]                    
                                                                 input_3[0][0]                    
__________

In [33]:
final_model.compile(optimizer=Adam(amsgrad=True), loss='categorical_crossentropy', metrics=['accuracy'])

In [34]:
X = [headlines[:1000], bodies[:1000], overlap_body[:1000], polarity_body[:1000], hand_body[:1000], cosine_body[:1000],
     cosine_lead[:1000], rouge_body[:1000], leads[:1000], overlap_lead[:1000], polarity_lead[:1000], hand_lead[:1000], 
     rouge_lead[:1000]]

In [35]:
final_model.fit(x=X, y=y[:1000], validation_split=0.2, 
                 batch_size=64, epochs=4, verbose=2)

# add callbacks

Train on 800 samples, validate on 200 samples
Epoch 1/4


ResourceExhaustedError: OOM when allocating tensor with shape[1856,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node DocumentEncoder_1/time_distributed_2/bilstm1/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3}} = TensorArrayScatterV3[T=DT_FLOAT, _class=["loc:@DocumentEncoder_1/time_distributed_2/bilstm1/transpose"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](DocumentEncoder_1/time_distributed_2/bilstm1/TensorArray_1, DocumentEncoder_1/time_distributed_2/bilstm1/TensorArrayUnstack/range, DocumentEncoder_1/time_distributed_2/bilstm1/transpose, DocumentEncoder_1/time_distributed_2/bilstm1/TensorArray_1:1)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node metrics_2/acc/Mean/_1121}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_12128_metrics_2/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [47]:
data.to_json("../data/interim/training-lstm-w-lengths-11-04.json", force_ascii=False, lines=True, orient='records')

___