## LSTM

### Setup

In [0]:

import sys, os
import numpy as np
import pandas as pd
import re
import time
import matplotlib.pyplot as plt
import tensorflow as tf


from google.colab import drive
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, SpatialDropout1D, Conv1D, MaxPooling1D, Embedding, LSTM, Bidirectional
from keras.initializers import Constant
from keras.models import Model, load_model
from keras import callbacks
from keras import backend as K

%reload_ext autoreload
%autoreload 2

# Show the full note text in DataFrame/Series output instead of truncating it.
# `None` is the supported "no limit" value in recent pandas; older releases only
# accept the legacy `-1`, so fall back to it when `None` is rejected.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)


In [0]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
cd ~/..

/


In [0]:
os.getcwd()

'/'

In [0]:
# Create a symbolic link to omit issues with whitespace in "My Drive"
!ln -s ~/../content/gdrive/"My Drive"/ /MyDrive

In [0]:
# Anchor the project paths at the /MyDrive symlink with an absolute path, so
# they resolve regardless of the current working directory of the kernel.
PROJECT_HOME_PATH = os.path.join(os.sep, 'MyDrive', 'NmtPolishLanguage')
DATA_PATH = os.path.join(PROJECT_HOME_PATH, 'DATA')

In [0]:
os.path.exists(PROJECT_HOME_PATH)

True

In [0]:
ls

[0m[01;34mbin[0m/      [01;34mdatalab[0m/  [01;34mhome[0m/   [01;34mlib64[0m/  [01;36mMyDrive[0m@  [01;34mroot[0m/  [01;34msrv[0m/    [30;42mtmp[0m/    [01;34mvar[0m/
[01;34mboot[0m/     [01;34mdev[0m/      [01;34mlib[0m/    [01;34mmedia[0m/  [01;34mopt[0m/      [01;34mrun[0m/   [01;34mswift[0m/  [01;34mtools[0m/
[01;34mcontent[0m/  [01;34metc[0m/      [01;34mlib32[0m/  [01;34mmnt[0m/    [01;34mproc[0m/     [01;34msbin[0m/  [01;34msys[0m/    [01;34musr[0m/


### Load data

#### Text

In [0]:
# Read the train and test splits from their CSV files.
train_csv = os.path.join(DATA_PATH, 'train', 'train_notes.csv')
test_csv = os.path.join(DATA_PATH, 'test', 'test_notes.csv')
train_notes = pd.read_csv(train_csv)
test_notes = pd.read_csv(test_csv)

# Separate the note text ('text' column) from the labels ('target' column).
notes_trn, y_trn = train_notes['text'], train_notes['target']
notes_test, y_test = test_notes['text'], test_notes['target']


In [0]:
notes_trn[:3]

0    zrobilysmy psiapsi matching tattoos monte xd                                                   
1    anonymizedaccount potwornie nie chce                                                           
2    anonymizedaccount anonymizedaccount calym szacunkiem staze klubach zagranicznych nie argumentem
Name: text, dtype: object

#### Load GloVe pretrained embeddings

In [0]:
embeddings_index = {}

In [0]:
glove_path = os.path.join(DATA_PATH, 'glove', 'vectors_15000_5_15_15_50.txt')

# Each non-empty line is parsed as "<word> <coef_1> ... <coef_n>".
# The encoding is passed explicitly so that non-ASCII tokens are decoded the
# same way regardless of the locale default of the machine running the cell.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank (e.g. trailing) line: nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

Found 6329 word vectors.


#### Vectorize the text into 2D integer tensor

In [0]:
# Average number of space-separated tokens per training note.
_total_tokens = sum(len(note.split(" ")) for note in notes_trn)

avg_num_words = _total_tokens / len(notes_trn)

In [0]:
print(f'Average number of words in single note: {avg_num_words:.2f}')

Average number of words in single note: 7.94


In [0]:
# Fixed sequence length: passed to pad_sequences and to the models' Input/Embedding layers.
maxlen = 16
# Vocabulary cap: used as Tokenizer(num_words=...) and as the number of rows of the embedding matrix.
max_words = 15000
# Width of one embedding vector. Pretrained vectors are copied into rows of a
# (max_words, embedding_dim) matrix, so this has to match their length.
embedding_dim = 50

In [0]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(notes_trn)

In [0]:
word_index = tokenizer.word_index

In [0]:
print(f'Found {len(word_index)} unique tokens.')

Found 19102 unique tokens.


In [0]:
# Both splits share the same settings: pad and truncate at the end of a sequence.
_pad_opts = dict(maxlen=maxlen, padding='post', truncating='post')

# Map every note to its sequence of integer token ids.
seq_trn, seq_test = (
    tokenizer.texts_to_sequences(notes) for notes in (notes_trn, notes_test)
)

x_trn = pad_sequences(seq_trn, **_pad_opts)
x_test = pad_sequences(seq_test, **_pad_opts)

In [0]:
print(f'x_trn shape: {x_trn.shape}\nx_test shape: {x_test.shape}')

x_trn shape: (8031, 16)
x_test shape: (2008, 16)


#### Prepare word embeddings matrix

In [0]:
# Row i receives the pretrained vector of the word whose tokenizer index is i;
# rows for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))

for token, idx in word_index.items():
    if idx >= max_words:
        # Index falls outside the matrix.
        continue
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[idx] = vector
    

#### Define metric

In [0]:
def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before being summed, and
    K.epsilon() is added to every denominator.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores (assumed to match y_true's shape).

    Returns:
        Backend tensor holding 2 * P * R / (P + R + epsilon), where P and R
        are the precision and recall computed over the given batch.
    """
    eps = K.epsilon()

    def _rounded_sum(tensor):
        # Sum of the entries after clipping to [0, 1] and rounding.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_sum(y_true * y_pred)   # true positives
    actual = _rounded_sum(y_true)          # possible positives
    predicted = _rounded_sum(y_pred)       # predicted positives

    precision_score = hits / (predicted + eps)
    recall_score = hits / (actual + eps)

    return 2*((precision_score*recall_score))/(precision_score+recall_score+eps)
                

### Simple Conv1D model

In [0]:
# Integer token ids go through a frozen embedding layer that is initialised
# from the pretrained matrix.
input_seq = Input(shape=(maxlen,), dtype='int32')
emb_layer = Embedding(
    max_words,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=maxlen,
    trainable=False,
)
emb_seq = emb_layer(input_seq)

# Convolution / pooling stack applied in order to the embedded sequence.
conv_stack = [
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),
    Conv1D(64, 3, activation='relu'),
    GlobalMaxPooling1D(),
]
x = emb_seq
for layer in conv_stack:
    x = layer(x)

# Single sigmoid unit for the binary target.
output = Dense(1, activation='sigmoid')(x)

model = Model(input_seq, output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1])

model_name = 'model_CNN_64_64_D1'

Instructions for updating:
Colocations handled automatically by placer.


In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 16, 50)            750000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 14, 64)            9664      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 7, 64)             0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 5, 64)             12352     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total para

In [0]:
# Make sure the checkpoint directory exists before training starts; otherwise
# the first attempt to write the .h5 file can fail.
checkpoint_dir = os.path.join(PROJECT_HOME_PATH, 'models')
os.makedirs(checkpoint_dir, exist_ok=True)

callbacks_list = [
    # Stop once val_loss has not improved for 10 consecutive epochs.
    callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
    ),
    # Keep only the best model (by val_loss) on disk.
    callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, model_name + '.h5'),
        monitor='val_loss',
        save_best_only=True,
    ),
    # Multiply the learning rate by 0.1 after 5 epochs without improvement,
    # but never go below 1e-4.
    callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        min_lr=0.0001,
        patience=5,
    ),
]

In [0]:
# Train the CNN; validation_split=0.1 asks Keras to hold out 10% of the
# training data for validation.
fit_options = dict(
    batch_size=32,
    epochs=30,
    callbacks=callbacks_list,
    validation_split=0.1,
)
history = model.fit(x_trn, y_trn, **fit_options)

### LSTM

In [0]:
# Reset TensorFlow's default graph before building the LSTM model.
# NOTE(review): the next cell calls K.clear_session(), which presumably resets
# the graph as well — confirm whether both calls are needed.
tf.reset_default_graph()

In [0]:
K.clear_session()

In [0]:
# Integer token ids go through a frozen embedding layer that is initialised
# from the pretrained matrix.
input_seq = Input(shape=(maxlen, ), dtype='int32')

emb_layer = Embedding(max_words,
                      embedding_dim,
                      embeddings_initializer=Constant(embedding_matrix),
                      input_length=maxlen, trainable=False)

emb_seq = emb_layer(input_seq)
# Two stacked LSTMs: the first returns the full sequence so the second can
# consume it; the second returns only its final output.
x = LSTM(128, dropout=0.1, recurrent_dropout=0.2, return_sequences=True)(emb_seq)
x = LSTM(128, dropout=0.1, recurrent_dropout=0.2)(x)
x = Dropout(0.1)(x)
# The hidden Dense layer needs a non-linearity: without an activation it is a
# purely linear map and, together with the output layer, collapses into a
# single linear layer that adds no capacity.
x = Dense(32, activation='relu')(x)
# Single sigmoid unit for the binary target.
output = Dense(1, activation='sigmoid')(x)

model = Model(input_seq, output)

model_name = 'model_LSTM_128_128_D32_D1'

In [0]:
# Binary cross-entropy loss, Adam optimizer, and the custom batch-wise F1 as metric.
compile_options = dict(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[f1],
)
model.compile(**compile_options)

In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 16)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 16, 50)            750000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 16, 128)           91648     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total para

In [0]:
# Make sure the checkpoint directory exists before training starts; otherwise
# the first attempt to write the .h5 file can fail.
checkpoint_dir = os.path.join(PROJECT_HOME_PATH, 'models')
os.makedirs(checkpoint_dir, exist_ok=True)

callbacks_list = [
    # Stop once val_loss has not improved for 10 consecutive epochs.
    callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
    ),
    # Keep only the best model (by val_loss) on disk.
    callbacks.ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, model_name + '.h5'),
        monitor='val_loss',
        save_best_only=True,
    ),
    # Multiply the learning rate by 0.1 after 5 epochs without improvement,
    # but never go below 1e-4.
    callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        min_lr=0.0001,
        patience=5,
    ),
]

In [0]:
# Train the LSTM; validation_split=0.1 asks Keras to hold out 10% of the
# training data for validation.
fit_options = dict(
    batch_size=64,
    epochs=100,
    callbacks=callbacks_list,
    validation_split=0.1,
)
history = model.fit(x_trn, y_trn, **fit_options)