In [3]:
import tensorflow as tf
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [4]:
import pandas as pd
data = pd.read_excel('politeness_2k_data.xlsx')
data = data.fillna('_NA_')

In [5]:
label_names = ["target"]
y_train = data[label_names].values

In [7]:
import numpy as np
data['doc_len'] = data['comment'].apply(lambda words: len(words.split(" ")))
max_seq_len = np.round(data['doc_len'].mean() + data['doc_len'].std()).astype(int)

In [15]:
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize


from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])


## preprocessing starting 
raw_docs_train = data['comment'].tolist()
num_classes = len(label_names)

print("pre-processing train data...")

processed_docs_train = []
for doc in tqdm(raw_docs_train):
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amiangshu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
 16%|█▌        | 326/2066 [00:00<00:00, 3244.43it/s]

pre-processing train data...


100%|██████████| 2066/2066 [00:00<00:00, 4779.31it/s]


In [21]:
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence

MAX_NB_WORDS = 10000
from tensorflow.keras.preprocessing.text import Tokenizer


print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train )  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)

tokenizing input data...
dictionary size:  7245


In [25]:
print('loading word embeddings...')
import os, re, csv, math, codecs

embeddings_index = {}
f = codecs.open('Embedding/crawl-300d-2M.vec', encoding='utf-8')

for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('found %s word vectors' % len(embeddings_index))

3894it [00:00, 19477.81it/s]

loading word embeddings...


2000005it [01:48, 18455.55it/s]

found 1999997 word vectors





In [27]:
#embedding matrix

print('preparing embedding matrix...')
embed_dim = 300
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index)+1)
embedding_matrix = np.zeros((nb_words, embed_dim))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

preparing embedding matrix...
number of null word embeddings: 1245


In [39]:
import tensorflow as tf
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=125)
from tensorflow.keras.callbacks import EarlyStopping
#from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import  precision_score
from sklearn.metrics import  f1_score

In [42]:

#only LSTM
from tensorflow.keras import regularizers

from tensorflow.keras.layers import BatchNormalization
import tensorflow as tf


#max_features =22248
#nb_words=22248
embedding_dim =300
sequence_length = 100

def LSTM_model():
    model =  keras.Sequential()
    #model.add(tf.keras.layers.Embedding(max_features +1, embedding_dim, input_length=sequence_length,\
                                    #embeddings_regularizer = regularizers.l2(0.005))) 
    model.add( keras.layers.Embedding(nb_words,embed_dim,input_length=max_seq_len,weights=[embedding_matrix],trainable=False))
    model.add( keras.layers.Dropout(0.4))

    model.add( keras.layers.LSTM(embedding_dim,dropout=0.2, recurrent_dropout=0.2,return_sequences=True,\
                                                             kernel_regularizer=regularizers.l2(0.005),\
                                                             bias_regularizer=regularizers.l2(0.005)))

    model.add( keras.layers.Flatten())

    model.add( keras.layers.Dense(512, activation='relu',\
                                kernel_regularizer=regularizers.l2(0.001),\
                                bias_regularizer=regularizers.l2(0.001),))
    model.add( keras.layers.Dropout(0.4))

    model.add( keras.layers.Dense(8, activation='relu',\
                                kernel_regularizer=regularizers.l2(0.001),\
                                bias_regularizer=regularizers.l2(0.001),))
    model.add( keras.layers.Dropout(0.4))


    model.add( keras.layers.Dense(1,activation='sigmoid'))
    
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),optimizer=tf.keras.optimizers.Adam(1e-3),metrics=['acc'])
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [43]:
LSTM_model().summary()


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 57, 300)           2173800   
_________________________________________________________________
dropout_1 (Dropout)          (None, 57, 300)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 57, 300)           721200    
_________________________________________________________________
flatten (Flatten)            (None, 17100)             0         
_________________________________________________________________
dense (Dense)                (None, 512)               8755712   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
__________________________

In [47]:
from tensorflow.keras.utils import plot_model

#plot_model(LSTM_model(), to_file='LSTMmodel.png', show_shapes=True, show_layer_names=True)


In [50]:
es_callback = EarlyStopping(monitor='val_loss', patience=3)


In [53]:

lstm_run_precision = []
lstm_run_recall = []
lstm_run_f1score = []
lstm_run_accuracy = []

In [54]:
count = 1
num_epochs = 40

for train_index, test_index in kf.split(word_seq_train):
    x_trn, x_tst = word_seq_train[train_index], word_seq_train[test_index]
    y_trn, y_tst = y_train[train_index], y_train[test_index]
    
    x_new_train, x_val, y_new_train, y_val= train_test_split(x_trn, y_trn, test_size=0.11115, random_state=125)
    
    print("\nFold ", count)
    lstm_model=LSTM_model()

    
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    
    history =lstm_model.fit( x_new_train, y_new_train, batch_size=256,
          epochs=num_epochs, validation_data=(x_val, y_val), callbacks=[es_callback], shuffle=False)
    
    _, train_acc =lstm_model.evaluate(x_new_train,  y_new_train, verbose=0)
    _, val_acc =  lstm_model.evaluate(x_val, y_val, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, val_acc))
    
    
     
    #plt.savefig('LSTM with fasttext SE data accuracy graph.png')
    #plt.show()
    
    
    y_pred = lstm_model.predict(x_tst)
    y_pred = (y_pred >= 0.5)
 
    
    from sklearn import metrics
    print(metrics.classification_report(y_tst, y_pred))
    
    lstm_precision = precision_score(y_tst, y_pred, pos_label=1)
    lstm_recall = recall_score(y_tst, y_pred, pos_label=1)
    lstm_f1score = f1_score(y_tst, y_pred, pos_label=1)
    lstm_accuracy = accuracy_score(y_tst, y_pred)

    lstm_run_accuracy.append(lstm_accuracy)
    lstm_run_f1score.append(lstm_f1score)
    lstm_run_precision.append(lstm_precision)
    lstm_run_recall.append(lstm_recall)
    
    count = count+1


Fold  1
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train: 0.969, Test: 0.807
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       165
           1       0.58      0.26      0.36        42

    accuracy                           0.81       207
   macro avg       0.71      0.61      0.63       207
weighted avg       0.78      0.81      0.78       207


Fold  2
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/4

Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Train: 0.935, Test: 0.826
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       172
           1       0.55      0.34      0.42        35

    accuracy                           0.84       207
   macro avg       0.71      0.64      0.66       207
weighted avg       0.82      0.84      0.83       207


Fold  3
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40


Epoch 40/40
Train: 0.960, Test: 0.850
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       171
           1       0.71      0.28      0.40        36

    accuracy                           0.86       207
   macro avg       0.79      0.63      0.66       207
weighted avg       0.84      0.86      0.83       207


Fold  4
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Train: 0.944, Test: 0.749
              precision    recall  f1-score   support

           0       0.87      0.84      0.86       158
           1       0.54      0.59      0.56        49

    accuracy                           0.78       

Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Train: 0.929, Test: 0.845
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       167
           1       0.70      0.18      0.29        39

    accuracy                           0.83       206
   macro avg       0.77      0.58      0.59       206
weighted avg       0.81      0.83      0.79       206


Fold  8
Train on 1653 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40


Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Train: 0.966, Test: 0.826
              precision    recall  f1-score   support

           0       0.86      0.98      0.92       169
           1       0.79      0.30      0.43        37

    accuracy                           0.86       206
   macro avg       0.83      0.64      0.68       206
weighted avg       0.85      0.86      0.83       206



In [None]:
## for bilstm model

In [70]:
import tensorflow
from tensorflow.keras.models import Model

from tensorflow.keras.layers import Dense, Input, LSTM,GlobalMaxPool1D
maxlen=max_seq_len
embed_size=300
max_features=nb_words
def Bi_LSTM_base():
    inp = keras.layers.Input(shape=(maxlen,))
    x = tensorflow.keras.layers.Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = tensorflow.keras.layers.Bidirectional(tensorflow.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = tensorflow.keras.layers.GlobalMaxPool1D()(x)
    x = tensorflow.keras.layers.Dense(50, activation="relu")(x)
    x = tensorflow.keras.layers.Dropout(0.1)(x)
    x = tensorflow.keras.layers.Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [71]:
Bi_LSTM_base().summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 57)]              0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 57, 300)           2173800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 57, 100)           140400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_50 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_51 (Dense)             (None, 1)                 51    

In [72]:

blbase_run_precision = []
blbase_run_recall = []
blbase_run_f1score = []
blbase_run_accuracy = []

In [73]:
count = 1

for train_index, test_index in kf.split(word_seq_train):
    x_trn, x_tst = word_seq_train[train_index], word_seq_train[test_index]
    y_trn, y_tst = y_train[train_index], y_train[test_index]
    
    x_new_train, x_val, y_new_train, y_val= train_test_split(x_trn, y_trn, test_size=0.11115, random_state=125)
    
    print("\nFold ", count)
    bilstmbase_model=Bi_LSTM_base()

    #model_lstm_fasttext=model_with_embedding()
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    
    history =bilstmbase_model.fit( x_new_train, y_new_train, batch_size=32,
          epochs=num_epochs, validation_data=(x_val, y_val), callbacks=[es_callback], shuffle=False)
    
    _, train_acc = bilstmbase_model.evaluate(x_new_train,  y_new_train, verbose=0)
    _, val_acc =  bilstmbase_model.evaluate(x_val, y_val, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, val_acc))
    
    
    y_pred = bilstmbase_model.predict(x_tst)
    y_pred = (y_pred >= 0.5)
 
    
    from sklearn import metrics
    print(metrics.classification_report(y_tst, y_pred))
    
    blbase_precision = precision_score(y_tst, y_pred, pos_label=1)
    blbase_recall = recall_score(y_tst, y_pred, pos_label=1)
    blbase_f1score = f1_score(y_tst, y_pred, pos_label=1)
    blbase_accuracy = accuracy_score(y_tst, y_pred)

    blbase_run_accuracy.append(blbase_accuracy)
    blbase_run_f1score.append(blbase_f1score)
    blbase_run_precision.append(blbase_precision)
    blbase_run_recall.append(blbase_recall)
    
    count = count+1


Fold  1
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Train: 0.999, Test: 0.855
              precision    recall  f1-score   support

           0       0.86      0.96      0.91       165
           1       0.73      0.38      0.50        42

    accuracy                           0.85       207
   macro avg       0.79      0.67      0.70       207
weighted avg       0.83      0.85      0.83       207


Fold  2
Train on 1652 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Train: 0.998, Test: 0.836
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       172
           1       0.61      0.71      0.66        35

    accuracy                           0.87       207
   macro avg       0.77      0.81      0.79       207
weighted avg       0.88      0.87      0.88       207


Fold  3
Train on 1652 samples, validate on 207 samples
Epoch

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Train: 1.000, Test: 0.841
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       165
           1       0.52      0.37      0.43        41

    accuracy                           0.81       206
   macro avg       0.69      0.64      0.66       206
weighted avg       0.79      0.81      0.79       206


Fold  9
Train on 1653 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Train: 1.000, Test: 0.816
              precision    recall  f1-score   support

           0       0.93      0.94      0.93       180
           1       0.54      0.50      0.52        26

    accuracy                           0.88       206
   macro avg       0.74      0.72      0.73       206
weighted avg       0.88      0.88      0.88       206


Fold  10
Train on 1653 samples, validate on 207 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Train: 0.996, Te