In [None]:
import numpy as np
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
import random
import re
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
def load_and_shuffle(text_file, label_file, seed=None):
    """Load quotes and their labels from two parallel files and shuffle them together.

    Every line of ``text_file`` is tokenized with ``word_tokenize``; tokens that
    are not purely alphabetic are dropped, the rest are lower-cased and joined
    with single spaces. Every line of ``label_file`` becomes 1 when it contains
    the substring 'vacuous' and 0 otherwise.

    Args:
        text_file: path of a UTF-8 file holding one quote per line.
        label_file: path of a UTF-8 file holding one label per line, in the
            same order as ``text_file``.
        seed: optional seed for a private random generator so the shuffle can
            be reproduced. With None (the default) the module-level ``random``
            state is used.

    Returns:
        A pair ``(texts, labels)`` of equally long tuples in shuffled order.
        Both tuples are empty when the files are empty.

    Raises:
        ValueError: if the two files do not have the same number of lines
            (zipping them would silently drop or misalign examples).
    """
    with open(text_file, 'r', encoding='utf-8') as f:
        texts = [
            ' '.join(token.lower() for token in word_tokenize(line) if token.isalpha())
            for line in f
        ]
    with open(label_file, 'r', encoding='utf-8') as f:
        labels = [1 if 'vacuous' in line else 0 for line in f]

    if len(texts) != len(labels):
        raise ValueError(
            'text_file has %d lines but label_file has %d lines' % (len(texts), len(labels))
        )

    # Shuffle texts and labels as pairs so they stay aligned.
    combined = list(zip(texts, labels))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(combined)

    if not combined:
        # zip(*[]) yields nothing, which would break `texts, labels = ...` in the caller.
        return (), ()
    return tuple(zip(*combined))

In [None]:
# Load the corpus once; texts and labels are shuffled as pairs, so texts[i] still belongs to labels[i].
texts, labels = load_and_shuffle('quotes.txt', 'labels.txt')

In [None]:
def lemmatize(sent):
    """Return `sent` with each word replaced by its WordNet lemma where possible.

    The sentence is tokenized and POS-tagged. The lower-cased first letter of
    every tag picks the WordNet part of speech (adjective, adverb, noun or
    verb); words carrying any other tag are kept unchanged. The resulting
    words are joined with single spaces.
    """
    # first tag letter -> WordNet POS; 'j' tags are renamed to adjectives ('a')
    wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in pos_tag(word_tokenize(sent)):
        pos = wordnet_pos.get(tag[0].lower())
        lemmas.append(lemmatizer.lemmatize(token, pos) if pos else token)
    return ' '.join(lemmas)

In [None]:
# Lemmatized copy of the corpus, in the same order as `texts`.
texts_lemma = [lemmatize(text) for text in texts]

In [None]:
def k_folds(texts, labels, k):
    """Build rotated copies of a dataset for k-fold cross-validation.

    The first fold is the data exactly as given. Fold i (for i >= 1) is the
    data rotated left by i * (len(texts) // k) items, with `texts` and `labels`
    rotated by the same amount so they stay aligned. Both sequences must
    support slicing and `+`.

    Returns:
        A list of [texts, labels] pairs, one per fold.
    """
    shift = len(texts) // k
    rotations = [[texts, labels]]
    for offset in (shift * i for i in range(1, k)):
        rotations.append([texts[offset:] + texts[:offset],
                          labels[offset:] + labels[:offset]])
    return rotations

In [None]:
# Five rotated copies of the raw corpus and of the lemmatized corpus; both use the same labels.
folds = k_folds(texts, labels, 5)
folds_lemma = k_folds(texts_lemma, labels, 5)

## Hyper-parameters

In [None]:
# num_words of the per-fold Tokenizer and input size of every Embedding layer
vocab_size = 10000
# length of each learned word vector
embedding_dim = 8
# every sequence is padded/truncated to this many tokens
max_length = 100
# over-long sequences are cut, and short ones padded, at the end
trunc_type='post'
padding_type='post'
# placeholder the Tokenizer uses for words outside its vocabulary
oov_tok = "<OOV>"
# the first 80% of each fold is used for training, the rest for validation
training_size = int(len(labels) * 0.8)
# upper bound on epochs per fit; the EarlyStopping callback may end a run sooner
num_epochs = 30
# Stop a run once the training loss has not improved for 3 epochs.
# NOTE(review): this monitors 'loss', not 'val_loss' — confirm that is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
# Adam with learning rate 1e-4, used when compiling simpleRNN, LSTM and biLSTM
adam_for_RNN = tf.keras.optimizers.Adam(learning_rate=0.0001) #for simpleRNN,LSTM, biLSTM

In [None]:
def sequencing_and_padding(texts, labels):
    """Split one fold into train/test parts and encode its texts as padded index sequences.

    The first `training_size` items form the training part, the remainder the
    test part. A new Tokenizer (limited to `vocab_size` words, with `oov_tok`
    for unknown words) is fitted on the training texts only and then used to
    encode both parts, which are padded/truncated to `max_length`.

    Returns:
        [training_padded, training_labels, testing_padded, testing_labels],
        each converted with np.array.
    """
    train_texts, test_texts = texts[:training_size], texts[training_size:]
    train_labels, test_labels = labels[:training_size], labels[training_size:]

    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_texts)
    # To save the tokenizer, dump tokenizer.to_json() with json.dumps(..., ensure_ascii=False)
    # into 'tokenizer.json' (opened with encoding='utf-8') at this point.

    def encode(batch):
        # texts -> integer sequences -> fixed-length matrix
        return pad_sequences(tokenizer.texts_to_sequences(batch), maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

    train_padded = encode(train_texts)
    test_padded = encode(test_texts)
    return [np.array(train_padded), np.array(train_labels),
            np.array(test_padded), np.array(test_labels)]

In [None]:
# Encode every fold on its own; sequencing_and_padding fits a separate tokenizer per fold.
folds_padded = [
    sequencing_and_padding(fold_texts, fold_labels) for fold_texts, fold_labels in folds
]
folds_lemma_padded = [
    sequencing_and_padding(fold_texts, fold_labels) for fold_texts, fold_labels in folds_lemma
]

## Multilayer Perceptrons

In [None]:
# Baseline: average the word embeddings of a sequence, then classify with two dense layers.
model_MLP = tf.keras.Sequential()
# embedding table: 10k words * 8 dimensions = 80k parameters
model_MLP.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# pooling over the sequence axis is used here as the alternative to Flatten()
model_MLP.add(tf.keras.layers.GlobalAveragePooling1D())
# hidden layer: (8 + 1) * 8 parameters
model_MLP.add(tf.keras.layers.Dense(8, activation='tanh'))
# output unit: 8 + 1 parameters
model_MLP.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_MLP.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision'])
model_MLP.summary()

In [None]:
MLP_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_MLP.set_weights(tf.keras.models.clone_model(model_MLP).get_weights())
    history = model_MLP.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    MLP_history.append(history)

In [None]:
MLP_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_MLP.set_weights(tf.keras.models.clone_model(model_MLP).get_weights())
    history = model_MLP.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    MLP_history_lemma.append(history)

## 1D Convolutional Layer

In [None]:
# 1D convolution over the embedded sequence, followed by pooling and the dense head.
model_CNN = tf.keras.Sequential()
model_CNN.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# Conv1D parameters: i * f * o + o with i = 8 input channels, f = 5 kernel width, o = 32 filters
model_CNN.add(tf.keras.layers.Conv1D(32, 5, activation='relu'))
model_CNN.add(tf.keras.layers.GlobalAveragePooling1D())
model_CNN.add(tf.keras.layers.Dense(8, activation='tanh'))
model_CNN.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision'])
model_CNN.summary()
# To save the model, call: model_CNN.save('1DConv_model.h5')

In [None]:
CNN_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_CNN.set_weights(tf.keras.models.clone_model(model_CNN).get_weights())
    history = model_CNN.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    CNN_history.append(history)

In [None]:
CNN_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_CNN.set_weights(tf.keras.models.clone_model(model_CNN).get_weights())
    history = model_CNN.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    CNN_history_lemma.append(history)

## Simple RNN

In [None]:
# Recurrent baseline: a single SimpleRNN layer reads the embedded sequence.
model_simpleRNN = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.SimpleRNN(32),
    tf.keras.layers.Dense(8, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Give this model its own Adam instance configured like adam_for_RNN. An
# optimizer object is tied to the variables of the first model it trains, so
# sharing one instance between several models can fail on newer TF releases.
model_simpleRNN.compile(loss='binary_crossentropy',
                        optimizer=tf.keras.optimizers.Adam.from_config(adam_for_RNN.get_config()),
                        metrics=['Precision'])
model_simpleRNN.summary()

In [None]:
simpleRNN_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_simpleRNN.set_weights(tf.keras.models.clone_model(model_simpleRNN).get_weights())
    history = model_simpleRNN.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                                  validation_data=(fold_padded[2], fold_padded[3]),
                                  callbacks=[callback], verbose=1)
    simpleRNN_history.append(history)

In [None]:
simpleRNN_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_simpleRNN.set_weights(tf.keras.models.clone_model(model_simpleRNN).get_weights())
    history = model_simpleRNN.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                                  validation_data=(fold_padded[2], fold_padded[3]),
                                  callbacks=[callback], verbose=1)
    simpleRNN_history_lemma.append(history)

## GRU

In [None]:
# Gated recurrent variant: a single GRU layer reads the embedded sequence.
model_GRU = tf.keras.Sequential()
model_GRU.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model_GRU.add(tf.keras.layers.GRU(32))
model_GRU.add(tf.keras.layers.Dense(8, activation='tanh'))
model_GRU.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision'])
model_GRU.summary()

In [None]:
GRU_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_GRU.set_weights(tf.keras.models.clone_model(model_GRU).get_weights())
    history = model_GRU.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    GRU_history.append(history)

In [None]:
GRU_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_GRU.set_weights(tf.keras.models.clone_model(model_GRU).get_weights())
    history = model_GRU.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                            validation_data=(fold_padded[2], fold_padded[3]),
                            callbacks=[callback], verbose=1)
    GRU_history_lemma.append(history)

## LSTM

In [None]:
# LSTM variant: a single LSTM layer reads the embedded sequence.
model_LSTM = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(8, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Give this model its own Adam instance configured like adam_for_RNN. An
# optimizer object is tied to the variables of the first model it trains, so
# sharing one instance between several models can fail on newer TF releases.
model_LSTM.compile(loss='binary_crossentropy',
                   optimizer=tf.keras.optimizers.Adam.from_config(adam_for_RNN.get_config()),
                   metrics=['Precision'])
model_LSTM.summary()

In [None]:
LSTM_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_LSTM.set_weights(tf.keras.models.clone_model(model_LSTM).get_weights())
    history = model_LSTM.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                             validation_data=(fold_padded[2], fold_padded[3]),
                             callbacks=[callback], verbose=1)
    LSTM_history.append(history)

In [None]:
LSTM_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_LSTM.set_weights(tf.keras.models.clone_model(model_LSTM).get_weights())
    history = model_LSTM.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                             validation_data=(fold_padded[2], fold_padded[3]),
                             callbacks=[callback], verbose=1)
    LSTM_history_lemma.append(history)

## BiLSTM

In [None]:
# Bidirectional variant: one LSTM reads the sequence in each direction.
model_biLSTM = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(8, activation='tanh'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Give this model its own Adam instance configured like adam_for_RNN. An
# optimizer object is tied to the variables of the first model it trains, so
# sharing one instance between several models can fail on newer TF releases.
model_biLSTM.compile(loss='binary_crossentropy',
                     optimizer=tf.keras.optimizers.Adam.from_config(adam_for_RNN.get_config()),
                     metrics=['Precision'])
model_biLSTM.summary()

In [None]:
biLSTM_history = []
for fold_padded in folds_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit would leak into this one.
    # clone_model() rebuilds the architecture with new weights; copying them in
    # resets the model without recompiling it (optimizer state is not reset).
    model_biLSTM.set_weights(tf.keras.models.clone_model(model_biLSTM).get_weights())
    history = model_biLSTM.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                               validation_data=(fold_padded[2], fold_padded[3]),
                               callbacks=[callback], verbose=1)
    biLSTM_history.append(history)

In [None]:
biLSTM_history_lemma = []
for fold_padded in folds_lemma_padded:
    # Start every fold from freshly initialised weights: each fold was encoded
    # with its own tokenizer and validates on data that other folds train on,
    # so weights carried over from an earlier fit (including the runs on the
    # non-lemmatized folds) would leak into this one. clone_model() rebuilds
    # the architecture with new weights; copying them in resets the model
    # without recompiling it (optimizer state is not reset).
    model_biLSTM.set_weights(tf.keras.models.clone_model(model_biLSTM).get_weights())
    history = model_biLSTM.fit(fold_padded[0], fold_padded[1], epochs=num_epochs,
                               validation_data=(fold_padded[2], fold_padded[3]),
                               callbacks=[callback], verbose=1)
    biLSTM_history_lemma.append(history)

## Plot Results

In [None]:
import matplotlib.pyplot as plt

In [None]:
from operator import add
def cross_validating(histories, metric='Precision'):
    """Average a training metric and its validation counterpart across folds, epoch by epoch.

    Args:
        histories: Keras History objects, one per fold.
        metric: key of the training metric in ``history.history``; the
            validation curve is read from ``'val_' + metric``.

    Returns:
        A pair ``(train_mean, val_mean)`` of lists holding the per-epoch mean
        over all folds. The lists cover at most the module-level `num_epochs`
        epochs and only the epochs that every fold reached, so a fold cut
        short by early stopping shortens the result. With no histories both
        lists are `num_epochs` zeros.
    """
    histories = list(histories)
    fold_count = len(histories)
    if fold_count == 0:
        # Nothing to average; return flat zero curves instead of dividing by zero.
        return [0.0] * num_epochs, [0.0] * num_epochs

    train_sum = [0] * num_epochs
    val_sum = [0] * num_epochs
    for history in histories:
        # zip stops at the shorter sequence, which is what truncates the curves
        # to the epochs shared by all folds.
        train_sum = [total + value for total, value in zip(train_sum, history.history[metric])]
        val_sum = [total + value for total, value in zip(val_sum, history.history['val_' + metric])]

    # Divide by the real number of folds rather than a hard-coded 5.
    return ([total / fold_count for total in train_sum],
            [total / fold_count for total in val_sum])

In [None]:
# Fold-averaged training and validation precision per epoch, for every model,
# on the raw folds and on the lemmatized folds.
MLP_prec, MLP_val_prec = cross_validating(MLP_history)
MLP_lemma_prec, MLP_lemma_val_prec = cross_validating(MLP_history_lemma)
CNN_prec, CNN_val_prec = cross_validating(CNN_history)
CNN_lemma_prec, CNN_lemma_val_prec = cross_validating(CNN_history_lemma)
simpleRNN_prec, simpleRNN_val_prec = cross_validating(simpleRNN_history)
simpleRNN_lemma_prec, simpleRNN_lemma_val_prec = cross_validating(simpleRNN_history_lemma)
GRU_prec, GRU_val_prec = cross_validating(GRU_history)
GRU_lemma_prec, GRU_lemma_val_prec = cross_validating(GRU_history_lemma)
LSTM_prec, LSTM_val_prec = cross_validating(LSTM_history)
LSTM_lemma_prec, LSTM_lemma_val_prec = cross_validating(LSTM_history_lemma)
biLSTM_prec, biLSTM_val_prec = cross_validating(biLSTM_history)
biLSTM_lemma_prec, biLSTM_lemma_val_prec = cross_validating(biLSTM_history_lemma)

In [None]:
# Per-epoch difference between training and validation precision. Despite the
# variable names this is a gap, not a statistical variance.
simpleRNN_variance = np.subtract(simpleRNN_prec, simpleRNN_val_prec)
simpleRNN_lemma_variance = np.subtract(simpleRNN_lemma_prec, simpleRNN_lemma_val_prec)

In [None]:
from scipy import stats
# Independent two-sample t-test comparing the train/validation gaps of the raw
# and the lemmatized SimpleRNN runs.
# NOTE(review): the two samples are per-epoch values of the same model; a paired
# test (stats.ttest_rel) may be more appropriate — confirm.
stats.ttest_ind(simpleRNN_variance, simpleRNN_lemma_variance)

In [None]:
# MLP: training vs. validation precision, with and without lemmatization.
fig, ax = plt.subplots()
mlp_curves = [
    ('Precision', MLP_prec),
    ('Validation Precision', MLP_val_prec),
    ('Precision (lemma)', MLP_lemma_prec),
    ('Validation Precision (lemma)', MLP_lemma_val_prec),
]
for label, curve in mlp_curves:
    ax.plot(curve, '--o', markersize=3.5, linewidth=1.5, label=label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Precision')
ax.set_title('MLP')
ax.legend()

In [None]:
# Validation precision of every model on the raw folds.
fig, ax = plt.subplots()
model_curves = [
    ('MLP', MLP_val_prec),
    ('CNN', CNN_val_prec),
    ('SimpleRNN', simpleRNN_val_prec),
    ('GRU', GRU_val_prec),
    ('LSTM', LSTM_val_prec),
    ('BiLSTM', biLSTM_val_prec),
]
for label, curve in model_curves:
    ax.plot(curve, '--o', markersize=3.5, linewidth=1.5, label=label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Precision')
ax.set_title('Models')
ax.legend()

In [None]:
# Validation precision of every model on the lemmatized folds.
fig, ax = plt.subplots()
lemma_curves = [
    ('MLP', MLP_lemma_val_prec),
    ('CNN', CNN_lemma_val_prec),
    ('SimpleRNN', simpleRNN_lemma_val_prec),
    ('GRU', GRU_lemma_val_prec),
    ('LSTM', LSTM_lemma_val_prec),
    ('BiLSTM', biLSTM_lemma_val_prec),
]
for label, curve in lemma_curves:
    ax.plot(curve, '--o', markersize=3.5, linewidth=1.5, label=label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Precision')
ax.set_title('Models (lemma)')
ax.legend()

## Rebuild an MLP for Word Embeddings

In [None]:
# Count how often every token occurs in the whole corpus.
corpus = ' '.join(texts).split()
word_counts = {}
for token in corpus:
    word_counts[token] = word_counts.get(token, 0) + 1

# Drop rare tokens: only words seen more than 10 times stay in the texts.
new_texts = [
    ' '.join(token for token in text.split() if word_counts[token] > 10)
    for text in texts
]

# Size used for the Tokenizer and the Embedding layer: the kept vocabulary plus
# two reserved indices (0 = padding, 1 = OOV token). Without the +2 the two
# least frequent kept words would be mapped to OOV.
size = len(set(' '.join(new_texts).split())) + 2

In [None]:
# MLP trained on the full corpus; its Embedding layer supplies the word vectors used below.
model_MLP_full = tf.keras.Sequential()
model_MLP_full.add(tf.keras.layers.Embedding(size, 8, input_length=150))
model_MLP_full.add(tf.keras.layers.Flatten())
model_MLP_full.add(tf.keras.layers.Dense(8, activation='tanh'))
model_MLP_full.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_MLP_full.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_MLP_full.summary()

In [None]:
# Fit the vocabulary on the frequency-filtered texts only.
tokenizer = Tokenizer(num_words=size, oov_token=oov_tok)
tokenizer.fit_on_texts(new_texts)
word_index = tokenizer.word_index
# The unfiltered texts are encoded, so the dropped rare words become the OOV token.
training_sequences = tokenizer.texts_to_sequences(texts)
training_padded = pad_sequences(
    training_sequences,
    maxlen=150,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Train on the whole corpus (no validation split) for 15 epochs.
history_full = model_MLP_full.fit(training_padded, np.array(labels), epochs = 15, verbose = 2)

## Acquire weights

In [None]:
# Invert the tokenizer vocabulary: integer index -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

# The first layer of the model is the Embedding layer; its only weight is the embedding matrix.
e = model_MLP_full.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

## Word Similarities

In [None]:
from collections import Counter
from numpy.linalg import norm
def most_similar(word, top_n=20, max_index=5000):
    """Return the words whose embeddings are most similar (cosine) to that of `word`.

    Reads the module-level `word_index`, `reverse_word_index` and `weights`.

    Args:
        word: query word, looked up in `word_index`.
        top_n: number of (word, similarity) pairs to return.
        max_index: only words with tokenizer index 1..max_index are
            considered, both as query and as candidates.

    Returns:
        '<OOV>' if the word is unknown or outside the considered range;
        otherwise a list of up to `top_n` (word, similarity) pairs sorted by
        decreasing similarity, rounded to 5 decimals. The query word itself is
        included.
    """
    # Tokenizer index i is fed to the Embedding layer as-is, so word i lives in
    # row i of the matrix (row 0 belongs to the padding index). The usable range
    # is capped by the matrix and by the vocabulary, not just by max_index.
    last_index = min(max_index, len(weights) - 1, len(reverse_word_index))
    query_index = word_index.get(word, 0)
    if query_index == 0 or query_index > last_index:
        return '<OOV>'

    query_vec = weights[query_index]
    query_norm = norm(query_vec)
    similarities = Counter()
    for i in range(1, last_index + 1):
        candidate = weights[i]
        denom = query_norm * norm(candidate)
        # A zero vector has no direction; score it 0 instead of dividing by zero.
        cosine = np.dot(query_vec, candidate) / denom if denom else 0.0
        similarities[reverse_word_index[i]] = round(cosine, 5)
    return similarities.most_common(top_n)

In [None]:
# Example query: nearest neighbours of 'flaws' in the learned embedding space.
most_similar('flaws')

## Output weights
The metadata and vector files can be uploaded to http://projector.tensorflow.org/ for visualization.

In [None]:
import io

# Write one word per line to the metadata file and its embedding, tab-separated,
# to the vector file. Tokenizer index i is fed to the Embedding layer as-is, so
# word i lives in row i of `weights` (row 0 belongs to the padding index).
# At most the 5000 most frequent words are exported; the range is also capped
# by the matrix and vocabulary sizes so smaller vocabularies do not fail.
last_index = min(5000, len(weights) - 1, len(reverse_word_index))
with io.open('vecs_pseudo-profound.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta_pseudo-profound.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, last_index + 1):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

## Prediction

In [None]:
# Encode the examples exactly like the training data (same tokenizer, same padding).
example_texts = ["You are basic!", 'Live, laugh, love!']
sequences = tokenizer.texts_to_sequences(example_texts)
padded = pad_sequences(sequences, maxlen=150, padding=padding_type, truncating=trunc_type)
# Run inference once for the whole batch and print one sigmoid output per example.
predictions = model_MLP_full.predict(padded)
for probability in predictions[:, 0]:
    print(probability)