In [1]:
import os
import random
import numpy as np

from collections import namedtuple

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


# TREC - Question Answering (multi-class)

In [2]:
# TREC - Question Answering
# http://cogcomp.cs.illinois.edu/Data/QA/QC/

TREC_Question = namedtuple("TREC_Question", "label question")

trec_train = set()
trec_test = set()

for filename in os.listdir("TREC/"):
    with open("TREC/"+filename,'r', encoding='latin_1') as f_input:
        for line in f_input:
            label, question = line.split(' ', 1)
            label = label.split(':')[0]
            question = TREC_Question(label, question.strip())
            if filename=='TREC_10.label':
                trec_test.add(question)
            else:
                trec_train.add(question)

In [3]:
print("Train Samples: {}".format(len(trec_train)))
print("Test Samples : {}".format(len(trec_test)))
print("Labels       : {}".format({x.label for x in trec_train}))

Train Samples: 5381
Test Samples : 500
Labels       : {'ENTY', 'LOC', 'ABBR', 'NUM', 'HUM', 'DESC'}


In [4]:
# built two lists with sentences and labels
questions_train = [x.question for x in trec_train]
labels_train = [x.label for x in trec_train]

# convert list of tokens/words to indexes
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions_train)
sequences_train = tokenizer.texts_to_sequences(questions_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# get the max sentence lenght, needed for padding
max_input_lenght = max([len(x) for x in sequences_train])
print("Max. sequence lenght: ", max_input_lenght)

# pad all the sequences of indexes to the 'max_input_lenght'
data_train = pad_sequences(sequences_train, maxlen=max_input_lenght, padding='post', truncating='post')

# Encode the labels, each must be a vector with dim = num. of possible labels
le = LabelEncoder()
le.fit(labels_train)
labels_encoded_train = le.transform(labels_train)
categorical_labels_train = to_categorical(labels_encoded_train, num_classes=None)
print('Shape of train data tensor:', data_train.shape)
print('Shape of train label tensor:', categorical_labels_train.shape)

Found 8461 unique tokens.
Max. sequence lenght:  33
Shape of train data tensor: (5381, 33)
Shape of train label tensor: (5381, 6)


## TREC: test data

In [5]:
# pre-process test data
questions_test = [x.question for x in trec_test]
y_test = [x.label for x in trec_test]
sequences_test = tokenizer.texts_to_sequences(questions_test)
x_test = pad_sequences(sequences_test, maxlen=max_input_lenght)

labels_encoded_test = le.transform(y_test)
categorical_labels_test = to_categorical(labels_encoded_test, num_classes=None)
print('Shape of test data tensor:', x_test.shape)
print('Shape of test labels tensor:', categorical_labels_test.shape)

Shape of test data tensor: (500, 33)
Shape of test labels tensor: (500, 6)


In [7]:
from convnets_utils import *

# CNN with random word embeddings

In [9]:
model_1 = get_cnn_rand(300, len(word_index)+1, max_input_lenght, 6)

In [10]:
history = model_1.fit(x=data_train, y=categorical_labels_train, batch_size=50, epochs=10)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
loss, accuracy = model_1.evaluate(x_test, categorical_labels_test, verbose=0)
accuracy

0.9016666593551635

In [12]:
raw_predictions = model_1.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

              precision    recall  f1-score   support

        ABBR       1.00      0.44      0.62         9
        DESC       0.57      0.34      0.43       138
        ENTY       0.37      0.68      0.48        94
         HUM       0.93      0.86      0.90        65
         LOC       0.90      0.85      0.87        81
         NUM       0.97      0.89      0.93       113

   micro avg       0.68      0.68      0.68       500
   macro avg       0.79      0.68      0.70       500
weighted avg       0.73      0.68      0.69       500



# CNN with pre-trained static word embeddings

In [13]:
embeddings_index = load_fasttext_embeddings()
embeddings_matrix = create_embeddings_matrix(embeddings_index, word_index, 100)
embedding_layer_static = get_embeddings_layer(embeddings_matrix, 'embedding_layer_static', max_input_lenght, trainable=False)
model_2 = get_cnn_pre_trained_embeddings(embedding_layer_static, max_input_lenght, 6)

Loaded 400000 word vectors.
Matrix shape: (8462, 100)


In [14]:
history = model_2.fit(x=data_train, y=categorical_labels_train, batch_size=50, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
loss, accuracy = model_2.evaluate(x_test, categorical_labels_test, verbose=0)
accuracy

0.8816666488647461

In [16]:
raw_predictions = model_2.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

              precision    recall  f1-score   support

        ABBR       1.00      0.11      0.20         9
        DESC       0.78      0.05      0.10       138
        ENTY       0.31      0.74      0.44        94
         HUM       0.96      0.69      0.80        65
         LOC       0.55      0.68      0.61        81
         NUM       0.70      0.73      0.72       113

   micro avg       0.52      0.52      0.52       500
   macro avg       0.72      0.50      0.48       500
weighted avg       0.66      0.52      0.48       500



# CNN with pre-trained dynamic word embeddings

In [17]:
embedding_layer_dynamic = get_embeddings_layer(embeddings_matrix, 'embedding_layer_dynamic', max_input_lenght, trainable=True)
model_3 = get_cnn_pre_trained_embeddings(embedding_layer_dynamic, max_input_lenght, 6)

In [18]:
history = model_3.fit(x=data_train, y=categorical_labels_train, batch_size=50, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
loss, accuracy = model_3.evaluate(x_test, categorical_labels_test, verbose=0)
accuracy

0.8899999933242798

In [20]:
raw_predictions = model_3.predict(x_test)
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

              precision    recall  f1-score   support

        ABBR       1.00      0.56      0.71         9
        DESC       0.60      0.04      0.08       138
        ENTY       0.69      0.70      0.70        94
         HUM       0.93      0.82      0.87        65
         LOC       0.73      0.89      0.80        81
         NUM       0.44      0.91      0.59       113

   micro avg       0.61      0.61      0.61       500
   macro avg       0.73      0.65      0.63       500
weighted avg       0.65      0.61      0.54       500



# CNN multichanell with pre-trained dynamic and static word embeddings

In [21]:
model_4 = get_cnn_multichannel(embedding_layer_static, embedding_layer_dynamic, max_input_lenght, 6)

In [22]:
history = model_4.fit(x=[data_train, data_train], y=categorical_labels_train, batch_size=50, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
loss, accuracy = model_4.evaluate(x=[x_test,x_test], y=categorical_labels_test, verbose=0)
accuracy

0.9206666488647461

In [24]:
raw_predictions = model_4.predict([x_test, x_test])
class_predictions = [np.argmax(x) for x in raw_predictions]
print(classification_report(y_test, le.inverse_transform(class_predictions)))

              precision    recall  f1-score   support

        ABBR       1.00      0.67      0.80         9
        DESC       0.76      0.12      0.20       138
        ENTY       0.39      0.81      0.53        94
         HUM       0.89      0.85      0.87        65
         LOC       0.66      0.88      0.76        81
         NUM       0.93      0.89      0.91       113

   micro avg       0.65      0.65      0.65       500
   macro avg       0.77      0.70      0.68       500
weighted avg       0.73      0.65      0.61       500

