# Question Classification

**Aim**: Establish baseline(s) and subsequent models for question classification using the Trec(6) dataset.

It would also be good to test these models on a small test set of labelled questions from narrativeQA

In [1]:
'''
Imports
'''

import os, re, string, sys
import sys
import pandas as pd
import nltk 
from nltk.tokenize import TweetTokenizer
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.compose import ColumnTransformer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F

***

In [19]:
'''
Load/parse dataset
'''

xtrain = []
ytrain =[]
xtest = []
ytest = []    

# Train
with open("../../data/question_classification/trec_train.txt", 'rb') as f:
    questions = [x.decode('utf8').strip() for x in f.readlines()]
    for q in questions:
        splt = q.replace("\n", "").split(":")
        ytrain.append(splt[0])
        xtrain.append(" ".join(splt[1].split(" ")[1:]))
        
# Test
with open("../../data/question_classification/trec_test.txt", 'rb') as f:
    questions = [x.decode('utf8').strip() for x in f.readlines()]
    for q in questions:
        splt = q.replace("\n", "").split(":")
        ytest.append(splt[0])
        xtest.append(" ".join(splt[1].split(" ")[1:]))
        
classes = dict(enumerate(list(set(ytrain))))
reverse_classes = {v: k for k, v in classes.items()}

print("Classes: ", list(reverse_classes.keys()))
print("Classes counts:")
for cls in list(reverse_classes.keys()):
    print("- {}: {}".format(cls, len([x for x in ytrain if x == cls])))
    
print(reverse_classes)

Classes:  ['DESC', 'LOC', 'HUM', 'ENTY', 'ABBR', 'NUM']
Classes counts:
- DESC: 1162
- LOC: 835
- HUM: 1223
- ENTY: 1250
- ABBR: 86
- NUM: 896
{'DESC': 0, 'LOC': 1, 'HUM': 2, 'ENTY': 3, 'ABBR': 4, 'NUM': 5}


In [3]:
'''
Preprocess data
'''

def preprocess_text(data):
    # Remove punctuation
    exclude = set(string.punctuation)
    data = [''.join(ch for ch in x if ch not in exclude).strip() for x in data]
    
    # Remove multi-spaces
    data = [re.sub(' +', ' ', x) for x in data]
    return data

xtrain = preprocess_text(xtrain)
xtest = preprocess_text(xtest)

print("Training instances: ", len(xtrain))
print("Testing instances: ", len(xtest))
print("\nTraining examples:")
print(xtrain[:5])

Training instances:  5452
Testing instances:  500

Training examples:
['How did serfdom develop in and then leave Russia', 'What films featured the character Popeye Doyle', 'How can I find a list of celebrities real names', 'What fowl grabs the spotlight after the Chinese Year of the Monkey', 'What is the full form of com']


In [8]:
'''
Inspect data
'''

for cls in reverse_classes.keys():
    print("Class {} question examples: \n".format(cls))
    idxs = [i for i, x in enumerate(ytrain) if x == cls]
    for ex in np.array(xtrain)[idxs][:10]:
        print("{}".format(ex))
    print("-"*40)

Class HUM question examples: 

What contemptible scoundrel stole the cork from my lunch
What team did baseball s St Louis Browns become
What is the oldest profession
Name the scarfaced bounty hunter of The Old West
Who was The Pride of the Yankees
Who killed Gandhi
Name 11 famous martyrs
Who was the inventor of silly putty
Which company that manufactures videogame hardware sells the super system
What 1920s cowboy star rode Tony the Wonder Horse
----------------------------------------
Class DESC question examples: 

How did serfdom develop in and then leave Russia
How can I find a list of celebrities real names
What are liver enzymes
Why do heavier objects travel downhill faster
What did the only repealed amendment to the US Constitution deal with
What is Nine Inch Nails
What is an annotated bibliography
What s the Olympic motto
What is the origin of the name Scarlett
What do Mormons believe
----------------------------------------
Class LOC question examples: 

What sprawling US state

***

In [None]:
'''
Util functions
'''

def plot_confusion_matrix(test_predictions):
    cm = confusion_matrix(ytest, test_predictions)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(xticks_rotation='vertical')

***

## Baseline 1 - Question words

Create and test a simple baseline that makes use of question words to classify questions into one of the 6 question classes.

In [None]:
'''
Establish baseline 1
'''
        
def get_named_entities(tokens):
    entities = nltk.chunk.ne_chunk(nltk.pos_tag(tokens))
    return entities

# Jonathan's baseline adapted
class SimpleBaseline:
    def __init__(self):
        self.categories = classes.keys()
                            
    def classify_question(self, question):
        question_tokens = word_tokenize(question) 
        words_and_labels = get_named_entities(question_tokens)
        entities = [ne for ne in words_and_labels if isinstance(ne, nltk.tree.Tree)]

        labels = [e.label() for e in entities]

        # HUM
        who_condition = "who" in [t.lower() for t in question_tokens]
        whose_condition = "whose" in [t.lower() for t in question_tokens]
        whos_condition = "who's" in [t.lower() for t in question_tokens]
        whom_condition = "whom" in [t.lower() for t in question_tokens]
        
        which_condition = "which" in [t.lower() for t in question_tokens]


        # ENTY
        what_condition = "what" in [t.lower() for t in question_tokens]
        
        # LOC
        where_condition = "where" in [t.lower() for t in question_tokens]
        
        # DESC
        when_condition = "when" in [t.lower() for t in question_tokens]
        why_condition = "why" in [t.lower() for t in question_tokens]
        how_condition = "how" in [t.lower() for t in question_tokens]
        
        was_condition = "was" in [t.lower() for t in question_tokens]
        did_condition = "did" in [t.lower() for t in question_tokens]
        
        named_person_condition = "PERSON" in labels

        
        if who_condition or whose_condition or whos_condition or whom_condition or (which_condition and named_person_condition) or [t.lower() for t in question_tokens[:1]] == ["name"]:
            return("HUM") #We want a person's name
        
                
        if when_condition or [t.lower() for t in question_tokens[:2]] == ["how", "many"] or (when_condition and (was_condition or did_condition)) or (what_condition and 'date' in [t.lower() for t in question_tokens]):
            return("NUM") #Number

        
        if what_condition and "stand" in [t.lower() for t in question_tokens] and "for" in [t.lower() for t in question_tokens] or "abbreviation" in [t.lower() for t in question_tokens] :
            return("ABBR")
        
                
        if where_condition or (what_condition and ("country" in [t.lower() for t in question_tokens] or "state" in [t.lower() for t in question_tokens])):
            return("LOC") #Location
        
        if [t.lower() for t in question_tokens[:2]] == ["what", "is"]:
            return("DESC")
        
        if (what_condition and not named_person_condition) or which_condition:
            return("ENTY")
        
        
        else:
            return("DESC")

In [None]:
'''
Test baseline 1 (does not need training)
'''

SB = SimpleBaseline()

predictions_b1 = []

bs_x = xtest
bs_y = ytest

for q in bs_x:
    pred_cls = SB.classify_question(q)
    predictions_b1.append(pred_cls)

print(classification_report(bs_y, predictions_b1))

In [None]:
i = 5
print(classes[i])
print(len([x for x in predictions_b1 if x == classes[i]]))

In [None]:
plot_confusion_matrix(predictions_b1) # The labels for this are currently wrong

***

## Baseline 2 - NLU package

Import and test a pre-trained question classifier from the Natural Language Understanding library and a more advanced baseline.

In [None]:
#nlu_cls = nlu.load('en.classify.trec6')

In [None]:
# predictions_b2 = []

# bs_x = xtest
# bs_y = ytest

# for q in bs_x:
#     pred_cls = nlu_cls.predict(q)
#     predictions_b2.append(pred_cls)
    
# print(classification_report(bs_y, predictions_b2))

***

## BOW Logistic Regression model

In [None]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [None]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

punctuations = string.punctuation

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if str(word) not in stop_words and str(word) not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [None]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter = 2000)

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

In [None]:
ytrain_enc = [reverse_classes[x] for x in ytrain]
ytest_enc = [reverse_classes[x] for x in ytest]

In [None]:
pipe.fit(xtrain, ytrain_enc)

In [None]:
predicted = pipe.predict(xtest)

In [None]:
print(classification_report(bs_y, [classes[y] for y in predicted]))

***
## Sentence Embedding Logistic Regression model


In [None]:
class GloveVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([self.nlp(text).vector for text in X])

In [None]:
nlp = spacy.load("en_core_web_sm")

column_preprocessor = ColumnTransformer(
    [
        ('text_glove', GloveVectorTransformer(nlp), 'text'),
    ],
    remainder='drop',
    n_jobs=1
)

# Create pipeline using sentence embedding 
pipe2 = Pipeline([('column_preprocessor', column_preprocessor),
                 ('classifier', classifier)])

In [None]:
xtrain_df = pd.DataFrame(np.array(xtrain))
xtrain_df.columns = ["text"]
pipe2.fit(xtrain_df, ytrain_enc)

In [None]:
xtest_df = pd.DataFrame(np.array(xtest))
xtest_df.columns = ["text"]
predicted = pipe2.predict(xtest_df)

In [None]:
print(classification_report(bs_y, [classes[y] for y in predicted]))

***
## InferSent model - Log Reg 

Inspired by SOTA on paperswithcode

In [None]:
# Load in pre-trained encoder
from models import InferSent
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}

infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

In [None]:
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [None]:
infersent.build_vocab(xtrain, tokenize=True)

In [None]:
inf_train = infersent.encode(xtrain, tokenize=True)

In [None]:
classifier.fit(inf_train, ytrain_enc)

In [None]:
inf_test = infersent.encode(xtest, tokenize=True)
predicted = classifier.predict(inf_test)

In [None]:
print(classification_report(bs_y, [classes[y] for y in predicted]))

***
## InferSent model - CNN


In [None]:
#embedding size = 300 

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        
        # Dropout definition
        self.dropout = nn.Dropout(0.25)
        
        self.outsize = 10
        
        # Kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5
        
        # Conv layers
        self.conv_1 = nn.Conv1d(1, self.outsize, self.kernel_1, 1) 
        self.conv_2 = nn.Conv1d(1, self.outsize, self.kernel_2, 1)
        self.conv_3 = nn.Conv1d(1, self.outsize, self.kernel_3, 1)
        self.conv_4 = nn.Conv1d(1, self.outsize, self.kernel_4, 1)
        self.pool_1 = nn.MaxPool1d(self.kernel_1)
        self.pool_2 = nn.MaxPool1d(self.kernel_2)
        self.pool_3 = nn.MaxPool1d(self.kernel_3)
        self.pool_4 = nn.MaxPool1d(self.kernel_4)
        self.fc = nn.Linear(52520, 6)

    def forward(self, x):
        # Convolution layer 1 is applied
        x1 = self.conv_1(x)
        x1 = F.leaky_relu(x1, negative_slope=0.03)
        x1 = self.pool_1(x1)

        # Convolution layer 2 is applied
        x2 = self.conv_2(x)
        x2 = F.leaky_relu(x2, negative_slope=0.03)
        x2 = self.pool_2(x2)

        # Convolution layer 3 is applied
        x3 = self.conv_3(x)
        x3 = F.leaky_relu(x3, negative_slope=0.03)
        x3 = self.pool_3(x3)

        # Convolution layer 4 is applied
        x4 = self.conv_4(x)
        x4 = F.leaky_relu(x4, negative_slope=0.03)
        x4 = self.pool_4(x4)

        # The output of each convolutional layer is concatenated into a unique vector
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)
                
        # The "flattened" vector is passed through a fully connected layer
        out = self.fc(union)
        
        # Dropout is applied
        out = self.dropout(out)
        
        return out.squeeze()

In [None]:
# Hyperparameters
cnn_epochs = 20
cnn_lr = 0.01
cnn_lambda = 0.005
batchSize = 50

# Instatiate model
myCNN = CNN()
optimizer = optim.SGD(myCNN.parameters(), lr=cnn_lr, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
load_data = [(v, ytrain_enc[i]) for i, v in enumerate(inf_train)]

trainloader = torch.utils.data.DataLoader(load_data, batch_size=batchSize, shuffle=True, 
                                          num_workers=2, worker_init_fn=np.random.seed(1))

In [None]:
running_loss = 0 

for epoch in range(cnn_epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # Get the inputs; data is a list of [inputs, labels]
        
        inputs, labels = data

        inputs = inputs.unsqueeze(1)
        
        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = criterion(myCNN(inputs), labels)
        loss = output.item()

        # Backward pass
        output.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss
        if i % 200 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

        # Stop criterion
        if abs(loss) < 1e-2:
            break

In [None]:
testloader = torch.utils.data.DataLoader(inf_test, batch_size=batchSize, shuffle=True, 
                                          num_workers=2, worker_init_fn=np.random.seed(1))

In [None]:
predicted = []
with torch.no_grad():
    for data in testloader:
        inputs = data.unsqueeze(1)
        outputs = myCNN(inputs)
        _, pred = torch.max(outputs, 1)
        print(pred)
        predicted += [x.item() for x in list(pred)]

In [None]:
print(classification_report(bs_y, [classes[y] for y in predicted]))

***

## Infersent RNN with Tensorflow

In [5]:
# Load in pre-trained encoder
from models import InferSent
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}

infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

infersent.build_vocab(xtrain, tokenize=True)

Found 8990(/9281) words with w2v vectors
Vocab size : 8990


In [6]:
import tensorflow as tf
import math
import matplotlib.pyplot as plt

tf.random.set_seed(1)

ytrain_enc = [reverse_classes[x] for x in ytrain]
ytest_enc = [reverse_classes[x] for x in ytest]
inf_train = infersent.encode(xtrain, tokenize=True) 
inf_test = infersent.encode(xtest, tokenize=True)

In [7]:
BATCH_SIZE = 100
num_train_examples = len(inf_train)

In [8]:
inf_train = np.array([x.reshape(1, 4096) for x in inf_train])
inf_test = np.array([x.reshape(1, 4096) for x in inf_test])

tf_train = tf.convert_to_tensor(np.array(inf_train))
tf_test = tf.convert_to_tensor(np.array(inf_test))
tf_y = tf.convert_to_tensor(np.array(ytrain_enc))
tf_test_y = tf.convert_to_tensor(np.array(ytest_enc))

In [9]:
tf_train_data = tf.data.Dataset.from_tensor_slices((tf_train, tf_y))
tf_test_data = tf.data.Dataset.from_tensor_slices((tf_test, tf_test_y))

In [10]:
train_dataset = tf_train_data.cache().repeat().shuffle(num_train_examples).batch(BATCH_SIZE)
test_dataset = tf_test_data.cache().batch(BATCH_SIZE)

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4096, dropout=0.2, recurrent_dropout=0.2)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6)
])

In [12]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [13]:
history = model.fit(train_dataset, epochs=10, steps_per_epoch=math.ceil(num_train_examples/BATCH_SIZE))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
results = model.evaluate(test_dataset)



In [15]:
history = model.fit(train_dataset, epochs=1, steps_per_epoch=math.ceil(num_train_examples/BATCH_SIZE))
results = model.evaluate(test_dataset)



In [16]:
history = model.fit(train_dataset, epochs=1, steps_per_epoch=math.ceil(num_train_examples/BATCH_SIZE))
results = model.evaluate(test_dataset)



In [17]:
export_path_sm = "./{}".format("rnn_model")
tf.saved_model.save(model, export_path_sm)
!ls {export_path_sm}

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./rnn_model/assets
assets	saved_model.pb	variables


In [18]:
!zip -r model.zip {export_path_sm}

  adding: rnn_model/ (stored 0%)
  adding: rnn_model/variables/ (stored 0%)
  adding: rnn_model/variables/variables.data-00000-of-00001 (deflated 46%)
  adding: rnn_model/variables/variables.index (deflated 68%)
  adding: rnn_model/saved_model.pb (deflated 91%)
  adding: rnn_model/assets/ (stored 0%)


In [None]:
try:
  .download('./model.zip')
except ImportError:
  pass

In [None]:
history = model.fit(train_dataset, epochs=1, steps_per_epoch=math.ceil(num_train_examples/BATCH_SIZE))
results = model.evaluate(test_dataset)

***
## Infersent CNN with tensorflow

In [None]:
inf_train_cnn = np.array([x.reshape(4096, 1) for x in inf_train])
inf_test_cnn = np.array([x.reshape(4096, 1) for x in inf_test])

tf_train_cnn = tf.convert_to_tensor(np.array(inf_train_cnn))
tf_test_cnn = tf.convert_to_tensor(np.array(inf_test_cnn))

tf_train_data = tf.data.Dataset.from_tensor_slices((tf_train_cnn, tf_y))
tf_test_data = tf.data.Dataset.from_tensor_slices((tf_test_cnn, tf_test_y))

train_dataset = tf_train_data.cache().repeat().shuffle(num_train_examples).batch(BATCH_SIZE)
test_dataset = tf_test_data.cache().batch(BATCH_SIZE)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(32, 4, activation='relu', input_shape = (4096, 1)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Conv1D(32, 4, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(6)
])

In [None]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
history = model.fit(train_dataset, epochs=2, steps_per_epoch=math.ceil(num_train_examples/BATCH_SIZE))

In [None]:
results = model.evaluate(test_dataset)