In [1]:
import os, re, string, sys
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from models import InferSent
import torch
import numpy as np

In [3]:
# Load the training questions from the TREC file. Each non-empty line is
# expected to look like "<label>:<sublabel> <question text>" (presumably the
# TREC coarse:fine label pair -- confirm against the data file); only the
# question text is kept.
xtrain = []

with open("../../data/question_classification/trec_train.txt", 'rb') as f:
    questions = [raw_line.decode('utf8').strip() for raw_line in f]

for q in questions:
    if not q:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # Split on the FIRST colon only: a plain split(":") would cut off any
    # question whose text itself contains a colon.
    after_label = q.split(":", 1)[1]
    # The first space-separated token is the sub-label; the rest is the question.
    xtrain.append(" ".join(after_label.split(" ")[1:]))
        
def preprocess_text(data):
    """Strip punctuation and collapse repeated spaces in each string of `data`.

    For every input string, all characters in `string.punctuation` are
    deleted, leading/trailing whitespace is trimmed, and each run of spaces
    is then replaced by a single space.

    Args:
        data: iterable of strings.

    Returns:
        A new list with one cleaned string per input string.
    """
    # Translation table that deletes every punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for text in data:
        without_punct = text.translate(drop_punctuation).strip()
        cleaned.append(re.sub(' +', ' ', without_punct))
    return cleaned

# Normalise the loaded training questions (punctuation removed, spaces collapsed).
# NOTE(review): in the cells shown here, xtrain is only referenced again by the
# commented-out `voc = xtrain` line, so this result is currently unused.
xtrain = preprocess_text(xtrain)

In [4]:
# Load the NarrativeQA question/answer table and display its summary statistics.
NQA_CSV_PATH = "../../data/narrativeqa_qas.csv"
nqa = pd.read_csv(NQA_CSV_PATH)
nqa.describe()

Unnamed: 0,document_id,set,question,answer1,answer2,question_tokenized,answer1_tokenized,answer2_tokenized
count,46765,46765,46765,46765,46765,46765,46765,46765
unique,1572,3,46134,41246,40456,46072,40928,39974
top,e0c74cdf270ebe29a2139e7319fc7314738c88ee,train,Where does the story take place?,London,London,Where does the story take place ?,London,London
freq,50,32747,46,64,60,47,64,62


In [5]:
# Sentences used to build the InferSent vocabulary: the NarrativeQA questions.
voc = list(nqa['question'])
#voc = xtrain


# V selects both the checkpoint file name and the 'version' entry of the
# encoder parameters below.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}

# Instantiate the encoder and restore its weights from the checkpoint.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

# Word-vector file handed to the encoder (the file name indicates 300-d
# vectors, in line with 'word_emb_dim' above).
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

# Build the encoder vocabulary from the sentences in `voc`.
infersent.build_vocab(voc, tokenize=True)

Found 22192(/24222) words with w2v vectors
Vocab size : 22192


In [7]:
# Load the TensorFlow SavedModel that is called on each encoded question below.
model = tf.saved_model.load('../../dialogue/models/qc_rnn_model')

In [14]:
# Score every NarrativeQA question one at a time: encode it with InferSent,
# wrap each 4096-d embedding as a (1, 4096) slice, and collect the model's
# output tensor for that question.
question_labels = []
for question in nqa['question']:
    sentence_embedding = infersent.encode(np.array([question]), tokenize=True)
    model_input = tf.convert_to_tensor(
        np.array([vec.reshape(1, 4096) for vec in sentence_embedding])
    )
    question_labels.append(model(model_input))

In [15]:
# Show the first ten questions followed by their raw model outputs.
print(nqa['question'].head(10).tolist())
print(question_labels[:10])

['Who is Mark Hunter?', 'Where does this radio station take place?', "Why do more students tune into Mark's show?", 'Who commits suicide?', 'What does Paige jam into her microwave?', 'What does Mark do with his radio station?', 'What does Mark tell the protesting students?', 'Who gets arrested?', 'What does the radio show cause?', 'Where does Mark Broadcast his station from?']
[<tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[ 3.1534648, -1.1778678,  8.37542  , -1.0564224, -0.2581335,
        -3.3743525]], dtype=float32)>, <tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[ 1.4706217 ,  6.412529  , -0.28377095, -1.6566306 ,  0.666564  ,
        -0.19584668]], dtype=float32)>, <tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[ 8.053342  , -3.3266823 ,  0.57010335,  1.3968505 , -3.3724198 ,
        -1.5120921 ]], dtype=float32)>, <tf.Tensor: shape=(1, 6), dtype=float32, numpy=
array([[ 2.0118954, -6.0298796, 10.868199 ,  1.9843329, -0.6029846,
        -4.4573765]], dtyp

In [19]:
# Predicted class id for each question = index of the largest model output.
nqa['predicted_answer_class'] = list(map(np.argmax, question_labels))

In [21]:
# Summary statistics of the predicted class ids.
# NOTE(review): the ids are category indices, so mean/std/quantiles are not
# meaningful; value_counts() would show the class distribution directly.
nqa['predicted_answer_class'].describe()

count    46765.000000
mean         1.588218
std          1.403264
min          0.000000
25%          0.000000
50%          2.000000
75%          2.000000
max          5.000000
Name: predicted_answer_class, dtype: float64

In [22]:
# Write the annotated table as 'pred_nqa.csv' inside the archive 'pred_nqa.zip'.
compression_opts = {'method': 'zip', 'archive_name': 'pred_nqa.csv'}
nqa.to_csv('pred_nqa.zip', index=False, compression=compression_opts)

In [24]:
# Draw 100 rows for manual checking. random_state pins the draw so that a
# fresh kernel run reviews the same rows instead of a different random subset.
sample = nqa.sample(n=100, random_state=42)

# Class name -> integer class id.
# NOTE(review): the ordering presumably mirrors the classifier's output
# indices -- confirm against the code that trained the model.
cls_dict = {
  'DESC': 0,
  'LOC': 1,
  'HUM': 2,
  'ENTY': 3,
  'ABBR': 4,
  'NUM': 5
}

In [None]:
# Manual review of the sampled predictions: show each question with its
# reference answers and predicted class id, then record what the reviewer types.
labels = []
# Iterating a DataFrame directly yields its column names (strings), so the
# rows have to be walked explicitly with iterrows().
for _, row in sample.iterrows():
    print(row['question'])
    # The table has 'answer1' and 'answer2' columns; there is no 'answer' column.
    print(row['answer1'])
    print(row['answer2'])
    print(row['predicted_answer_class'])
    corr = input()
    # Keep the reviewer's input; previously it was read and then discarded.
    labels.append(corr)
    