In [1]:
# import spaCy??
# https://spacy.io/
import io
import json
import spacy
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier



# 1. Functions

In [2]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    cnt = 0
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            print(cnt)
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [3]:
def load_questions(fname):
    with open(fname) as f:
        questions_list = f.readlines()

    filtered_questions_list = []
    for question in questions_list:
        question_json = json.loads(question)
        question_json.pop('question1')
        question_json.pop('question2')
        filtered_questions_list.append(question_json)
            
    return filtered_questions_list

In [4]:
def load_tables(part_file_name):
    with open('../WikiSQL/data/'+part_file_name+'.tables.jsonl') as f:
        tables_list = f.readlines()
        
    tables = {}
    for table in tables_list:
        table_json = json.loads(table)
        tables[table_json['id']] = table_json
    return tables

In [5]:
def sentence_embedding(sentence, spacy_model, fastText_model):
    # weight more on nouns that are not stop words
    embed_dim = 300
    noun_amplify = 3
    doc = nlp(sentence)
    embed_res = np.zeros((embed_dim,))
    doc_len = len(doc)
    noun_cnt = 0
    for token in doc:
        if token.lemma_ in fastText_model:
            if token.pos_ == 'NOUN' and token.is_stop == False:
                embed_res += noun_amplify*np.array(fastText_model[token.lemma_])
                noun_cnt += 1
            else:
                embed_res += np.array(fastText_model[token.lemma_])
        else:
            embed_res += np.zeros((embed_dim,))
#         print(fastText_model[token.lemma_][-1], embed_res[-1])
    embed_res = np.array(embed_res)/(doc_len+(noun_amplify-1)*noun_cnt)
    return embed_res

In [6]:
def headers_embedding(headers, spacy_model, fastText_model):
    embed_dim = 300
    header_embedding = np.empty((0, embed_dim))
    for col_name in headers:
        col_embed = sentence_embedding(col_name, spacy_model, fastText_model)
        header_embedding = np.vstack((header_embedding, col_embed))
    return header_embedding

In [7]:
def question_Xy(question_embedding, header_embedding, sel_ind, conds_ind):
    question_embed = 600
    question_X = np.empty((0, question_embed))
    question_y = np.empty((0,1))
    for header_ind in range(len(header_embedding)):
        question_X = np.vstack((question_X, np.concatenate((question_embedding, header_embedding[header_ind]))))
        if header_ind == sel_ind or header_ind == conds_ind:
            question_y = np.vstack((question_y, np.array(1)))
        else:
            question_y = np.vstack((question_y, np.array(0)))
    return question_X, question_y

In [8]:
def obtain_Xy(question_list, table_dict, spacy_model, fastText_model):
    question_embed = 600
    aggreation_embed = 300
    question_Xs = np.empty((0, question_embed))
    question_ys = np.empty((0,1))
    aggreation_Xs = np.empty((0, aggreation_embed))
    aggregation_ys = np.empty((0,1))
    for question in question_list:
        question_embedding = sentence_embedding(question['question'], spacy_model, fastText_model)
        header_embedding = headers_embedding(table_dict[question['table_id']]['header'], spacy_model, \
                                           fastText_model)
        
        question_X, question_y = question_Xy(question_embedding, header_embedding, question['sql']['sel'], \
                                                  question['sql']['conds'][0][0])
        question_Xs = np.vstack((question_Xs, question_X))
        question_ys = np.vstack((question_ys, question_y))
        
        aggreation_Xs = np.vstack((aggreation_Xs, question_embedding))
        if question['sql']['agg'] == 5:
            agg = 0
        else:
            agg = 1
        aggregation_ys = np.vstack((aggregation_ys, np.array(agg)))
        
    return question_Xs, question_ys, aggreation_Xs, aggregation_ys

In [9]:
# https://datascience.stackexchange.com/questions/48796/how-to-feed-lstm-with-different-input-array-sizes

In [10]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Bidirectional
from keras.utils import Sequence


Using TensorFlow backend.


In [11]:
class MyBatchGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, X, y, batch_size=1, shuffle=True):
        'Initialization'
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.y)/self.batch_size))

    def __getitem__(self, index):
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Shuffles indexes after each epoch'
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        Xb = np.empty((self.batch_size, *self.X[index].shape))
        yb = np.empty((self.batch_size, *self.y[index].shape))
        # naively use the same sample over and over again
        for s in range(0, self.batch_size):
            Xb[s] = self.X[index]
            yb[s] = self.y[index]
        return Xb, yb

# 2. Load Prepared Data for LSTM

In [12]:
column_train_X, column_train_y, agg_train_X, agg_train_y = np.load('train.npy', allow_pickle=True)
column_test_X, column_test_y, agg_test_X, agg_test_y = np.load('test.npy', allow_pickle=True)
train_lstm_X, train_lstm_y = np.load('train_lstm.npy', allow_pickle=True)
test_lstm_X, test_lstm_y = np.load('test_lstm.npy', allow_pickle=True)
train_lstm2_X, train_lstm2_y = np.load('train_lstm2.npy', allow_pickle=True)
test_lstm2_X, test_lstm2_y = np.load('test_lstm2.npy', allow_pickle=True)
X_train_agg_lstm, y_train_agg_lstm = np.load('train_agg_lstm.npy', allow_pickle=True)
X_test_agg_lstm, y_test_agg_lstm = np.load('test_agg_lstm.npy', allow_pickle=True)

In [13]:
train_lstm_X_list = []
train_lstm_y_list = []
for sample,label in zip(train_lstm_X, train_lstm_y):
    train_lstm_X_list += list(sample)
    train_lstm_y_list += list(label)
    
train_lstm_X_list = np.array(train_lstm_X_list)
train_lstm_y_list = np.array(train_lstm_y_list)

In [14]:
test_lstm_X_list = []
test_lstm_y_list = []
for sample,label in zip(test_lstm_X, test_lstm_y):
    test_lstm_X_list += list(sample)
    test_lstm_y_list += list(label)
    
test_lstm_X_list = np.array(test_lstm_X_list)
test_lstm_y_list = np.array(test_lstm_y_list)

In [15]:
train_lstm2_X_list = []
train_lstm2_y_list = []
for sample,label in zip(train_lstm2_X, train_lstm2_y):
    train_lstm2_X_list += list(sample)
    train_lstm2_y_list += list(label)
    
train_lstm2_X_list = np.array(train_lstm2_X_list)
train_lstm2_y_list = np.array(train_lstm2_y_list)

In [16]:
test_lstm2_X_list = []
test_lstm2_y_list = []
for sample,label in zip(test_lstm2_X, test_lstm2_y):
    test_lstm2_X_list += list(sample)
    test_lstm2_y_list += list(label)
    
test_lstm2_X_list = np.array(test_lstm2_X_list)
test_lstm2_y_list = np.array(test_lstm2_y_list)

# 3. Padding Data to Make Each Sample Have the Same Length

In [17]:
X_train_padded = sequence.pad_sequences(train_lstm_X_list, padding='post')
X_test_padded = sequence.pad_sequences(test_lstm_X_list, padding='post')
print(X_train_padded.shape, X_test_padded.shape)

(2135, 23, 300) (187, 23, 300)


In [18]:
X_train2_padded = sequence.pad_sequences(train_lstm2_X_list, padding='post')
X_test2_padded = sequence.pad_sequences(test_lstm2_X_list, padding='post')
print(X_train2_padded.shape, X_test2_padded.shape)

(2135, 27, 300) (187, 28, 300)


In [19]:
X_train_agg_padded = sequence.pad_sequences(X_train_agg_lstm, padding='post')
X_test_agg_padded = sequence.pad_sequences(X_test_agg_lstm, padding='post')
print(X_train_agg_padded.shape, X_test_agg_padded.shape)

(320, 20, 300) (30, 20, 300)


In [20]:
sequence.pad_sequences(train_lstm_X_list, padding='post').shape

(2135, 23, 300)

# 4. Column Selection Prediction

## LSTM

In [22]:
batch_size = 32
model = Sequential()
model.add(LSTM(256, return_sequences=True, dropout=0, recurrent_dropout=0))
model.add(LSTM(128, return_sequences=True, dropout=0, recurrent_dropout=0))
model.add(LSTM(64, dropout=0, recurrent_dropout=0))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_padded, train_lstm_y_list, batch_size=32,
          epochs=2)

model.summary()

score, acc = model.evaluate(X_test_padded, test_lstm_y_list,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/2
Epoch 2/2
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 23, 256)           570368    
_________________________________________________________________
lstm_2 (LSTM)                (None, 23, 128)           197120    
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 816,961
Trainable params: 816,961
Non-trainable params: 0
_________________________________________________________________
Test score: 0.6312216762552925
Test accuracy: 0.6791443824768066


## Bidirectional LSTM

In [23]:
model = Sequential()
model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0, recurrent_dropout=0)))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0, recurrent_dropout=0)))
model.add(Bidirectional(LSTM(64, dropout=0, recurrent_dropout=0)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_padded, train_lstm_y_list, batch_size=32,
          epochs=2)

score, acc = model.evaluate(X_test_padded, column_test_y,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/2
Epoch 2/2
Test score: 0.641189912742472
Test accuracy: 0.6791443824768066


# 5. Aggregation Method prediction

## LSTM

In [26]:
model = Sequential()
model.add(LSTM(256, return_sequences=True, dropout=0, recurrent_dropout=0))
model.add(LSTM(128, return_sequences=True, dropout=0, recurrent_dropout=0))
model.add(LSTM(64, dropout=0, recurrent_dropout=0))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_agg_padded, y_train_agg_lstm, batch_size=16,
          epochs=5)

score, acc = model.evaluate(X_train_agg_padded, y_train_agg_lstm,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 0.6759211480617523
Test accuracy: 0.59375


## Bidirectional LSTM

In [27]:
model = Sequential()
model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0, recurrent_dropout=0)))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0, recurrent_dropout=0)))
model.add(Bidirectional(LSTM(64, dropout=0, recurrent_dropout=0)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train_agg_padded, y_train_agg_lstm, batch_size=32,
          epochs=2)

score, acc = model.evaluate(X_test_agg_padded, y_test_agg_lstm,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/2
Epoch 2/2
Test score: 0.6183054447174072
Test accuracy: 0.8666666746139526
