In [1]:
# features: sentence embeddings+noun embedding+adj embeddding
# or named entity embeddings
# column edit distance??

# https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a

# Wiki Pre Trained with Fasttext https://fasttext.cc/docs/en/english-vectors.html
# Advances in Pre-Training Distributed Word Representations

In [1]:
# import spaCy??
# https://spacy.io/
import io
import json
import spacy
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier



# 1. Functions

In [3]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    cnt = 0
    for line in fin:
        cnt += 1
        if cnt % 100000 == 0:
            print(cnt)
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [4]:
def load_questions(fname):
    with open(fname) as f:
        questions_list = f.readlines()

    filtered_questions_list = []
    for question in questions_list:
        question_json = json.loads(question)
        question_json.pop('question1')
        question_json.pop('question2')
        filtered_questions_list.append(question_json)
            
    return filtered_questions_list

In [5]:
def load_tables(part_file_name):
    with open('../WikiSQL/data/'+part_file_name+'.tables.jsonl') as f:
        tables_list = f.readlines()
        
    tables = {}
    for table in tables_list:
        table_json = json.loads(table)
        tables[table_json['id']] = table_json
    return tables

In [6]:
def sentence_embedding(sentence, spacy_model, fastText_model):
    # weight more on nouns that are not stop words
    embed_dim = 300
    noun_amplify = 3
    doc = nlp(sentence)
    embed_res = np.zeros((embed_dim,))
    doc_len = len(doc)
    noun_cnt = 0
    for token in doc:
        if token.lemma_ in fastText_model:
            if token.pos_ == 'NOUN' and token.is_stop == False:
                embed_res += noun_amplify*np.array(fastText_model[token.lemma_])
                noun_cnt += 1
            else:
                embed_res += np.array(fastText_model[token.lemma_])
        else:
            embed_res += np.zeros((embed_dim,))
#         print(fastText_model[token.lemma_][-1], embed_res[-1])
    embed_res = np.array(embed_res)/(doc_len+(noun_amplify-1)*noun_cnt)
    return embed_res

In [7]:
def headers_embedding(headers, spacy_model, fastText_model):
    embed_dim = 300
    header_embedding = np.empty((0, embed_dim))
    for col_name in headers:
        col_embed = sentence_embedding(col_name, spacy_model, fastText_model)
        header_embedding = np.vstack((header_embedding, col_embed))
    return header_embedding

In [8]:
def question_Xy(question_embedding, header_embedding, sel_ind, conds_ind):
    question_embed = 600
    question_X = np.empty((0, question_embed))
    question_y = np.empty((0,1))
    for header_ind in range(len(header_embedding)):
        question_X = np.vstack((question_X, np.concatenate((question_embedding, header_embedding[header_ind]))))
        if header_ind == sel_ind or header_ind == conds_ind:
            question_y = np.vstack((question_y, np.array(1)))
        else:
            question_y = np.vstack((question_y, np.array(0)))
    return question_X, question_y

In [9]:
def obtain_Xy(question_list, table_dict, spacy_model, fastText_model):
    question_embed = 600
    aggreation_embed = 300
    question_Xs = np.empty((0, question_embed))
    question_ys = np.empty((0,1))
    aggreation_Xs = np.empty((0, aggreation_embed))
    aggregation_ys = np.empty((0,1))
    for question in question_list:
        question_embedding = sentence_embedding(question['question'], spacy_model, fastText_model)
        header_embedding = headers_embedding(table_dict[question['table_id']]['header'], spacy_model, \
                                           fastText_model)
        
        question_X, question_y = question_Xy(question_embedding, header_embedding, question['sql']['sel'], \
                                                  question['sql']['conds'][0][0])
        question_Xs = np.vstack((question_Xs, question_X))
        question_ys = np.vstack((question_ys, question_y))
        
        aggreation_Xs = np.vstack((aggreation_Xs, question_embedding))
        if question['sql']['agg'] == 5:
            agg = 0
        else:
            agg = 1
        aggregation_ys = np.vstack((aggregation_ys, np.array(agg)))
        
    return question_Xs, question_ys, aggreation_Xs, aggregation_ys

# 2. Load Prepared Data

In [2]:
column_train_X, column_train_y, agg_train_X, agg_train_y = np.load('train.npy', allow_pickle=True)
column_test_X, column_test_y, agg_test_X, agg_test_y = np.load('test.npy', allow_pickle=True)
train_lstm_X, train_lstm_y = np.load('train_lstm.npy', allow_pickle=True)
test_lstm_X, test_lstm_y = np.load('test_lstm.npy', allow_pickle=True)
test_np = np.load('test_individual.npy', allow_pickle=True)

# 3. Models

## Random Forest for Aggregation Method Prediction

In [12]:
agg_clf = RandomForestClassifier()
agg_clf.fit(agg_train_X,agg_train_y)
agg_y_pred = agg_clf.predict(agg_test_X)
accuracy_score(agg_test_y, agg_y_pred)

  


0.9

In [13]:
agg_clf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## MLP Classifier for Aggregation Method Prediction

In [14]:
agg_clf = MLPClassifier(hidden_layer_sizes=(256,128))
agg_clf.fit(agg_train_X,agg_train_y)
agg_y_pred = agg_clf.predict(agg_test_X)
accuracy_score(agg_test_y, agg_y_pred)

  y = column_or_1d(y, warn=True)


0.9666666666666667

In [15]:
agg_clf

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(256, 128), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

## Random Forest for Column Selection Prediction

In [6]:
col_clf = RandomForestClassifier()
col_clf.fit(column_train_X, column_train_y)

# test_questions = questions[50:]
test_tot = test_np.shape[1]
correct_cnt = 0
for ind in range(test_tot):
    col_test_X = test_np[0][ind]
    col_test_y = test_np[1][ind]
    col_y_pred = col_clf.predict(col_test_X)
#     print(col_test_y.shape, col_y_pred.shape)
    acc = accuracy_score(col_test_y, col_y_pred)
    if acc == 1.0:
        correct_cnt+=1
        print(ind)
print("accuracy:", correct_cnt/test_tot)
# np.intersect1d(np.where(column_test_y==1)[0], np.where(col_y_pred==1))

  


2
9
12
13
14
20
22
25
26
27
28
accuracy: 0.36666666666666664


In [7]:
column_y_pred = col_clf.predict(column_test_X)
accuracy_score(column_test_y, column_y_pred)

0.8181818181818182

In [10]:
col_clf

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(512, 256, 64), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

## MLP Classifier for Column Selection Prediction

In [8]:
col_clf = MLPClassifier(hidden_layer_sizes=(512,256,64))
# col_clf = RandomForestClassifier()
col_clf.fit(column_train_X, column_train_y)

# test_questions = questions[50:]
test_tot = test_np.shape[1]
correct_cnt = 0
for ind in range(test_tot):
    col_test_X = test_np[0][ind]
    col_test_y = test_np[1][ind]
    col_y_pred = col_clf.predict(col_test_X)
#     print(col_test_y.shape, col_y_pred.shape)
    acc = accuracy_score(col_test_y, col_y_pred)
    if acc == 1.0:
        correct_cnt+=1
        print(ind)
print("accuracy:", correct_cnt/test_tot)
# np.intersect1d(np.where(column_test_y==1)[0], np.where(col_y_pred==1))

  y = column_or_1d(y, warn=True)


4
7
12
13
14
17
19
22
28
29
accuracy: 0.3333333333333333


In [9]:
column_y_pred = col_clf.predict(column_test_X)
accuracy_score(column_test_y, column_y_pred)

0.8609625668449198

In [39]:
col_clf

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(512, 256, 64), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)