In [None]:
# Mount Google Drive (Colab only) at /content/drive; later cells read the
# dataset CSV and the GloVe vectors from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# importing the necessary libraries
import os, sys

import tensorflow
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

import numpy as np 
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd 
import re
from tqdm import tqdm
 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
en_stop = set(nltk.corpus.stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
from nltk.translate.bleu_score import corpus_bleu
 
from gensim.models.fasttext import FastText

import h5py
from keras.models import model_from_json
import json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Hyper-parameters and size limits for the notebook
BATCH_SIZE = 16  # NOTE(review): not referenced below; model.fit() is called with a literal batch_size=64
EPOCHS = 20  # NOTE(review): not referenced below; model.fit() is called with a literal epochs=50
LSTM_NODES = 256  # units of the encoder and decoder LSTMs; also the decoder embedding width
NUM_SENTENCES = 10000  # NOTE(review): not referenced in the visible cells
MAX_SENTENCE_LENGTH = 50  # NOTE(review): not referenced in the visible cells
MAX_NUM_WORDS = 20000  # num_words passed to both Tokenizers; lower bound for the encoder embedding rows
EMBEDDING_SIZE = 100  # width of the encoder embedding matrix (the GloVe file loaded below is the 100d one)
number = 7000  # NOTE(review): not referenced in the visible cells

In [None]:
# Three independent, initially empty containers for the seq2seq text data:
# encoder input, decoder output and decoder input (in that order).
input_sentences, output_sentences, output_sentences_inputs = [], [], []

In [None]:
# Read the question/SQL pairs from Drive and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the CSV).
df = pd.read_csv('/content/drive/MyDrive/final year project /jumbo_trials/wikisql_v3.csv')
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,question,sql
0,What number is the player that played 1998-2001,SELECT MIN No. FROM table WHERE Years in Toron...
1,What time was the highest for 2nd finishers?,SELECT MAX 2nd FROM table
2,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...
3,When did the Metrostars have their first Rooki...,SELECT MIN Season FROM table WHERE Team = Metr...
4,What is the number of chapters listed for the ...,SELECT MAX Chapters FROM table WHERE Classific...


In [None]:
# (rows, columns) of the loaded frame after dropping 'Unnamed: 0'
df.shape

(7500, 2)

In [None]:
# Build the three parallel text collections used by the seq2seq model:
#   input_sentences          -> encoder input   (the English question)
#   output_sentences         -> decoder target  (SQL followed by ' <eos>')
#   output_sentences_inputs  -> decoder input   (SQL preceded by '<sos> ')
#
# The collections are rebuilt from scratch instead of being appended to, so
# re-running this cell no longer duplicates every sample in the output lists.

input_sentences = df['question'].tolist()
output_sentences = [sql + ' <eos>' for sql in df['sql']]
output_sentences_inputs = ['<sos> ' + sql for sql in df['sql']]

In [None]:
# Sanity check: report how many samples each of the three collections holds
for label, collection in (
    ("Number samples input:", input_sentences),
    ("Number samples output:", output_sentences),
    ("Number samples output input:", output_sentences_inputs),
):
    print(label, len(collection))

Number samples input: 7500
Number samples output: 7500
Number samples output input: 7500


In [None]:
# Show the same randomly chosen sample from each of the three collections
i = np.random.choice(len(input_sentences))
for collection in (input_sentences, output_sentences, output_sentences_inputs):
    print(collection[i])

What is the % similarity to C7orf38 of the animal whose % identity to C7orf38 is 81?
SELECT MAX % Similarity to C7orf38 FROM table WHERE % Identity to C7orf38 = 81 <eos>
<sos> SELECT MAX % Similarity to C7orf38 FROM table WHERE % Identity to C7orf38 = 81


In [None]:
# NOTE(review): the triple-quoted string below is disabled code kept as a bare
# string expression. It performs no assignment; its only effect is that the
# string itself is echoed as the cell output.
'''# due to computational reasons
input_sentences = input_sentences
output_sentences = output_sentences
output_sentences_inputs = output_sentences_inputs'''

'# due to computational reasons\ninput_sentences = input_sentences\noutput_sentences = output_sentences\noutput_sentences_inputs = output_sentences_inputs'

In [None]:
# Convert the text collections to integer sequences.
# A Tokenizer splits each sentence into words and maps every word to an integer;
# texts_to_sequences applies that mapping, and word_index exposes it as a
# word -> integer dictionary. The longest sequence on each side is recorded
# because it is used later as the padding length.

# encoder side
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)
word2idx_inputs = input_tokenizer.word_index
max_input_len = max(map(len, input_integer_seq))

print('Total unique words in the input: %s' % len(word2idx_inputs))
print("Length of longest sentence in input: %g" % max_input_len)

print('\n')

# decoder side: filters='' is passed so the tokenizer is given no characters
# to strip (the SQL text contains the '<sos>' / '<eos>' markers)
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq, output_input_integer_seq = (
    output_tokenizer.texts_to_sequences(texts)
    for texts in (output_sentences, output_sentences_inputs)
)
word2idx_outputs = output_tokenizer.word_index

# +1 because word indices start at 1 (0 is used for padding below)
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(map(len, output_integer_seq))

print('Total unique words in the output: %s' % len(word2idx_outputs))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the input: 9001
Length of longest sentence in input: 46


Total unique words in the output: 10005
Length of longest sentence in the output: 58


In [None]:
# Re-print the vocabulary sizes and recompute the maximum sequence lengths.
# The former `x = x` self-assignments were no-ops and have been removed.
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

print('\n')

print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the input: 9001
Length of longest sentence in input: 46


Total unique words in the output: 10005
Length of longest sentence in the output: 58


In [None]:
# Pad the integer sequences produced by texts_to_sequences to a common length.
# Encoder inputs use pad_sequences' default padding (zeros in front, as the
# printed sample shows); decoder inputs and targets are padded at the end.

# encoder input
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

print('\n')

# decoder input
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

print('\n')

# decoder output (the labels previously read "decoder_input_sequences" by mistake)
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_output_sequences.shape:", decoder_output_sequences.shape)
print("decoder_output_sequences[172]:", decoder_output_sequences[172])

encoder_input_sequences.shape: (7500, 46)
encoder_input_sequences[172]: [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3   2   1
  78 973  46  82 974 723   2 132 272 869]


decoder_input_sequences.shape: (7500, 58)
decoder_input_sequences[172]: [  7   3   9 795   1   2   5  19 707 355   4 184 185 796   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]


decoder_input_sequences.shape: (7500, 58)
decoder_input_sequences[172]: [  3   9 795   1   2   5  19 707 355   4 184 185 796   6   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]


In [None]:
# Size of the decoder softmax / one-hot axis: len(word2idx_outputs) + 1
num_words_output

10006

In [None]:
# Loading the GloVe embeddings.
#
# GloVe, coined from Global Vectors, is a model for distributed word representation.
# The model is an unsupervised learning algorithm for obtaining vector representations
# for words. This is achieved by mapping words into a meaningful space where the
# distance between words is related to semantic similarity.
#
# Each line of the file is "<word> <v1> <v2> ..."; the result is a dict mapping
# word -> float32 vector.

embeddings_dictionary = dict()

# `with` guarantees the file is closed even if parsing a line raises
with open(r'/content/drive/MyDrive/final year project /glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # tolerate blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Build the encoder embedding matrix: row `k` holds the GloVe vector of the
# input word whose tokenizer index is `k`; words missing from GloVe (and the
# unused rows) stay all-zero.
num_words = max(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for token, row in word2idx_inputs.items():
    if token in embeddings_dictionary:
        embedding_matrix[row] = embeddings_dictionary[token]

In [None]:
# Encoder embedding layer, initialised with the GloVe-based matrix built above
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_SIZE,
    weights=[embedding_matrix],
    input_length=max_input_len,
)

In [None]:
# The final layer of the model is a softmax Dense layer trained with
# categorical cross-entropy, so the decoder targets must be one-hot encoded:
# for sample i and time step t, the column equal to the target word index is 1.
# Padding positions (index 0) therefore get a 1 in column 0.
#
# NOTE: with the shape printed below (7500, 58, 10006) and uint16 this array
# needs about 7500 * 58 * 10006 * 2 bytes ~= 8.7 GB of RAM.
# TODO(review): integer targets with a sparse categorical loss would avoid
# materialising it - confirm before changing the training setup.

decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype=np.uint16
)

print(decoder_targets_one_hot.shape)

# Vectorised equivalent of looping over every (sample, step) pair:
# the three index arrays broadcast to (n_samples, max_out_len).
sample_idx = np.arange(decoder_output_sequences.shape[0])[:, None]
step_idx = np.arange(decoder_output_sequences.shape[1])[None, :]
decoder_targets_one_hot[sample_idx, step_idx, decoder_output_sequences] = 1

(7500, 58, 10006)


In [None]:
# Defining the encoder.
# The padded English token ids go through the GloVe-initialised embedding layer
# and then through an LSTM. Only the LSTM's final hidden state (h) and cell
# state (c) are kept; they are handed to the decoder as its initial state.

encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
# return_state=True makes the layer return (output, h, c)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [None]:
# The decoder has two inputs: the encoder's final hidden/cell states and the
# target SQL sentence with an <sos> token prepended (teacher forcing).

decoder_inputs_placeholder = Input(shape=(max_out_len,))

# Decoder embedding is learned from scratch; its width is LSTM_NODES
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True: one output per time step; the returned states are
# discarded during training
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [None]:
# The decoder LSTM output is passed through a softmax Dense layer with one
# unit per output-vocabulary index (num_words_output) to predict the next word.

decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: inputs are [encoder token ids, decoder input token ids],
# output is the per-step softmax over the output vocabulary.
model = Model([encoder_inputs_placeholder,
  decoder_inputs_placeholder], decoder_outputs)
# categorical_crossentropy pairs with the one-hot targets in decoder_targets_one_hot
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train the seq2seq model; validation_split=0.3 holds out 30% of the samples.
# NOTE(review): batch_size and epochs are literals here and differ from the
# BATCH_SIZE (16) and EPOCHS (20) constants defined near the top of the
# notebook, which are not used.
r = model.fit(
    [encoder_input_sequences, decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=64,
    epochs=50,
    validation_split=0.3,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Inference-time encoder: same input placeholder as the training model, but it
# outputs only the final LSTM states [h, c] that seed the decoder.
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

# saving the model to the relative path 'encoder' (reloaded in the frontend section)
encoder_model.save('encoder')





INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: encoder/assets


In [None]:
# Inference-time decoder: the previous hidden and cell states are fed in
# explicitly, one vector of LSTM_NODES values each.
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [None]:
# At inference the decoder consumes one token id per call (shape (1,)),
# embedded with the same decoder_embedding layer used during training.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [None]:
# One decoding step through the already-built decoder LSTM. This rebinds
# decoder_outputs, h and c, which previously referred to training-graph tensors.
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [None]:
# New states to feed back on the next step, and the softmax over the output
# vocabulary computed with the same Dense layer used in training.
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Inference-time decoder model:
#   inputs  = [current token id, previous h, previous c]
#   outputs = [softmax over output vocabulary, new h, new c]
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# saving the model to the relative path 'decoder' (reloaded in the frontend section)
decoder_model.save('decoder')





INFO:tensorflow:Assets written to: decoder/assets


INFO:tensorflow:Assets written to: decoder/assets


In [None]:
# Print the layer-by-layer summary of the training model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 46)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 58)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 46, 100)      2000000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        multiple             2561536     ['input_2[0][0]']                
                                                                                              

In [None]:
# Reverse lookups (integer index -> word) for the input and output vocabularies
idx2word_input = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_target = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

In [None]:
def translate_sentence(input_seq):
    """Greedy-decode one padded encoder sequence into an output sentence.

    The encoder states for `input_seq` seed the decoder, which is then run one
    token at a time starting from '<sos>'. At every step the arg-max token is
    taken; decoding stops at '<eos>' or after `max_out_len` steps. Index 0
    (padding) is fed back into the decoder but not added to the result.

    Args:
        input_seq: batch containing a single padded sequence of input token
            ids, as accepted by `encoder_model.predict`.

    Returns:
        The decoded words joined by single spaces.
    """
    states_value = encoder_model.predict(input_seq)

    # (1, 1) buffer holding the token fed to the decoder at the current step
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']
    eos_id = word2idx_outputs['<eos>']

    decoded_words = []
    for _step in range(max_out_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        idx = np.argmax(output_tokens[0, 0, :])

        if idx == eos_id:
            break

        if idx > 0:
            decoded_words.append(idx2word_target[idx])

        # feed the prediction and the updated states into the next step
        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(decoded_words)

In [None]:
# Translate one randomly chosen sample and show it next to the reference SQL
i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]  # slice keeps the batch dimension
translation = translate_sentence(input_seq)
for label, shown in (
    ('Input:', input_sentences[i]),
    ('Response:', translation),
    ('Original Translation:', output_sentences[i]),
):
    print(label, shown)

Input: Name the total number of japanese for amagasaki
Response: select count japanese from table where name = amagasaki
Original Translation: SELECT COUNT Japanese FROM table WHERE Name = Amagasaki <eos>


In [None]:
# Average sentence-level BLEU over 20 randomly drawn samples.
#
# Fixes compared with the previous version:
#   * sentence_bleu expects a LIST of reference token lists; passing the flat
#     token list made every token be treated as a separate (character-level)
#     reference, so the reported score was not a word-level BLEU.
#   * the '<eos>' marker is dropped from the reference, because
#     translate_sentence never emits it.
#   * a smoothing function is used, as the NLTK warning recommends, so short
#     sentences without higher-order n-gram overlap are scored sensibly.
#
# NOTE(review): samples are drawn from all sentences, which includes the ones
# the model was trained on, so this is not a held-out score.
smoothing = nltk.translate.bleu_score.SmoothingFunction().method1

scores = []
for _ in range(20):
  i = np.random.choice(len(input_sentences))
  input_seq = encoder_input_sequences[i:i+1]
  translation = translate_sentence(input_seq)

  # the decoder tokenizer produces lowercased words, so lowercase the reference too
  reference = [tok for tok in output_sentences[i].lower().split() if tok != '<eos>']
  hypothesis = translation.split()

  score = nltk.translate.bleu_score.sentence_bleu(
      [reference], hypothesis, smoothing_function=smoothing
  )
  scores.append(score)

print('Average: ', sum(scores)/len(scores))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Average:  0.5549612420618882


Frontend work

In [None]:
# Reload the saved inference encoder from the 'encoder' directory.
reconstructed_encoder = keras.models.load_model("encoder")

# NOTE(review): compile() looks unnecessary here - the reloaded model is only
# used through predict() in the cells below; confirm before removing.
reconstructed_encoder.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)





In [None]:
# Reload the saved inference decoder from the 'decoder' directory.
reconstructed_decoder = keras.models.load_model("decoder")

# NOTE(review): compile() looks unnecessary here - the reloaded model is only
# used through predict() in the cells below; confirm before removing.
reconstructed_decoder.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)





In [None]:
# Persist the vocabularies needed by the frontend as JSON files.
# JSON object keys are always strings, so the integer keys of idx2word_target
# are written as strings; readers must look words up with str(index).
for json_path, mapping in (
    ("dict1.json", word2idx_outputs),
    ("dict2.json", idx2word_target),
    ("dict3.json", word2idx_inputs),
):
    with open(json_path, "w") as outfile:
        json.dump(mapping, outfile)

In [None]:
# Reload the vocabularies written above, the way the frontend would.
# Each file is opened in a `with` block so the handle is closed right after
# loading; the old module-level handles `a`, `b` and `c` were never closed,
# and `c` shadowed the name used for the LSTM cell state.
with open("dict1.json") as dict_file:
    word2idx_outputs_frontend = json.load(dict_file)

# keys of this dictionary are strings ("1", "2", ...) after the JSON round trip
with open("dict2.json") as dict_file:
    idx2word_target_frontend = json.load(dict_file)

with open("dict3.json") as dict_file:
    word2idx_inputs_frontend = json.load(dict_file)

In [None]:
# Upper bound on decoding steps used by translate_sentence
max_out_len

58

In [None]:
def translate_sentence(input_seq):
    """Greedy-decode one padded input sequence with the reloaded models.

    Frontend variant of the earlier `translate_sentence` (it replaces that
    definition when this cell runs): it uses `reconstructed_encoder`,
    `reconstructed_decoder` and the JSON-loaded vocabularies.

    Args:
        input_seq: a batch with a single padded sequence of input token ids;
            a nested list or an array are both accepted.

    Returns:
        The decoded words joined by single spaces. Decoding stops at '<eos>'
        or after `max_out_len` steps; index 0 (padding) is not emitted.
    """
    # Normalise nested Python lists to an array before calling predict
    input_seq = np.asarray(input_seq)
    states_value = reconstructed_encoder.predict(input_seq)

    # (1, 1) buffer holding the token fed to the decoder at the current step
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs_frontend['<sos>']
    eos = word2idx_outputs_frontend['<eos>']
    output_sentence = []

    for _ in range(max_out_len):
        output_tokens, h, c = reconstructed_decoder.predict([target_seq] + states_value)
        idx = int(np.argmax(output_tokens[0, 0, :]))

        if eos == idx:
            break

        if idx > 0:
            # JSON turned the integer keys into strings, so convert only for
            # the lookup; idx itself stays numeric for the feedback below
            # (it used to be overwritten with a str and written into target_seq).
            output_sentence.append(idx2word_target_frontend[str(idx)])

        target_seq[0, 0] = idx
        states_value = [h, c]

    return ' '.join(output_sentence)

In [None]:
text = "How many people work in the HR department"
# other sample questions:
# text = "What are the students first names who have cats as pets"
# text = 'Who are all of the players on the Westchester High School club team?'
# text = 'What official languages are spoken in the country whose capital city is Canberra'


def preprocess(text, max_len=46):
  """Encode a free-text question like the training pipeline and translate it.

  The text is lowercased, the punctuation that the Keras Tokenizer strips by
  default is replaced by spaces, and every remaining word is mapped through
  `word2idx_inputs_frontend`. Words missing from the vocabulary are skipped,
  as the training Tokenizer was created without an OOV token. The sequence is
  then cut / left-padded with zeros to `max_len`, matching the default
  `pad_sequences` behaviour used for the encoder inputs.

  Previous defects fixed here:
    * an out-of-vocabulary word left `temp` unbound (or stale), so the call
      either crashed or silently repeated the previous word's id;
    * unknown words were inserted into the vocabulary with ids the model was
      never trained on;
    * `lst2` was undefined for questions with `max_len` or more words;
    * trailing punctuation ("team?") made known words look unknown.

  Args:
      text: the natural-language question.
      max_len: encoder input length; defaults to 46, the value this function
          previously hard-coded (the longest training question).

  Returns:
      The SQL-like sentence produced by `translate_sentence`.
  """
  # Same characters as the Tokenizer's default `filters` argument
  tokenizer_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  cleaned = text.lower().translate(
      str.maketrans(tokenizer_filters, ' ' * len(tokenizer_filters))
  )

  txt_seq = [
      word2idx_inputs_frontend[word]
      for word in cleaned.split()
      if word in word2idx_inputs_frontend
  ]

  # Keep the last `max_len` tokens, then left-pad with zeros
  txt_seq = txt_seq[-max_len:]
  padded = [0] * (max_len - len(txt_seq)) + txt_seq

  return translate_sentence(np.array([padded]))

lst3 = preprocess(text)
print(lst3)

select count country from table where us (in lakh income


In [None]:
import gensim.downloader as api

In [None]:
# Load the pre-trained "glove-wiki-gigaword-100" vectors through gensim's downloader API
word_vectors = api.load("glove-wiki-gigaword-100") 

In [None]:
# Display the loaded object (a gensim KeyedVectors instance, per the output below)
word_vectors

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f6032054950>

In [None]:
# Word-analogy sanity check on the loaded vectors: woman + king - man
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
# First entry of the result is unpacked as a (word, similarity) pair
most_similar_key, similarity = result[0] 

In [None]:
# Print the top hit with its similarity rounded to 4 decimals
print(f"{most_similar_key}: {similarity:.4f}")

queen: 0.7699


In [None]:
# Print the embedding vector stored for the top hit
print(word_vectors[most_similar_key])

[-0.50045  -0.70826   0.55388   0.673     0.22486   0.60281  -0.26194
  0.73872  -0.65383  -0.21606  -0.33806   0.24498  -0.51497   0.8568
 -0.37199  -0.58824   0.30637  -0.30668  -0.2187    0.78369  -0.61944
 -0.54925   0.43067  -0.027348  0.97574   0.46169   0.11486  -0.99842
  1.0661   -0.20819   0.53158   0.40922   1.0406    0.24943   0.18709
  0.41528  -0.95408   0.36822  -0.37948  -0.6802   -0.14578  -0.20113
  0.17113  -0.55705   0.7191    0.070014 -0.23637   0.49534   1.1576
 -0.05078   0.25731  -0.091052  1.2663    1.1047   -0.51584  -2.0033
 -0.64821   0.16417   0.32935   0.048484  0.18997   0.66116   0.080882
  0.3364    0.22758   0.1462   -0.51005   0.63777   0.47299  -0.3282
  0.083899 -0.78547   0.099148  0.039176  0.27893   0.11747   0.57862
  0.043639 -0.15965  -0.35304  -0.048965 -0.32461   1.4981    0.58138
 -1.132    -0.60673  -0.37505  -1.1813    0.80117  -0.50014  -0.16574
 -0.70584   0.43012   0.51051  -0.8033   -0.66572  -0.63717  -0.36032
  0.13347  -0.56075 ]


In [None]:
# NOTE(review): `pre` and `post` are not defined in any visible cell, so this
# line raises NameError on a fresh kernel. sentence_bleu takes a list of
# reference token lists and one hypothesis token list - presumably `pre` is the
# reference and `post` the hypothesis; confirm and define them before this cell.
BLEUscore = nltk.translate.bleu_score.sentence_bleu([pre], post)