In [1]:
import pandas as pd
from sqlalchemy import create_engine
import torch
import os

#from utils.dataset import ExtendedWikiSQL

from preprocessing.word_embedding import *

# Path of the SQLite database file without its extension; '.db' is appended below.
db_name = 'data/db/wikisql'
# SQLAlchemy engine for that SQLite file; a later cell passes it to pandas as `con`.
engine = create_engine('sqlite:///{}.db'.format(db_name))

In [2]:
# Read the whole 'EWikiSQL' table into a DataFrame.
extended_wikisql = pd.read_sql_table('EWikiSQL', con=engine)

# 'header' and 'targets' are stored as '|'-joined strings; turn each cell into a list.
for list_column in ('header', 'targets'):
    extended_wikisql[list_column] = extended_wikisql[list_column].apply(
        lambda joined: joined.split('|')
    )

extended_wikisql.head()

Unnamed: 0,index,table_id,header,question,targets
0,0,1-1672976-2,"[Date of appointment, City of license, Neutral...",What institution had 6 wins and a current stre...,"[0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,1,2-14245-3,"[Country, Opponent, Area (1930) in 1,000skm 2,...",Capital of brześć nad bugiem has what populati...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,2-17692986-2,"[Wrestler, Total Passengers, Time, Production ...",What time is listed against the Wrestler Jimmy...,"[1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]"
3,3,2-15039040-6,"[Legs, Other details, Project Name, Class AA, ...",What is the rank for the 96 floors?,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, ..."
4,4,2-17306260-2,"[Away team score, Manufacturer, Film title use...",Which second team had a first leg score of 83-69?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [3]:
# Directory and file names handed to `load_word_emb`.
data_dir = os.path.join('data', 'glove')
word2idx_path, usedwordemb_path = 'word2idx.json', 'usedwordemb.npy'

# NOTE(review): `load_word_emb` / `WordEmbedding` come from the star-import of
# preprocessing.word_embedding; presumably they load the vocabulary and vectors
# and wrap them in a callable lookup — confirm against that module.
word_emb = load_word_emb(data_dir, word2idx_path, usedwordemb_path)
embedding = WordEmbedding(word_emb)

In [40]:
import torch
import os


class ExtendedWikiSQL():
    """In-memory list of (input, target) tensor pairs built from EWikiSQL rows.

    Each sample is laid out as

        <BEG> question tokens <SEQ> header entries <END>

    The target sequence has the same length as the input sequence and is 0
    everywhere except at header positions whose indicator equals '1'.
    """

    def __init__(self):
        """Start with no samples loaded."""
        self.inputs, self.targets = [], []

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict with the keys 'input' and 'target'."""
        return {
            'input': self.inputs[idx],
            'target': self.targets[idx]
        }

    @staticmethod
    def _tokenize(question):
        """Split a question string on whitespace; pass token sequences through.

        Iterating a raw string yields single characters, which is not what a
        word-level embedding lookup expects, so strings are split first.
        """
        if isinstance(question, str):
            return question.split()
        return list(question)

    def load_from_df(self, ewikisql_df, embedding, initialize=True):
        """Convert DataFrame rows into tensors and store them on the instance.

        Args:
            ewikisql_df: DataFrame with the columns 'header' (sequence of
                column names), 'question' (string, or an already tokenized
                sequence) and 'targets' (one indicator per header entry).
            embedding: callable mapping a token string to its vector.
                Assumed to return a fixed-size numeric sequence that
                `torch.Tensor` can stack — TODO confirm against WordEmbedding.
            initialize: when True (default) previously loaded samples are
                discarded first; when False new samples are appended.
        """
        if initialize:
            self.inputs, self.targets = [], []

        for _, row in ewikisql_df.iterrows():
            header, target = row['header'], row['targets']
            question_tokens = self._tokenize(row['question'])

            question_vectors = [embedding(token.lower()) for token in question_tokens]
            header_vectors = [embedding(column.lower()) for column in header]
            # NOTE(review): '<SEQ>' may have been meant as a separator token
            # such as '<SEP>'; kept unchanged because the vocabulary is not
            # visible from here.
            input_sequence = (
                [embedding('<BEG>')]
                + question_vectors
                + [embedding('<SEQ>')]
                + header_vectors
                + [embedding('<END>')]
            )

            # Question positions and the three marker tokens are never targets.
            question_labels = [0] * len(question_tokens)
            # Compare via str() so integer indicators (1) are treated like '1'
            # instead of silently becoming 0.
            header_labels = [1 if str(indicator) == '1' else 0 for indicator in target]
            target_sequence = [0] + question_labels + [0] + header_labels + [0]

            self.inputs.append(torch.Tensor(input_sequence))
            self.targets.append(torch.LongTensor(target_sequence))

    def load_from_torch(self, path):
        """Load samples from '<path>_inputs.pt' and '<path>_targets.pt'.

        `torch.load` relies on pickle, so only load files from a trusted source.
        """
        self.inputs = torch.load('{}_inputs.pt'.format(path))
        self.targets = torch.load('{}_targets.pt'.format(path))

    def save_to_torch(self, path):
        """Save samples to '<path>_inputs.pt' and '<path>_targets.pt'.

        The parent directory of `path` is created when it does not exist yet.
        """
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torch.save(self.inputs, '{}_inputs.pt'.format(path))
        torch.save(self.targets, '{}_targets.pt'.format(path))

In [41]:
# Build the dataset from only the first 5 rows of the frame (`head(5)`).
data = ExtendedWikiSQL()
data.load_from_df(extended_wikisql.head(5), embedding)

In [42]:
# Writes 'training/data/ewikisql_inputs.pt' and 'training/data/ewikisql_targets.pt'.
data.save_to_torch('training/data/ewikisql')

---