# In [1]:
import numpy as np
from datetime import datetime

class QA_Preprocessor:
    """Convert tab-separated question/answer/label lines into id sequences.

    Every input line is lower-cased and split on tab characters: column 0 is
    the question, column 1 the answer and column 2 the label.  Sentences are
    split on whitespace; each previously unseen word receives the next free
    vocabulary id (ids start at 1, 0 is the padding value).  Every sentence
    is encoded as a list of exactly ``length_limit`` ids: longer sentences
    are truncated, shorter ones are right-padded with 0.

    Instance attributes:
        file_path: path handed to the constructor.
        length_limit: fixed length of every encoded sentence.
        max_token: largest word count (before truncation) among the
            sentences encoded by ``load_raw_file``.
        vocab: dict mapping word -> integer id.
        data: dict with keys 'q', 'a', 'y' (see the two loaders for the
            layout each of them produces).
        data_dict: per-column cache mapping sentence text -> encoded ids.
    """

    def __init__(self, file_path, length_limit = 40):
        """Set up empty state and immediately load ``file_path``.

        Args:
            file_path: UTF-8 text file in the tab-separated format
                described on the class.
            length_limit: number of ids every encoded sentence is
                truncated or padded to.
        """
        self.file_path = file_path
        self.max_token = 0
        self.vocab = {}
        self.data = {'q': [], 'a': [], 'y': []}
        self.data_dict = {'q': {}, 'a': {}}
        self.length_limit = length_limit
        self.load_raw_file(file_path)
        now = datetime.now()
        print('[{:s}] Data_load done. Max token size: {:d}'.format(str(now), self.max_token))

    @staticmethod
    def _parse_label(field):
        """Return the integer formed by the digit characters of ``field``.

        Non-digit characters (sign, newline, spaces) are dropped before the
        conversion.  Raises ValueError when ``field`` holds no digit.
        """
        return int(''.join(ch for ch in field if ch.isdigit()))

    def _encode_sentence(self, column, sent, track_max=False):
        """Return a fresh copy of the id list for ``sent``, encoding it if new.

        Args:
            column: 'q' or 'a'; selects which cache in ``data_dict`` is used.
            sent: the sentence text (already lower-cased by the caller).
            track_max: when True, a newly encoded sentence may raise
                ``max_token`` to its word count.
        """
        cache = self.data_dict[column]
        if sent not in cache:
            words = sent.split()
            if track_max and self.max_token < len(words):
                self.max_token = len(words)
            encoded = [0] * self.length_limit
            # zip with range() stops after length_limit words (truncation).
            for idx, word in zip(range(self.length_limit), words):
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab) + 1
                encoded[idx] = self.vocab[word]
            cache[sent] = encoded
        # Copy so callers can never mutate the cached encoding.
        return cache[sent][:]

    def _store_eval_group(self, labels, sentences):
        """Append one question group to ``data`` as 2-D numpy arrays."""
        self.data['y'].append(np.array(labels).reshape((-1, 1)))
        for key in ['q', 'a']:
            block = np.array(sentences[key]).reshape((-1, self.length_limit))
            self.data[key].append(block)

    def load_raw_file(self, file_path):
        """Load a file into flat arrays, one row per input line.

        After the call ``data['q']`` and ``data['a']`` are numpy arrays of
        shape (-1, length_limit) and ``data['y']`` has shape (-1, 1).
        ``data`` must hold lists on entry (as after ``__init__`` set-up or
        ``reset_data()``).  Blank lines are skipped.

        Raises:
            IndexError: a non-blank line has fewer than three columns.
            ValueError: the label column contains no digit.
        """
        # 'with' guarantees the file handle is closed, even on a parse error.
        with open(file_path, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                fields = line.lower().split('\t')
                self.data['y'].append(self._parse_label(fields[2]))
                for i, column in enumerate(['q', 'a']):
                    self.data[column] += self._encode_sentence(
                        column, fields[i], track_max=True)
        # Make a form of datasets to (-1, length_limit)
        for column in ['q', 'a']:
            self.data[column] = np.array(self.data[column]).reshape((-1, self.length_limit))
        # Make a form of binary labels to (-1, 1)
        self.data['y'] = np.array(self.data['y']).reshape((-1, 1))

    def load_raw_eval_file(self, file_path):
        """Load a file grouped by question, keeping only groups with a label 1.

        A new group starts whenever a line's question is not yet in
        ``data_dict['q']``.  A finished group is appended to ``data`` only
        if at least one of its labels equals 1; each stored group adds one
        array of shape (-1, length_limit) to ``data['q']`` and ``data['a']``
        and one of shape (-1, 1) to ``data['y']``.  Blank lines are skipped.

        ``data`` values must be lists on entry; ``load_raw_file`` leaves
        numpy arrays there, so call ``reset_data()`` in between.

        Raises:
            IndexError: a non-blank line has fewer than three columns.
            ValueError: the label column contains no digit.
        """
        has_positive = False
        labels = []
        sentences = {'q': [], 'a': []}

        with open(file_path, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                fields = line.lower().split('\t')
                # An unseen question closes the current group and opens a new one.
                if fields[0] not in self.data_dict['q']:
                    if has_positive:
                        self._store_eval_group(labels, sentences)
                    has_positive = False
                    labels = []
                    sentences = {'q': [], 'a': []}
                for i, column in enumerate(['q', 'a']):
                    sentences[column] += self._encode_sentence(column, fields[i])
                label = self._parse_label(fields[2])
                if label == 1:
                    has_positive = True
                labels.append(label)

        # The last group has no following question to trigger its storage,
        # so it must be flushed explicitly once the file is exhausted.
        if has_positive:
            self._store_eval_group(labels, sentences)

    def reset_data(self):
        """Clear ``data`` and the sentence cache; ``vocab`` and ``max_token`` are kept."""
        self.data = {'q': [], 'a': [], 'y': []}
        self.data_dict = {'q': {}, 'a': {}}
        