In [1]:
import numpy as np
from text_processor import *


In [1]:

class DataGen:
    """One-hot encoder and endless batch generator for (root, word, features) samples.

    The samples come from ``get_feature_array()`` or, when ``reverse=True``,
    from ``get_reverse_feature_array()``; both are expected to be provided by
    the ``text_processor`` star-import, together with the ``char2int`` and
    ``int2char`` lookup tables used by the encoding helpers.

    The first 70% of the samples (by position) form the training split and
    the remainder forms the validation split.
    """

    # Fraction of the samples (taken from the front) used as the training split.
    TRAIN_FRACTION = .7

    def __init__(self, reverse=False, *args, **kwargs):
        """Load the samples and record the encoding dimensions.

        Args:
            reverse: When true, load the data with
                ``get_reverse_feature_array()`` and set ``n_features`` and
                ``pred_feat_shapes`` (needed by ``gen_rev``). Otherwise load it
                with ``get_feature_array()`` and set ``word_feat_len``.
            *args, **kwargs: Accepted and ignored.
        """
        if reverse:
            (self.roots, self.words, self.featArray,
             mr, mw, n_features) = get_reverse_feature_array()
            self.n_features = n_features
            # Length of every per-feature vector of the first sample; gen_rev
            # builds one output array per entry.
            self.pred_feat_shapes = [len(feat) for feat in self.featArray[0]]
        else:
            self.roots, self.words, self.featArray, mr, mw = get_feature_array()
            self.word_feat_len = len(self.featArray[0])
        self.n_chars = len(char2int)
        self.max_root_len = mr
        # +2 leaves room for the '&' markers that encond_input_output puts
        # before and after the target word.
        self.max_output_len = mw + 2

    # ------------------------------------------------------------------
    # Batch scheduling helpers
    # ------------------------------------------------------------------

    def _batch_range(self, batch_size, trainset):
        """Return ``(first, stop)`` batch indices (stop exclusive) of a split.

        Floor division is used on purpose: true division yields a float that
        the integer batch counter practically never equals, so the generators
        would never wrap around and would eventually index past the data.

        Raises:
            ValueError: If ``batch_size`` is not positive or the split does
                not contain a single complete batch.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        n_samples = len(self.words)
        if trainset:
            first = 0
            stop = int(n_samples * self.TRAIN_FRACTION) // batch_size
        else:
            first = int(n_samples * self.TRAIN_FRACTION / batch_size)
            stop = n_samples // batch_size
        if stop <= first:
            raise ValueError(
                "not enough samples for a single batch of size %d" % batch_size)
        return first, stop

    def _iter_batch_indices(self, batch_size, n_batches, trainset):
        """Endlessly yield the ``range`` of sample indices of each batch.

        Wraps back to the first batch of the split after the last complete
        batch, or after ``n_batches`` batches (counted from the start of the
        split) when ``n_batches`` is positive.
        """
        first, stop = self._batch_range(batch_size, trainset)
        batch = first
        while True:
            yield range(batch * batch_size, (batch + 1) * batch_size)
            batch += 1
            # n_batches is a count, so compare it with the number of batches
            # served since `first`, not with the absolute batch index (which
            # made it ineffective for the validation split).
            if batch == stop or batch - first == n_batches:
                batch = first

    def _encode_batch(self, indices, add_channel_axis):
        """Encode the samples at ``indices`` into model inputs and targets.

        Returns:
            ``([roots, target_inputs, features], targets)`` as numpy arrays.
            When ``add_channel_axis`` is true each encoded root gets a
            trailing axis of size 1.
        """
        rootX, target_inX, featX, y = [], [], [], []
        for i in indices:
            root_encoded, target_encoded, target_in_encoded = \
                self.encond_input_output(self.roots[i], self.words[i])
            if add_channel_axis:
                root_encoded = root_encoded.reshape(root_encoded.shape + (1,))
            rootX.append(root_encoded)
            target_inX.append(target_in_encoded)
            featX.append(self.featArray[i])
            y.append(target_encoded)
        return [np.array(rootX), np.array(target_inX), np.array(featX)], np.array(y)

    # ------------------------------------------------------------------
    # Public generators
    # ------------------------------------------------------------------

    def gen2(self, batch_size=100, n_batches=-1, trainset=True):
        """Endless batch generator; encoded roots carry a trailing size-1 axis.

        Args:
            batch_size: Number of samples per batch.
            n_batches: If positive, restart from the first batch of the split
                after this many batches; the default ``-1`` uses every
                complete batch of the split.
            trainset: Truthy for the training split, falsy for validation.

        Yields:
            ``([roots, target_inputs, features], targets)``.

        Raises:
            ValueError: On the first ``next()`` if the split holds no complete
                batch.
        """
        for indices in self._iter_batch_indices(batch_size, n_batches, trainset):
            yield self._encode_batch(indices, add_channel_axis=True)

    def gen_rnn(self, batch_size=100, n_batches=-1, trainset=True):
        """Same as ``gen2`` but the encoded roots stay two-dimensional."""
        for indices in self._iter_batch_indices(batch_size, n_batches, trainset):
            yield self._encode_batch(indices, add_channel_axis=False)

    def gen_rev(self, batch_size=100, n_batches=-1, trainset=True):
        """Endless generator mapping encoded target words to feature vectors.

        Requires an instance created with ``reverse=True`` (it reads
        ``pred_feat_shapes``).

        Yields:
            ``(targets, preds)`` where ``targets`` stacks the one-hot target
            words with a trailing size-1 axis and ``preds`` is a list holding
            one ``(batch_size, s)`` array per entry ``s`` of
            ``pred_feat_shapes``.
        """
        for indices in self._iter_batch_indices(batch_size, n_batches, trainset):
            # Fresh arrays for every batch: reusing one set of buffers would
            # overwrite a batch that a consumer is still holding on to.
            preds = [np.zeros((batch_size, s)) for s in self.pred_feat_shapes]
            targets = []
            for row, i in enumerate(indices):
                word_feature = self.featArray[i]
                _, target_encoded = self.encond_rev_input_output(
                    self.roots[i], self.words[i])
                for j, pred in enumerate(preds):
                    pred[row] = word_feature[j]
                targets.append(target_encoded.reshape(target_encoded.shape + (1,)))
            yield np.array(targets), preds

    # ------------------------------------------------------------------
    # Encoding helpers
    # ------------------------------------------------------------------

    def word2vec(self, word, max_chars):
        """One-hot encode ``word`` into a ``(max_chars, n_chars)`` array.

        Rows past the end of the word are filled with the one-hot code of the
        space character. A word longer than ``max_chars`` raises
        ``IndexError``.
        """
        vec = np.zeros((max_chars, self.n_chars))
        for i, char in enumerate(word):
            vec[i][char2int[char]] = 1
        vec[len(word):, char2int[' ']] = 1
        return vec

    def char2vec(self, char):
        """Return the one-hot vector of length ``n_chars`` for ``char``."""
        vec = np.zeros((self.n_chars,))
        vec[char2int[char]] = 1
        return vec

    def encond_input_output(self, root_word, target_word):
        """Encode a (root, target) pair.

        Returns:
            ``(root_encoded, target_encoded, target_in_encoded)`` where the
            target is the word followed by ``'&'`` and the target input is
            that same sequence with an additional leading ``'&'``.
        """
        root_chars = list(root_word)
        target_chars = list(target_word) + ['&']
        target_in_chars = ['&'] + target_chars
        root_encoded = self.word2vec(root_chars, self.max_root_len)
        target_encoded = self.word2vec(target_chars, self.max_output_len)
        target_in_encoded = self.word2vec(target_in_chars, self.max_output_len)
        return root_encoded, target_encoded, target_in_encoded

    def encond_rev_input_output(self, root_word, target_word):
        """Encode a (root, target) pair without the target-input sequence.

        Returns:
            ``(root_encoded, target_encoded)``; the target is the word
            followed by ``'&'``.
        """
        root_encoded = self.word2vec(list(root_word), self.max_root_len)
        target_encoded = self.word2vec(list(target_word) + ['&'], self.max_output_len)
        return root_encoded, target_encoded

    def one_hot_decode(self, vec):
        """Map every row of ``vec`` to the character of its arg-max index."""
        return [int2char[np.argmax(v)] for v in vec]

    def word_sim(self, word1, word2):
        """Fraction of positions of ``word1`` holding the same symbol in ``word2``.

        Positions beyond the end of ``word2`` count as mismatches, and an
        empty ``word1`` gives ``0.0`` (both cases used to raise).
        """
        if len(word1) == 0:
            return 0.0
        matches = sum(1 for a, b in zip(word1, word2) if a == b)
        return matches / len(word1)

    # ------------------------------------------------------------------
    # Whole-dataset access
    # ------------------------------------------------------------------

    def _collect_samples(self, n):
        """Encode samples from the start of the data, stopping after ``n``.

        If ``n`` never matches the running count (for example ``n <= 0``)
        every sample is encoded.

        Returns:
            Four lists: encoded roots, encoded target inputs, feature rows and
            encoded targets.
        """
        rootX, target_inX, featX, y = [], [], [], []
        for i in range(len(self.words)):
            root_encoded, target_encoded, target_in_encoded = \
                self.encond_input_output(self.roots[i], self.words[i])
            rootX.append(root_encoded)
            target_inX.append(target_in_encoded)
            featX.append(self.featArray[i])
            y.append(target_encoded)
            if i + 1 == n:
                break
        return rootX, target_inX, featX, y

    def get_dataset(self, n=100):
        """Return the first ``n`` encoded samples as four numpy arrays.

        Returns:
            ``(roots, target_inputs, features, targets)``.
        """
        rootX, target_inX, featX, y = self._collect_samples(n)
        return np.array(rootX), np.array(target_inX), np.array(featX), np.array(y)

    def get_dataset2(self, batch_szie, n=100):
        """Like ``get_dataset`` but with the roots reshaped to ``batch_szie`` rows.

        Args:
            batch_szie: Leading dimension of the returned root array (the
                misspelt name is kept so existing keyword callers keep
                working). The body previously read an undefined
                ``batch_size`` name instead of this parameter.
            n: Number of samples to encode.

        Returns:
            ``(roots, target_inputs, features, targets)`` with ``roots``
            shaped ``(batch_szie, max_root_len, n_chars)``.
        """
        rootX, target_inX, featX, y = self._collect_samples(n)
        # The per-sample shape comes from word2vec; it replaces the former
        # hard-coded (15, 28), which only fits one particular dataset.
        roots = np.array(rootX).reshape((batch_szie, self.max_root_len, self.n_chars))
        return roots, np.array(target_inX), np.array(featX), np.array(y)
        

In [3]:
# dg = DataGen(reverse=True)

[3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [4]:
# gen = dg.gen_rev(batch_size=8, n_batches=-1, trainset=True)

In [5]:
# [x1, x2], y = next(gen)

In [6]:
# y[0]