In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('..')

In [2]:
# Directory and file name of the sketch-rnn airplane archive.
dataset_dir = "../data/sketchrnn/"
dataset_name = "sketchrnn_airplane.npz"

# allow_pickle=True unpickles objects stored in the archive, so only point this
# at an archive that comes from a trusted source.
dict_data = np.load(
    f"{dataset_dir}{dataset_name}",
    allow_pickle=True,
    encoding="latin1",
)

# Show which splits the archive provides.
print(dict_data)

NpzFile '../data/sketchrnn/sketchrnn_airplane.npz' with keys: test, train, valid


In [3]:
import utils

# Training split: a 1-D array holding one entry per sketch (70000 of them).
# Each entry is its own array, and the sequence length differs per sketch.
train = dict_data["train"]
print(train.shape)

(70000,)


In [4]:
# Shapes of the first three sketches: the number of points differs for each one.
print(*(train[i].shape for i in range(3)))

(33, 3) (56, 3) (39, 3)


In [5]:
# Second point of the first sketch: (delta x, delta y, pen state).
first_sketch = train[0]
first_sketch[1]

array([-52,  -4,   0], dtype=int16)

In [6]:
# Demonstration: two arrays whose trailing dimensions differ cannot be stacked
# into one regular ndarray. This is the same situation as the variable-length
# sketches above.
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])        # shape (3, 4)
b = np.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])  # shape (3, 3)

c = [a, b]

# The failure is the point of this cell, so catch and display it instead of
# letting the exception abort a "Restart & Run All".
try:
    print(np.array(c).shape)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 3) + inhomogeneous part.

In [7]:
import dezero
from dezero import cuda
from typing import List, Optional, Tuple, Any
import math

class StrokesDataset(dezero.DataLoader):
    """Data loader that converts raw sketches into padded 5-column stroke tensors.

    Each input sequence has three columns (delta x, delta y, pen state). Every
    kept sequence is turned into a ``(longest_seq_len + 2, 5)`` float32 array
    with the columns ``[dx / scale, dy / scale, pen down, pen up, end of sequence]``
    plus a mask of length ``longest_seq_len + 1``. ``self.dataset`` stores the
    ``[strokes, mask]`` pairs.

    Args:
        data: iterable of per-sketch arrays with three columns.
        batch_size: number of sketches per batch.
        max_seq_length: sequences longer than this are dropped.
        scale: divisor applied to the x/y offsets. When ``None`` it is the
            standard deviation of all kept x/y offsets (the mean is not removed).
        shuffle: stored as ``self.shuffle`` for the base loader.
        gpu: when true the padded arrays are allocated with ``cuda.cupy``.

    Raises:
        ValueError: if no sequence has a length in ``(10, max_seq_length]``.
    """

    def __init__(self, data, batch_size, max_seq_length: int, scale: Optional[float] = None, shuffle=True, gpu=False):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.gpu = gpu

        xp = cuda.cupy if self.gpu else np

        stroke_data = []
        for seq in data:
            # Sequences with 10 or fewer points are deemed too short and are
            # skipped, as are sequences longer than max_seq_length.
            if 10 < len(seq) <= max_seq_length:
                # Clamp the values to [-1000, 1000] before converting to float32.
                seq = np.clip(seq, -1000, 1000)
                stroke_data.append(np.array(seq, dtype=np.float32))

        if not stroke_data:
            raise ValueError(
                "no sequence has a length in (10, max_seq_length]; nothing to load"
            )

        if scale is None:
            # The scale factor is the standard deviation of the x and y offsets
            # (columns 0 and 1); the mean is not adjusted for simplicity.
            scale = np.std(np.concatenate([np.ravel(s[:, 0:2]) for s in stroke_data]))
        # Kept so another split can be normalised with the same factor.
        self.scale = scale

        n_sketches = len(stroke_data)
        longest_seq_len = max(len(seq) for seq in stroke_data)

        # Two extra rows: one for the start-of-sequence token and one for the
        # end-of-sequence token. The 3 raw columns become 5 because the pen
        # state is expanded into a one-hot (pen down, pen up, end of sequence).
        padded_strokes = xp.zeros((n_sketches, longest_seq_len + 2, 5), dtype=np.float32)

        # The mask marks which steps of a sequence are real (1) and which are
        # padding (0), e.g. [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0].
        padded_mask = xp.zeros((n_sketches, longest_seq_len + 1))

        for i, seq in enumerate(stroke_data):
            seq = xp.array(seq, dtype=xp.float32)
            len_seq = len(seq)

            # Row 0 is left for the start-of-sequence token, so data starts at row 1.
            padded_strokes[i, 1:len_seq + 1, 0:2] = seq[:, :2] / scale  # x and y offsets
            padded_strokes[i, 1:len_seq + 1, 2] = 1 - seq[:, 2]  # pen down
            padded_strokes[i, 1:len_seq + 1, 3] = seq[:, 2]  # pen up
            # NOTE(review): only the single row right after the data carries the
            # end-of-sequence flag; later padding rows stay all-zero. Confirm
            # that this is the intended padding.
            padded_strokes[i, len_seq + 1, 4] = 1  # end-of-sequence token
            padded_mask[i, :len_seq + 1] = 1  # mask is on until the end of the sequence

        # Start of sequence is [0, 0, 1, 0, 0].
        padded_strokes[:, 0, 2] = 1

        self.dataset = [[padded_strokes[i], padded_mask[i]] for i in range(n_sketches)]

        # Size the loader from the sequences that were actually kept. Using
        # len(data) would over-count whenever the length filter drops a sketch.
        self.data_size = len(self.dataset)
        self.max_iter = math.ceil(self.data_size / batch_size)

        self.reset()


In [8]:
# Small, unshuffled CPU batches so the first batch is the first four kept sketches.
strokes = StrokesDataset(
    train,
    batch_size=4,
    max_seq_length=200,
    shuffle=False,
    gpu=False,
)

In [9]:
print(strokes.data_size)

# Fetch the first batch: x holds the padded strokes and t the matching mask
# (x has one more time step than t).
x, t = next(strokes)
print(x.shape, t.shape)

# Check that the mask is consistent with the strokes: for every sketch the
# first masked-out step must be the step carrying the end-of-sequence flag.
batch_size = x.shape[0]

for i in range(batch_size):
    mask_zero_id = np.where(t[i] == 0)[0]
    # The longest sequence fills its mask completely, so there is no zero entry;
    # its padding would start right after the mask, at index t.shape[1].
    first_id = mask_zero_id[0] if mask_zero_id.size else t.shape[1]
    first_stroke_end_id = np.where(x[i, :, 4] == 1)[0][0]

    print(first_id, first_stroke_end_id)
    assert first_id == first_stroke_end_id, f"mask and end-of-sequence disagree for item {i}"


70000
(4, 101, 5) (4, 100)
34 34
57 57
40 40
86 86


In [10]:
import dezero.functions as F

# Following the reference implementations linked below,
# the mixture model uses 20 distributions:
# https://github.com/Shun14/sketch-rnn-kanji
# https://nn.labml.ai/sketch_rnn/index.html

# This is for getting the loss of delta_x and delta_y
class BivariateGaussianMixture:
    """Mixture of bivariate Gaussians over the pen offsets (delta x, delta y).

    The last axis of every parameter indexes the mixture components
    (``n_distributions`` reads ``pi_logits.shape[-1]``).

    Args:
        pi_logits: unnormalised mixture weights; a softmax over the last axis
            turns them into probabilities inside ``get_lossfunc``.
        mu_x, mu_y: component means.
        sigma_x, sigma_y: component standard deviations (must be positive).
        rho_xy: component correlations (must lie strictly between -1 and 1).
    """

    def __init__(self, pi_logits, mu_x, mu_y, sigma_x, sigma_y, rho_xy):
        self.pi_logits = pi_logits
        self.mu_x = mu_x
        self.mu_y = mu_y
        self.sigma_x = sigma_x
        self.sigma_y = sigma_y
        self.rho_xy = rho_xy

    @property
    def n_distributions(self):
        """Number of mixture components (size of the last axis of ``pi_logits``)."""
        return self.pi_logits.shape[-1]

    def set_temperature(self, temperature: float):
        """Divide the logits by ``temperature`` and scale both sigmas by its square root.

        New objects are bound to the attributes, so arrays handed to the
        constructor are never modified in place.
        """
        root = math.sqrt(temperature)
        self.pi_logits = self.pi_logits / temperature
        self.sigma_x = self.sigma_x * root
        self.sigma_y = self.sigma_y * root

    def gaussian_pdf(self, x_delta, y_delta):
        """Density of (x_delta, y_delta) under every mixture component.

        Follows https://github.com/hardmaru/write-rnn-tensorflow/blob/master/model.py:
            z = (dx / s1)^2 + (dy / s2)^2 - 2 * rho * dx * dy / (s1 * s2)
            pdf = exp(-z / (2 * (1 - rho^2))) / (2 * pi * s1 * s2 * sqrt(1 - rho^2))

        The constants are plain Python scalars, so no array has to be
        allocated for them on a particular device.
        """
        norm_x = F.sub(x_delta, self.mu_x)
        norm_y = F.sub(y_delta, self.mu_y)
        s1s2 = F.mul(self.sigma_x, self.sigma_y)

        z_x = F.pow(F.div(norm_x, self.sigma_x), 2)
        z_y = F.pow(F.div(norm_y, self.sigma_y), 2)
        cross = F.div(F.mul(self.rho_xy, F.mul(norm_x, norm_y)), s1s2)
        z = F.add(F.add(z_x, z_y), F.mul(cross, -2.0))

        one_minus_rho_sq = 1.0 - F.pow(self.rho_xy, 2)

        numerator = F.exp(F.div(-z, 2 * one_minus_rho_sq))
        denominator = F.mul(F.mul(s1s2, F.pow(one_minus_rho_sq, 0.5)), 2 * math.pi)

        return F.div(numerator, denominator)

    # x_delta and y_delta are the real x and y offsets of the stroke
    def get_lossfunc(self, x_delta, y_delta):
        """Mean negative log-likelihood of the offsets under the mixture."""
        # Keeps the argument of the log strictly positive.
        epsilon = 1e-20

        pdf = self.gaussian_pdf(x_delta, y_delta)

        # The components live on the last axis (see n_distributions).
        mixture_axis = pdf.ndim - 1

        # The stored values are logits, so normalise them into mixture weights
        # before weighting the component densities.
        pi = F.softmax(self.pi_logits, axis=mixture_axis)

        likelihood = F.sum(F.mul(pdf, pi), axis=mixture_axis, keepdims=True)
        nll = -F.log(likelihood + epsilon)

        return F.mean(nll)

        

In [None]:
import dezero.models as M
import dezero.layers as L

class EncoderRNN(M.Model):
    """Encoder that maps a stroke input to a sampled latent vector.

    Args:
        n_input: number of features per stroke point.
        n_hidden: hidden size handed to the LSTM.
        n_output: size of the latent vector.
    """

    def __init__(self, n_input, n_hidden, n_output):
        super().__init__()
        self.n_hidden = n_hidden
        # NOTE(review): the layer arguments follow the (input, output) order
        # used in the original code. Upstream DeZero declares
        # LSTM(hidden_size, in_size=None) and Linear(out_size, nobias, dtype,
        # in_size); confirm the order against the local dezero package.
        self.lstm = L.LSTM(n_input, n_hidden)
        self.mu_head = L.Linear(2 * n_hidden, n_output)
        self.sigma_head = L.Linear(2 * n_hidden, n_output)

    def reset_state(self):
        """Reset the recurrent state of the LSTM."""
        self.lstm.reset_state()

    def forward(self, x):
        """Return ``(z, mu, sigma)``.

        ``sigma`` is ``exp(sigma_hat / 2)`` and ``z`` is drawn as
        ``mu + sigma * noise`` with standard-normal noise.
        """
        h = self.lstm(x)

        mu = self.mu_head(h)
        sigma_hat = self.sigma_head(h)
        sigma = F.exp(F.div(sigma_hat, 2))

        xp = cuda.get_array_module(mu)
        noise = xp.random.normal(0, 1, mu.shape)

        # Reparameterization trick: the randomness is kept outside the
        # parameters so gradients can flow through mu and sigma.
        z = mu + sigma * noise

        return z, mu, sigma
        

In [1]:
class DecoderRNN(M.Model):
    """Decoder that predicts pen-state log-probabilities and a stroke mixture.

    Args:
        d_z: size of the latent vector.
        n_hidden: hidden size handed to the LSTM.
        n_distributions: number of components in the stroke mixture.
    """

    def __init__(self, d_z: int, n_hidden: int, n_distributions: int):
        super().__init__()
        # NOTE(review): the layer arguments follow the (input, output) order
        # used in the original code. Upstream DeZero declares
        # LSTM(hidden_size, in_size=None) and Linear(out_size, nobias, dtype,
        # in_size); confirm the order against the local dezero package.
        self.lstm = L.LSTM(d_z + 5, n_hidden)
        # The initial state of the lstm is [h0; c0] = tanh(Wz * z + bz), so this
        # layer has to produce n_hidden values for h0 plus n_hidden for c0.
        self.init_state = L.Linear(d_z, 2 * n_hidden)
        # Six parameter groups per mixture component:
        # pi logit, mu_x, mu_y, sigma_x, sigma_y, rho_xy.
        self.mixtures = L.Linear(n_hidden, 6 * n_distributions)
        self.q_head = L.Linear(n_hidden, 3)  # logits for q1, q2 and q3

        self.n_distributions = n_distributions
        self.n_hidden = n_hidden

    def forward(self, x, z, state=None):
        """Return ``(q_logits, mixture, state)`` for the input ``x``.

        When ``state`` is ``None`` the initial ``(h, c)`` pair is derived
        from ``z``.
        """
        if state is None:
            hc = F.tanh(self.init_state(z))
            # First half of the features is h, second half is c.
            h = hc[:, :self.n_hidden]
            c = hc[:, self.n_hidden:]
            # Flatten h and c to 1-D, as the original code intended.
            # NOTE(review): confirm the state shape that self.lstm expects.
            state = (F.reshape(h, (-1,)), F.reshape(c, (-1,)))

        # NOTE(review): assumes self.lstm accepts a state and returns
        # (outputs, state); confirm against the local dezero LSTM.
        outputs, state = self.lstm(x, state)

        # A fresh LogSoftmax is created for every call rather than sharing one
        # Function instance between calls.
        q_logits = F.LogSoftmax(-1)(self.q_head(outputs))

        # Cut the last axis into six consecutive groups of n_distributions values.
        mix = self.mixtures(outputs)
        n = self.n_distributions
        pi_logits, mu_x, mu_y, sigma_x, sigma_y, rho_xy = (
            mix[:, :, i * n:(i + 1) * n] for i in range(6)
        )

        # exp keeps the standard deviations positive and tanh keeps the
        # correlation inside (-1, 1), which the Gaussian density requires.
        BGM = BivariateGaussianMixture(
            pi_logits, mu_x, mu_y, F.exp(sigma_x), F.exp(sigma_y), F.tanh(rho_xy)
        )

        return q_logits, BGM, state

NameError: name 'M' is not defined

In [None]:

class ReconstructionLoss(M.Model):
    """Stroke likelihood loss plus pen-state loss."""

    def forward(self, mask, target, bgm, q_logits):
        """Return the sum of the masked stroke loss and the pen-state loss.

        Args:
            mask: per-step weights; padded steps carry 0 so they do not
                contribute to the stroke loss.
                Assumed to be (batch, seq) - TODO confirm.
            target: 3-dimensional array whose last axis holds
                [dx, dy, pen down, pen up, end of sequence].
            bgm: BivariateGaussianMixture predicted for the same steps.
            q_logits: log-probabilities of the three pen states.
        """
        # Keeps the argument of the log strictly positive.
        epsilon = 1e-20

        # Keep a trailing axis of size 1 so the offsets broadcast against the
        # mixture parameters along the component axis.
        x = target[:, :, 0:1]
        y = target[:, :, 1:2]

        pdf = bgm.gaussian_pdf(x, y)
        mixture_axis = pdf.ndim - 1
        # pi_logits are unnormalised; softmax turns them into mixture weights.
        pi = F.softmax(bgm.pi_logits, axis=mixture_axis)
        probs = F.sum(F.mul(pdf, pi), axis=mixture_axis)

        # Multiply by the mask so that padding steps add nothing to the loss.
        loss_stroke = -F.mean(F.mul(F.log(probs + epsilon), mask))

        loss_pen = -F.mean(F.mul(q_logits, target[:, :, 2:]))

        return F.add(loss_stroke, loss_pen)
        

In [None]:
class KLDivergenceLoss(M.Model):
    """KL divergence between N(mu, sigma^2) and the standard normal."""

    def forward(self, mu, sigma):
        """Return ``-0.5 * mean(1 + log(sigma^2) - mu^2 - sigma^2)``.

        Args:
            mu: mean of the latent distribution.
            sigma: standard deviation of the latent distribution, i.e. the
                value EncoderRNN.forward returns (``exp(sigma_hat / 2)``),
                not the log-variance.
        """
        # log(sigma^2) written as 2 * log(sigma).
        log_var = F.mul(F.log(sigma), 2.0)
        inner = 1.0 + log_var - F.pow(mu, 2) - F.pow(sigma, 2)
        return F.mean(inner) * -0.5
        