# 哲賢

In [30]:
import pickle, numpy

In [None]:
# Load the two pickled vocabulary mappings and print the first ten entries of
# the first one, each followed by its lookup in the second.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
with open('data/i2w.pickle', 'rb') as handle:
	a = pickle.load(handle)  # presumably index -> word, judging by the file name; TODO confirm
with open('data/w2i.pickle', 'rb') as handle:
	b = pickle.load(handle)  # presumably word -> index, judging by the file name; TODO confirm
for i in range(10):
	print(a[i])  # entry stored under key/index i
	print(b[a[i]])  # that entry looked up in the second mapping

In [31]:
# Read the saved weight array from disk.
weights = numpy.load('data/weights.npy')

In [32]:
# Echo the array so the notebook shows its (abbreviated) repr.
weights

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.46295956,  0.60897344, -0.09038982, ..., -0.14422344,
        -0.447958  , -0.10773784],
       [ 0.40195522,  0.16258185,  0.3913999 , ..., -0.38018417,
        -0.29988715,  0.00189703],
       ...,
       [ 0.11018498, -0.08562936,  0.04963986, ...,  0.05504006,
        -0.0210738 , -0.12578586],
       [-0.03119463,  0.03227964, -0.00795191, ...,  0.01224131,
         0.07899259,  0.03871416],
       [ 0.14568943,  0.04103405,  0.12413269, ..., -0.08994981,
         0.13046025, -0.08646748]], dtype=float32)

# 克安 playground

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import random
from scipy.special import expit

In [68]:
# Playground: create an nn.Embedding sized after the saved word vectors, then
# replace its weight with those vectors and inspect it before and after.
pretrained = np.load('word_vectors.npy')
embedding = nn.Embedding(num_embeddings=pretrained.shape[0], embedding_dim=pretrained.shape[1])
print(embedding.weight)  # weight as initialised by nn.Embedding, before the swap
print(embedding.weight.requires_grad)
# Assigning a fresh nn.Parameter replaces the weight outright.
embedding.weight = nn.Parameter(torch.Tensor(pretrained))
print(embedding.weight)  # weight built from the contents of word_vectors.npy
print(embedding.weight.requires_grad)  # check that the replaced weight is still trainable

Parameter containing:
-6.1635e-01 -8.1987e-01 -8.3256e-01  ...  -4.8794e-01 -4.8985e-01  9.1782e-01
-1.5929e+00  2.6971e-01 -1.0009e+00  ...   6.2423e-01 -3.2411e-01 -1.0822e+00
 8.9374e-01 -7.2405e-01 -1.5573e+00  ...   1.2894e+00 -2.2827e+00  4.2213e-01
                ...                   ⋱                   ...                
 1.8184e-01 -6.3451e-01  8.9738e-01  ...   1.0519e+00  1.3843e+00 -6.2459e-01
 5.0422e-01  5.3824e-01 -1.3206e+00  ...  -2.7180e-02 -3.1137e-01 -2.0636e-02
-1.0055e+00 -3.4340e-01  7.1548e-01  ...   5.6963e-01  7.4671e-01  7.3272e-01
[torch.FloatTensor of size 44089x100]

True
Parameter containing:
-2.3829e-01 -7.4225e-02 -3.9740e-02  ...   8.0199e-03  1.3803e-01  2.7434e-01
 3.6569e-01  2.5967e-01 -8.3636e-01  ...  -3.8568e-01  3.5039e-01  2.5933e-01
 3.9887e-01  2.1847e-01 -1.7002e-01  ...  -2.9260e-01  3.8296e-01  5.8763e-02
                ...                   ⋱                   ...                
-5.1088e-02  1.8064e-01  1.0946e-01  ...   1.0074e-01 

In [63]:
# Re-check whether the replaced embedding weight takes part in gradient updates.
embedding.weight.requires_grad

True

In [2]:
class AttentionLayer(nn.Module):
    """Scores every encoder output against the decoder state and returns their weighted sum.

    The score of one encoder position is ``to_weight(match([encoder_output; hidden]))``;
    the scores of a sequence are normalised with a softmax over the sequence axis.

    NOTE(review): no nonlinearity sits between ``match`` and ``to_weight``, so the
    score is a purely linear function of the concatenated input. Additive attention
    usually applies ``tanh`` at that point. This is left unchanged so that weights
    trained with this layer keep their meaning.
    """

    def __init__(self, hidden_size):
        """
        Arguments:
            hidden_size {int} -- feature size shared by the decoder state and the encoder outputs
        """
        super(AttentionLayer, self).__init__()

        self.hidden_size = hidden_size
        self.match = nn.Linear(2*hidden_size, hidden_size)
        self.to_weight = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden_state, encoder_outputs):
        """
        Arguments:
            (decoder current) hidden_state {Variable} -- (num_layers, batch, hidden_size);
                any layout holding exactly batch * hidden_size values is reshaped as before,
                a stacked multi-layer state contributes its last (top) layer only
            encoder_outputs {Variable} -- (batch, seq_len, hidden_size)
        Returns:
            Variable -- context vector of size (batch, hidden_size)
        """
        batch_size, seq_len, feat_n = encoder_outputs.size()

        # A multi-layer GRU hands over one state per layer. The plain reshape below only
        # works for a single layer, so pick the top layer whenever the element count
        # shows that more than one is present.
        if hidden_state.numel() != batch_size * feat_n:
            hidden_state = hidden_state[hidden_state.size(0) - 1]

        # Pair the decoder state with every encoder position. expand() broadcasts
        # without copying; torch.cat materialises the result anyway.
        query = hidden_state.contiguous().view(batch_size, 1, feat_n)
        query = query.expand(batch_size, seq_len, feat_n)

        matching_inputs = torch.cat((encoder_outputs, query), 2).view(-1, 2*self.hidden_size)

        # One scalar score per (batch, position), normalised over the positions.
        attention_weights = self.to_weight(self.match(matching_inputs))
        attention_weights = attention_weights.view(batch_size, seq_len)
        attention_weights = F.softmax(attention_weights, dim=1)

        # (batch, 1, seq_len) x (batch, seq_len, hidden) -> (batch, 1, hidden) -> (batch, hidden)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context


class EncoderRNN(nn.Module):
    """GRU encoder whose embedding table is initialised from a saved word-vector array."""

    def __init__(self, word_vec_filepath='word_vectors.npy', hidden_size=1024, num_layers=1):
        """
        param word_vec_filepath: .npy file holding the word vectors; the array is indexed as
                                 [vocabulary entry, vector component]
        param hidden_size: size of the GRU hidden state
        param num_layers: number of stacked GRU layers
        """
        super().__init__()

        self.hidden_size = hidden_size

        word_vectors = np.load(word_vec_filepath)
        self.vocab_size = word_vectors.shape[0]
        self.word_vec_dim = word_vectors.shape[1]

        # Build the table first, then overwrite its weight with the saved vectors.
        # Wrapping them in nn.Parameter keeps them trainable.
        # TODO: can let encoder and decoder share embeddings
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.word_vec_dim,
        )
        self.embedding.weight = nn.Parameter(torch.Tensor(word_vectors))

        # The GRU consumes the embedded words; batch_first puts the batch axis in front.
        self.gru = nn.GRU(
            input_size=self.word_vec_dim,
            hidden_size=self.hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, input):
        """
        param input: word indices with shape (batch size, sequence_length)
        return: (outputs, final_hidden) exactly as produced by the GRU:
                outputs      -- (batch, seq_len, hidden_size), top layer at every time step
                final_hidden -- (num_layers, batch, hidden_size), every layer at the last time step
        """
        embedded = self.embedding(input)  # (batch, seq_len, word_vec_dim)
        outputs, final_hidden = self.gru(embedded)
        return outputs, final_hidden

class DecoderRNN(nn.Module):
    """Attention GRU decoder that produces one vocabulary score vector per time step.

    Every step embeds the previous word, concatenates it with an attention context
    computed over the encoder outputs, advances the GRU by a single time step and
    projects the GRU output onto the vocabulary.
    """

    def __init__(self, word_vec_filepath='word_vectors.npy', hidden_size=1024, num_layers=1):
        """
        :param word_vec_filepath: .npy file with the word vectors, indexed as [vocabulary entry, component]
        :param hidden_size: size of the GRU output and hidden state
        :param num_layers: number of stacked GRU layers
        """
        super(DecoderRNN, self).__init__()

        # define hyper parameters
        self.hidden_size = hidden_size # size of gru's Y and H

        # load pretrained embedding
        pretrained = np.load(word_vec_filepath)
        self.vocab_size = pretrained.shape[0]
        self.word_vec_dim = pretrained.shape[1]

        embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.word_vec_dim)
        embedding.weight = nn.Parameter(torch.Tensor(pretrained)) # requires_grad == True
        self.embedding = embedding # TODO: can let encoder and decoder share embeddings

        # gru input is the word vector of the previous word plus the attention context vector
        self.gru = nn.GRU(self.word_vec_dim+self.hidden_size, hidden_size=self.hidden_size, num_layers=num_layers, batch_first=True)
        self.attention = AttentionLayer(self.hidden_size)
        # one score per vocabulary entry
        self.to_final_output = nn.Linear(self.hidden_size, self.vocab_size)

    def forward(self, encoder_last_hidden_state, encoder_output, targets=None, mode='train', steps=None):
        """
        Decode with scheduled sampling against the ground-truth sentence.

        :param encoder_last_hidden_state: (num_layers * num_directions, batch, hidden_size)
        :param encoder_output: (batch, length_prev_sentences, hidden_size * num_directions)
        :param targets: (batch, length_curr_sentences) word indices of the ground truth sentences;
                        only used to pick step inputs, the loss is computed by the caller
        :param mode: accepted for interface compatibility; not read here
        :param steps: training progress counter that drives scheduled sampling, unrelated to RNN time steps
        :return: (seq_logProb, seq_predictions)
                 seq_logProb     -- (batch, length_curr_sentences - 1, vocab_size) raw scores of the
                                    output projection (no log-softmax is applied)
                 seq_predictions -- (batch, length_curr_sentences - 1) index of the best score per step
        :raises NotImplementedError: if targets or steps is None
        """
        # Validate before doing any work.
        if targets is None:
            raise NotImplementedError('Training target is None. Error location: RNNDecoder')
        if steps is None:
            raise NotImplementedError('steps is not specified. Error location: RNNDecoder -> steps')

        hidden_state = encoder_last_hidden_state
        previous_word = self._initial_input_word(encoder_last_hidden_state)

        target_embeddings = self.embedding(targets) # (batch, max_seq_len, embedding_size)
        _, seq_len, _ = target_embeddings.size()

        # The threshold depends on `steps` only, so compute it once per call.
        threshold = self._get_teacher_learning_ratio(training_steps=steps)

        seq_logProb = []
        # The last target word (EOS) is never fed as an input, hence seq_len - 1 steps.
        for i in range(seq_len-1):
            # Feed the ground truth word when the draw beats the threshold, otherwise
            # feed the word the model predicted at the previous step.
            if random.uniform(0.05, 0.995) > threshold:
                word_embedding = target_embeddings[:, i] # (batch, embedding_size)
            else:
                word_embedding = self.embedding(previous_word.view(-1)) # (batch, embedding_size)

            scores, hidden_state = self._decode_step(word_embedding, hidden_state, encoder_output)
            seq_logProb.append(scores)
            previous_word = scores.max(1)[1]

        seq_logProb = torch.stack(seq_logProb, dim=1) # (batch, seq_len - 1, vocab_size)
        seq_predictions = seq_logProb.max(2)[1]

        return seq_logProb, seq_predictions

    def infer(self, encoder_last_hidden_state, encoder_output, max_len=28):
        """
        Greedy decoding: same loop as forward(), but every step is fed the model's own previous prediction.

        :param encoder_last_hidden_state: (num_layers * num_directions, batch, hidden_size)
        :param encoder_output: (batch, length_prev_sentences, hidden_size * num_directions)
        :param max_len: assumed sentence length; max_len - 1 decoding steps are run (default 28, i.e. 27 steps)
        :return: (seq_logProb, seq_predictions) with max_len - 1 time steps, laid out as in forward()
        :raises ValueError: if max_len is smaller than 2, which would leave no step to run
        """
        if max_len < 2:
            raise ValueError('max_len must be at least 2, got {}'.format(max_len))

        hidden_state = encoder_last_hidden_state
        previous_word = self._initial_input_word(encoder_last_hidden_state)

        seq_logProb = []
        for _ in range(max_len-1):
            word_embedding = self.embedding(previous_word.view(-1))
            scores, hidden_state = self._decode_step(word_embedding, hidden_state, encoder_output)
            seq_logProb.append(scores)
            previous_word = scores.max(1)[1]

        seq_logProb = torch.stack(seq_logProb, dim=1)
        seq_predictions = seq_logProb.max(2)[1]

        return seq_logProb, seq_predictions

    def _initial_input_word(self, encoder_last_hidden_state):
        """Return a (batch, 1) LongTensor filled with index 1 (<SOS>), placed on the device of the encoder state."""
        _, batch_size, _ = encoder_last_hidden_state.size()
        start_word = Variable(torch.ones(batch_size, 1)).long()
        # Follow the data instead of global CUDA availability: a CPU model on a
        # machine that has a GPU must keep its inputs on the CPU.
        if encoder_last_hidden_state.is_cuda:
            start_word = start_word.cuda(encoder_last_hidden_state.get_device())
        return start_word

    def _decode_step(self, word_embedding, hidden_state, encoder_output):
        """Run one decoder time step; returns ((batch, vocab_size) scores, updated hidden state)."""
        # weighted sum of the encoder outputs w.r.t. the current hidden state
        context = self.attention(hidden_state, encoder_output) # (batch, hidden_size)
        # (batch, 1, embedding_size + hidden_size): a sequence of length one
        gru_input = torch.cat([word_embedding, context], dim=1).unsqueeze(1)
        gru_output, hidden_state = self.gru(gru_input, hidden_state)
        scores = self.to_final_output(gru_output.squeeze(1))
        return scores, hidden_state

    def _get_teacher_learning_ratio(self, training_steps):
        """
        Scheduled-sampling threshold: sigmoid(training_steps / 40 + 0.85).

        forward() feeds the ground truth word only when a uniform draw from
        [0.05, 0.995] exceeds this value, so ground-truth feeding becomes rarer as
        training_steps grows.
        """
        return (expit(training_steps/40 +0.85))



class VideoCaptionGenerator(nn.Module):
    """Couples an encoder with a decoder and dispatches on the requested mode."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, avi_feats, mode, target_sentences=None, steps=None):
        """
        Args:
            avi_feats: input handed to the encoder unchanged
            mode: 'train' calls the decoder itself, 'inference' calls decoder.infer
            target_sentences: ground truth for training, None for inference
            steps: forwarded to the decoder in 'train' mode
        Returns:
            seq_logProb
            seq_predictions
        Raises:
            KeyError: for any other mode (raised after the encoder has run)
        """
        # The encoder always runs first, whatever the mode.
        encoder_outputs, encoder_last_hidden_state = self.encoder(avi_feats)

        if mode not in ('train', 'inference'):
            raise KeyError('mode is not valid')

        if mode == 'inference':
            decoded = self.decoder.infer(
                encoder_last_hidden_state=encoder_last_hidden_state,
                encoder_output=encoder_outputs,
            )
        else:
            decoded = self.decoder(
                encoder_last_hidden_state=encoder_last_hidden_state,
                encoder_output=encoder_outputs,
                targets=target_sentences,
                mode=mode,
                steps=steps,
            )

        seq_logProb, seq_predictions = decoded
        return seq_logProb, seq_predictions

### my lil sandbox

In [None]:
from dataset import TrainingDataset, collate_fn
from vocabulary import Vocabulary
from torch.utils.data import DataLoader

# Build the vocabulary helper, the dataset and a shuffled DataLoader over the conversation corpus.
training_data_path='data/clr_conversation.txt'
helper = Vocabulary(training_data_path)
dataset = TrainingDataset(training_data_path, helper)
# Batches of 128 in shuffled order, loaded by 8 worker processes and assembled by the project's collate_fn.
dataloader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=8, collate_fn=collate_fn)

In [3]:
# Instantiate both networks with their defaults ('word_vectors.npy', hidden_size=1024, num_layers=1).
encoder = EncoderRNN()
decoder = DecoderRNN()

In [8]:
# Smoke test: push one batch through encoder and decoder and print what the decoder returns.
encoder.train()  # NOTE(review): only the encoder is switched to train mode; decoder.train() is not called
for batch_idx, batch in enumerate(dataloader):
    # assumes collate_fn yields (previous sentences, current sentences, current-sentence lengths); TODO confirm
    padded_prev_sentences, padded_curr_sentences, lengths_curr_sentences = batch
    padded_prev_sentences, padded_curr_sentences = Variable(padded_prev_sentences), Variable(padded_curr_sentences)
    
    top_layer_output, last_time_step_all_layers_output = encoder(padded_prev_sentences)
    
    # steps=1 is a fixed scheduled-sampling counter for this experiment
    seq_logProb, seq_predictions = decoder(
        encoder_last_hidden_state = last_time_step_all_layers_output,
        encoder_output = top_layer_output,
        targets = padded_curr_sentences,
        steps=1
    )
    print(seq_logProb)
    print()
    print(seq_predictions)
    break  # only the first batch is needed

Variable containing:
(  0  ,.,.) = 
  4.0636e-03  1.4534e-02  4.3411e-02  ...   3.8113e-02  1.5125e-02  7.4324e-02
  3.1995e-03 -2.8838e-02 -4.2324e-02  ...   5.1349e-02  2.0716e-02  5.2914e-02
  3.4641e-03 -4.8222e-02 -5.4895e-02  ...   6.4168e-02  3.0389e-02  5.0499e-02
                 ...                   ⋱                   ...                
  2.5436e-02 -6.2421e-02 -5.0598e-02  ...   6.3670e-02  2.6074e-02  1.0488e-01
  4.3065e-02 -8.5981e-02 -1.1462e-01  ...   4.3215e-02  1.1495e-01  1.3081e-01
  3.2917e-02 -8.7677e-02 -2.0065e-01  ...   3.3761e-02  9.9658e-03  9.4516e-02

(  1  ,.,.) = 
 -9.3686e-03 -1.0405e-03  1.4614e-02  ...   8.5765e-03 -1.5679e-02  4.5429e-02
 -4.8277e-03 -5.7344e-03  3.2877e-03  ...   2.4134e-02 -2.3264e-02  3.6407e-02
 -1.1159e-02 -2.0141e-02 -2.1574e-02  ...   3.7705e-02 -3.2930e-02  3.1977e-02
                 ...                   ⋱                   ...                
  2.8472e-02 -7.0078e-02  2.3107e-02  ...  -4.5401e-03 -3.1738e-02  4.1753e-02


In [49]:
# NOTE(review): this cell looks stale relative to the classes defined in this notebook.
# It builds the models with keyword arguments those classes do not accept, and the
# output stored below it does not come from the prints in this cell.
if __name__ == '__main__':
    import logging
    # NOTE(review): `logger` is not defined in any cell visible in this notebook; this
    # line raises NameError unless it was created elsewhere.
    logger.setLevel(logging.INFO)
    from vocabulary import Vocabulary

    json_file = 'data/testing_label.json'
    numpy_file = 'data/testing_data/feat'  # assigned but not used in this cell

    helper = Vocabulary(json_file, min_word_count=5)



    # Random stand-in input of shape (3, 80, 4096); the view() keeps that shape.
    input_data = Variable(torch.randn(3, 80, 4096).view(-1, 80, 4096))

    # NOTE(review): EncoderRNN and DecoderRNN as defined in this notebook take
    # (word_vec_filepath, hidden_size, num_layers); input_size, output_size,
    # vocab_size, word_dim and helper are not parameters of either constructor.
    encoder = EncoderRNN(input_size=4096, hidden_size=1000)
    decoder = DecoderRNN(hidden_size=1000, output_size=1700, vocab_size=1700, word_dim=128, helper=helper)

    model = VideoCaptionGenerator(encoder=encoder, decoder=decoder)

    # torch.rand draws from [0, 1), so truncating to long presumably yields all zeros; TODO confirm intent
    ground_truth = Variable(torch.rand(3, 27)).long()

    # Run the model in train mode for step counters 50..99 and report sizes every tenth step.
    for step in range(50, 100):
        seq_prob, seq_predict = model(avi_feats=input_data, mode='train', target_sentences=ground_truth, steps=step)

        if step % 10 == 0:
            print(seq_prob.size())
            print(seq_predict.size())

Initalizing vocabulary...
Building mapping...
Parsing training data to dataset...
epoch: 1
batch no: 0


Columns 0 to 10 
     1   1861   3430   3200    926     36   5360    926     36      2      0
     1    110    389  26774     32   1107    319      9     32  12447     25
     1    105     32  30827    489    110  29918    343    249   2010   2463

Columns 11 to 12 
     0      0
  5025      2
     2      0
[torch.LongTensor of size 3x13]


[10, 13, 12]


     1    660    201   4941   1208    350    139    574    794    515      2
     1    951      9    256   7091    243    690    319   2794     95      2
     1    619     72    422    187    487  16316    200      2      0      0
[torch.LongTensor of size 3x11]


[11, 11, 9]

['<SOS>', '回去', '睡覺', '派對', '結束', '了', '表演', '結束', '了', '<EOS>', '<PAD>', '<PAD>', '<PAD>']
['<SOS>', '沒有', '任何', '破門而入', '的', '痕跡', '但是', '我', '的', '抽屜', '和', '物品', '<EOS>']
['<SOS>', '他', '的', '軟盤', '上', '沒有', '寫字', '而且', '全都', '一起', '帶著', '<EOS>', '<PAD>']

In [76]:
def left_pad(sentences, pad_value=0):
    """Right-align variable-length sentences in a float array, filling the left side with pad_value.

    Arguments:
        sentences -- sequence of sequences of numbers (word indices)
        pad_value -- filler placed in front of shorter sentences (default 0)
    Returns:
        numpy array of shape (len(sentences), length of the longest sentence);
        shape (0, 0) when no sentences are given
    """
    lengths = [len(sentence) for sentence in sentences]
    width = max(lengths, default=0)
    padded = np.full((len(sentences), width), pad_value, dtype=float)
    for row, sentence in zip(padded, sentences):
        # An explicit start offset is needed: the slice row[-0:] would select the
        # whole row, so an empty sentence must simply leave its row as padding.
        if len(sentence):
            row[width - len(sentence):] = sentence
    return padded


prev_sentences = [[1,2,3,4], [1,2,3,4,5,6,7], [1,2]]
lengths_prev_sentences = [len(sentence) for sentence in prev_sentences]
padded_prev_sentences = left_pad(prev_sentences)

print(padded_prev_sentences)

[[0. 0. 0. 1. 2. 3. 4.]
 [1. 2. 3. 4. 5. 6. 7.]
 [0. 0. 0. 0. 0. 1. 2.]]
