# Learning PyTorch for Multimodality

In [1]:
import torch

In [2]:
device = torch.device('cuda')

## Basic Model of Sequential Modules

Example from [here](https://github.com/jcjohnson/pytorch-examples#pytorch-optim)

In [3]:
N, D_in, Z_in, H, H2, D_out = 64, 400, 20, 300, 200, 10

x = torch.randn(N, D_in, device=device)
z = torch.randn(N, Z_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(10):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 636.4593505859375
1 623.7313232421875
2 611.2108764648438
3 598.9217529296875
4 586.8594360351562
5 575.022705078125
6 563.3897705078125
7 551.9767456054688
8 540.7965698242188
9 529.8015747070312


In [4]:
model.state_dict().keys()

odict_keys(['0.weight', '0.bias', '2.weight', '2.bias'])

In [5]:
model.state_dict().get('0.bias').shape

torch.Size([300])

## Expanding to "Multi-Modal" with Custom Class

So rather than stringing together modules like above, we have to create a custom class for our model. Mainly, this is because we are concatenating multiple inputs.

This still hasn't been tested on real data, and I don't know all the "gotchas" of these modules yet, but this works as expected thus far, and creates weight matrices as expected.

In [23]:
class Multimodal(torch.nn.Module):
    def __init__(self, D_in, H, Z_in, H2, D_out):
        super(Multimodal, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.multi = torch.nn.Linear(H + Z_in, H2)
        self.linear2 = torch.nn.Linear(H2, D_out)
        
    def forward(self, x, z):
        h1 = self.linear1(x)
        h1_z = torch.cat([h1, z], dim=1)
        h2 = self.multi(h1_z)
        out = self.linear2(h2)
        return out

In [24]:
N, D_in, Z_in, H, H2, D_out = 64, 400, 20, 300, 200, 10

x = torch.randn(N, D_in, device=device)
z = torch.randn(N, Z_in, device=device)
y = torch.randn(N, D_out, device=device)

model = Multimodal(D_in, H, Z_in, H2, D_out)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(10):
    y_pred = model(x, z)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #4 'mat1'

For example, the "multi" weights have input weights of length 320 because we take 300 from the output of `linear1` + 20 from the auxillary input (e.g. z) and then we map this to the output of 200

In [8]:
model.state_dict().keys()

odict_keys(['linear1.weight', 'linear1.bias', 'multi.weight', 'multi.bias', 'linear2.weight', 'linear2.bias'])

In [9]:
model.state_dict().get('multi.weight').shape

torch.Size([200, 320])

In [10]:
print(model)

Multimodal(
  (linear1): Linear(in_features=400, out_features=300, bias=True)
  (multi): Linear(in_features=320, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=10, bias=True)
)


## Manual run of FastAI

In [6]:
from fastai import *
from fastai.text import *

Bring in some sample data using fastai imdb

In [7]:
path = untar_data(URLs.IMDB_SAMPLE)
data_lm = TextLMDataBunch.from_csv(path)

for x, y in list(data_lm.train_dl): # just testing with one batch
    x, y
    
z = torch.randn(x.shape[0], x.shape[1], 10,
                device=device, requires_grad=True) # making up 10 "audio features"
z.shape

torch.Size([72, 64, 10])

In [8]:
original_learn = RNNLearner.language_model(data_lm)
{k:v.shape for (k,v) in original_learn.model.state_dict().items()}

{'0.encoder.weight': torch.Size([5999, 400]),
 '0.encoder_dp.emb.weight': torch.Size([5999, 400]),
 '0.rnns.0.weight_hh_l0_raw': torch.Size([4600, 1150]),
 '0.rnns.0.module.weight_ih_l0': torch.Size([4600, 400]),
 '0.rnns.0.module.weight_hh_l0': torch.Size([4600, 1150]),
 '0.rnns.0.module.bias_ih_l0': torch.Size([4600]),
 '0.rnns.0.module.bias_hh_l0': torch.Size([4600]),
 '0.rnns.1.weight_hh_l0_raw': torch.Size([4600, 1150]),
 '0.rnns.1.module.weight_ih_l0': torch.Size([4600, 1150]),
 '0.rnns.1.module.weight_hh_l0': torch.Size([4600, 1150]),
 '0.rnns.1.module.bias_ih_l0': torch.Size([4600]),
 '0.rnns.1.module.bias_hh_l0': torch.Size([4600]),
 '0.rnns.2.weight_hh_l0_raw': torch.Size([1600, 400]),
 '0.rnns.2.module.weight_ih_l0': torch.Size([1600, 1150]),
 '0.rnns.2.module.weight_hh_l0': torch.Size([1600, 400]),
 '0.rnns.2.module.bias_ih_l0': torch.Size([1600]),
 '0.rnns.2.module.bias_hh_l0': torch.Size([1600]),
 '1.decoder.weight': torch.Size([5999, 400]),
 '1.decoder.bias': torch.Size(

In [9]:
original_learn.save('imdb-model-test')

In [10]:
with open(original_learn.path/original_learn.model_dir/'imdb-itos.pkl', 'wb') as f:
    pickle.dump(original_learn.data.train_ds.vocab.itos, f)
    

First, let's make sure we can run the same model as fastai manually...

In [13]:
vocab_sz = 5999
emb_sz = 400
n_hid = 1150
n_layers = 3
pad_token= 1
qrnn = False
bidir = False

dps = np.array([0.25, 0.1, 0.2, 0.02, 0.15])

hidden_p = dps[4]
input_p = dps[0]
embed_p = dps[3]
weight_p = dps[2]

tie_weights = True
output_p = dps[1]
bias = True

audio_sz = 10

# Create a full AWD-LSTM.
rnn_enc = RNNCore(vocab_sz=vocab_sz,
                  emb_sz=emb_sz,
                  n_hid=n_hid,
                  n_layers=n_layers,
                  pad_token=pad_token,
                  qrnn=qrnn,
                  bidir=bidir,
                  hidden_p=hidden_p,
                  input_p=input_p,
                  embed_p=embed_p,
                  weight_p=weight_p)

enc = rnn_enc.encoder if tie_weights else None
model = SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

In [14]:
out = model(x)

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'

In [571]:
model.train()
model.reset()

loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

learning_rate = 1e-4

opt_params = model.parameters()

# this is a hack for now... not sure if this messes up the graph somewhere by doing this
# opt_params = [par for par in model.parameters() if par.is_leaf]
optimizer = torch.optim.Adam(opt_params, lr=learning_rate)

y_pred = model(x)

for t in range(1):
    y_pred = model(x)[0]
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 8.698554992675781


Next, try porting this into the fastai learner object

In [18]:
# model.reset()
# learn = RNNLearner(data_lm, model)
# learn.fit(1)

## Add Multimodality to FastAI

Thus far, I'm able to get multimodal_rnn to work, but it doesnt work when used with SequentialRNN. Pretty sure this is because `forward` is not registered properly with SequentialRNN.

Below, I attempt to include the decoder directly into the custom module

In [15]:
class MultiModalRNN(RNNCore):
    def __init__(self, audio_sz, output_p, bias, tie_encoder:bool=True, **kwargs):
        super(MultiModalRNN, self).__init__(**kwargs)
        self.rnns = None
        self.audio_sz = audio_sz
        self.multimode = [nn.LSTM(emb_sz + audio_sz if l == 0 else n_hid,
                                  (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
                                  1, bidirectional=bidir) for l in range(n_layers)]
        self.multimode = [WeightDropout(rnn, weight_p) for rnn in self.multimode]
        self.multimode = torch.nn.ModuleList(self.multimode)
        
        if tie_encoder:
            enc = self.encoder
        else:
            enc = None
        
        self.multidecoder = LinearDecoder(vocab_sz,
                                          emb_sz,
                                          output_p,
                                          tie_encoder=enc,
                                          bias=bias)
        
    def forward(self, input:LongTensor, input_audio:Tensor)->Tuple[Tensor,Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
        raw_output = torch.cat([raw_output, input_audio], dim=2)
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.multimode, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        
        output = self.multidecoder.output_dp(outputs[-1])
        decoded = self.multidecoder.decoder(output.view(output.size(0)*output.size(1),
                                                        output.size(2)))
        
        return decoded, raw_outputs, outputs
    
    def _one_hidden(self, l:int)->Tensor:
        "Return one hidden state."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        [r.reset() for r in self.multimode if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
    
multimodal_rnn = MultiModalRNN(audio_sz=audio_sz,
                              vocab_sz=vocab_sz,
                              emb_sz=emb_sz,
                              n_hid=n_hid,
                              n_layers=n_layers,
                              pad_token=pad_token,
                              qrnn=qrnn,
                              bidir=bidir,
                              hidden_p=hidden_p,
                              input_p=input_p,
                              embed_p=embed_p,
                              weight_p=weight_p,
                              output_p=output_p,
                              bias=bias,
                              tie_encoder=tie_weights).to(device)

multimodal_rnn

MultiModalRNN(
  (encoder): Embedding(5999, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(5999, 400, padding_idx=1)
  )
  (rnns): None
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
  (multimode): ModuleList(
    (0): WeightDropout(
      (module): LSTM(410, 1150)
    )
    (1): WeightDropout(
      (module): LSTM(1150, 1150)
    )
    (2): WeightDropout(
      (module): LSTM(1150, 400)
    )
  )
  (multidecoder): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=5999, bias=True)
    (output_dp): RNNDropout()
  )
)

In [16]:
out = multimodal_rnn(x, z)

In [19]:
class MultiLinearDecoder(nn.Module):
    "To go on top of a RNNCore module and create a Language Model."

    initrange=0.1

    def __init__(self, n_out:int, n_hid:int, audio_sz:int,
                 output_p:float, tie_encoder:nn.Module=None,
                 bias:bool=True):
        super().__init__()
        self.decoder = nn.Linear(n_hid + audio_sz, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.output_dp = RNNDropout(output_p)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input:Tuple[Tensor,Tensor])->Tuple[Tensor,Tensor,Tensor]:
        raw_outputs, outputs = input
        output = self.output_dp(outputs[-1])
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, raw_outputs, outputs

In [47]:
class MultiModalPostRNN(RNNCore):
    def __init__(self, audio_sz, output_p, bias, tie_encoder:bool=False, **kwargs):
        super(MultiModalPostRNN, self).__init__(**kwargs)
        self.rnns = None
        self.multimode = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
            1, bidirectional=bidir) for l in range(n_layers)]
        self.multimode = [WeightDropout(rnn, weight_p) for rnn in self.multimode]
        self.multimode = torch.nn.ModuleList(self.multimode)
        self.audio_sz = audio_sz
#         self.multimode = [nn.LSTM(emb_sz + audio_sz if l == 0 else n_hid,
#                                   (n_hid if l != n_layers - 1 else emb_sz)//self.ndir,
#                                   1, bidirectional=bidir) for l in range(n_layers)]
#         self.multimode = [WeightDropout(rnn, weight_p) for rnn in self.multimode]
#         self.multimode = torch.nn.ModuleList(self.multimode)
        
        if tie_encoder:
            enc = self.encoder
        else:
            enc = None
        
        self.multidecoder = MultiLinearDecoder(vocab_sz,
                                               emb_sz,
                                               audio_sz,
                                               output_p,
                                               tie_encoder=enc,
                                               bias=bias)
        
    def forward(self, input:LongTensor, input_audio:Tensor)->Tuple[Tensor,Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
#         raw_output = torch.cat([raw_output, input_audio], dim=2)
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.multimode, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        
        output = self.multidecoder.output_dp(outputs[-1])
        output = torch.cat([raw_output, input_audio], dim=2)
        decoded = self.multidecoder.decoder(output.view(output.size(0)*output.size(1),
                                                        output.size(2)))
        
        return decoded, raw_outputs, outputs
    
    def _one_hidden(self, l:int)->Tensor:
        "Return one hidden state."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        [r.reset() for r in self.multimode if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
    
multimodal_rnn = MultiModalPostRNN(audio_sz=audio_sz,
                              vocab_sz=vocab_sz,
                              emb_sz=emb_sz,
                              n_hid=n_hid,
                              n_layers=n_layers,
                              pad_token=pad_token,
                              qrnn=qrnn,
                              bidir=bidir,
                              hidden_p=hidden_p,
                              input_p=input_p,
                              embed_p=embed_p,
                              weight_p=weight_p,
                              output_p=output_p,
                              bias=bias,
                              tie_encoder=False).to(device)

multimodal_rnn

MultiModalPostRNN(
  (encoder): Embedding(5999, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(5999, 400, padding_idx=1)
  )
  (rnns): None
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
  (multimode): ModuleList(
    (0): WeightDropout(
      (module): LSTM(400, 1150)
    )
    (1): WeightDropout(
      (module): LSTM(1150, 1150)
    )
    (2): WeightDropout(
      (module): LSTM(1150, 400)
    )
  )
  (multidecoder): MultiLinearDecoder(
    (decoder): Linear(in_features=410, out_features=5999, bias=True)
    (output_dp): RNNDropout()
  )
)

In [48]:
out = multimodal_rnn(x, z)

In [46]:
out[0]

tensor([[-0.0027,  0.2645,  0.2557,  ..., -0.0856, -0.0201, -0.1912],
        [ 0.1005, -0.0032,  0.2590,  ...,  0.2249,  0.1127, -0.0132],
        [-0.2534, -0.3319, -0.3910,  ...,  0.0995,  0.1720,  0.3377],
        ...,
        [-0.2927, -0.1935, -0.2345,  ...,  0.1611, -0.1844,  0.1487],
        [-0.0127,  0.0996, -0.0252,  ..., -0.1012,  0.0079, -0.2184],
        [-0.1856, -0.0598, -0.1102,  ...,  0.0976,  0.1709,  0.3871]],
       device='cuda:0', grad_fn=<ThAddmmBackward>)

In [11]:
out = multimodal_rnn(x, z)

In [12]:
out[0].shape

torch.Size([4160, 5999])

Now, we should be able to train the data here

In [13]:
multimodal_rnn.train()
multimodal_rnn.reset()

loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

learning_rate = 1e-4

opt_params = multimodal_rnn.parameters()

# this is a hack for now... not sure if this messes up the graph somewhere by doing this
# opt_params = [par for par in multimodal_rnn.parameters() if par.is_leaf]
optimizer = torch.optim.Adam(opt_params, lr=learning_rate)

y_pred = multimodal_rnn(x, z)

for t in range(1):
    y_pred = multimodal_rnn(x, z)[0]
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 36194.5390625


~~So we get an error for those that are non-leaf nodes (tensors). Doing some investigating shows those that have a `grad_fn` are the culprits. These are most likely the tensors that are dropouts. Need to look at fastai and see how they handle these tensors...~~

~~For now I used the code above to subset to `is_leaf == False`. I'm not sure if this "is leaf" is a symptom of something that's not working right or if this is, in fact, what we should do.~~

Using `model.reset()` solves the above issue.

In the original `RNNCore` class I'm a bit confused by what is happening for `self.rnns` still.. but assuming this is fine for now.

Now, let's make training happen with the learner framework... First step is data.

The following is a new class called `AudioDataset`. To initialize, we have to link it to a `TextDataset` instance, and the two will be used together to initialize `MultimodalDataLoader`. The reason the Datesets must align, is so that when we create bptt batches in the DataLoader object, the audio attributes remain the same for songs

In [573]:
class AudioDataset(torch.utils.data.Dataset):
    '''
    PyTorch Dataset for Aggregate Audio features from MSD
    '''
    def __init__(self, data:np.ndarray, text_link:TextDataset):
        '''
        Args
        ----
          data (2-darray) : a 2-d array of (n, features). n must be the same
                            size as the length of `text_link`
          text_link       : an instance of `TextDataset` from fastai to link
                            with. This is necessary to create a dataloader
                            of language and corresponding audio features
        '''
        # audio features
        self.data = data
        
        # linked song lyrics
        self.text_link = text_link
        assert len(self.text_link) == len(self.data),\
        "Number of examples must be the same."
        
        # return the length of each song
        self.text_length = self._get_text_length()
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, idx:int):
        return self.data[idx]
    
    def _get_text_length(self):
        '''
        An list of size n containing the length of lyrics for each song in `text_link`
        '''
        return [len(song[0]) for song in self.text_link]
    
    @property
    def feature_size(self):
        return self.data.shape[1]

In [532]:
wts = torch.load(original_learn.path/original_learn.model_dir/'imdb-model-test.pth')


In [574]:
import copy
text_data = copy.copy(data_lm.train_ds)
audio_features = np.random.normal(size=(800,audio_sz))

data_audio = AudioDataset(audio_features, text_data)

In [575]:
class MultimodalDataLoader(LanguageModelLoader):
    """
    A data loader for language model with bptt with augmented audio features
    """
    def __init__(self, audio_dataset, *args, **kwargs):
        super(MultimodalDataLoader, self).__init__(*args, **kwargs)
        self.audio_dataset = audio_dataset
        self.audio_data = self.batchify_audio(audio_dataset)
        
        
    def batchify_audio(self, audio_data):
        '''
        Repeats audio features for each word in song and batchifies
        '''
        nb = np.sum(audio_data.text_length) // self.bs # words per batch
        
        # repeat audio features
        repeated_audio = np.empty((0, self.audio_dataset.feature_size))
        for song in zip(audio_data.data, audio_data.text_length):
            features, song_length = song
            repeated_audio = np.append(repeated_audio,
                                       np.tile(features, (song_length, 1)),
                                       axis=0)
        
        # reshape to (nb * bs)
        audio = repeated_audio[:nb*self.bs]\
            .reshape(-1, self.bs, self.audio_dataset.feature_size)
        return Tensor(audio)
    
    def get_batch(self, i:int, seq_len:int) -> Tuple[LongTensor, Tensor, LongTensor]:
        "Create a batch at `i` of a given `seq_len`."
        seq_len = min(seq_len, len(self.data) - 1 - i)
        return ((self.data[i:i+seq_len],
                self.audio_data[i:i+seq_len]),
                self.data[i+1:i+1+seq_len].contiguous().view(-1))
        

In [576]:
multi_data = MultimodalDataLoader(audio_dataset=data_audio, dataset=text_data)
# just doubling up the training data for now
multi_db = DataBunch(multi_data, multi_data, device=device)

In [377]:
x1, y1 = next(iter(multi_data))

In [380]:
x1[1].shape

torch.Size([65, 64, 15])

Now, let's see what we can do about training... Bringing in some lower-level fastai functions and modifying

In [352]:
# def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None,
#                cb_handler:Optional[CallbackHandler]=None)->Tuple[Union[Tensor,int,float,str]]:
#     "Calculate loss and metrics for a batch, call out to callbacks as necessary."
#     cb_handler = ifnone(cb_handler, CallbackHandler())
#     if not is_listy(xb): xb = [xb]
#     if not is_listy(yb): yb = [yb]
#     out = model(*xb)
#     out = cb_handler.on_loss_begin(out)

#     if not loss_func: return to_detach(out), yb[0].detach()
#     loss = loss_func(out, *yb)

#     if opt is not None:
#         loss = cb_handler.on_backward_begin(loss)
#         loss.backward()
#         cb_handler.on_backward_end()
#         opt.step()
#         cb_handler.on_step_end()
#         opt.zero_grad()

#     return loss.detach().cpu()

In [362]:
# multimodal_rnn.train()
# multimodal_rnn.reset()

# loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
# learning_rate = 1e-4
# opt_params = multimodal_rnn.parameters()
# optimizer = torch.optim.Adam(opt_params, lr=learning_rate)

# xb, yb = next(iter(multi_data))

# # loss_batch(multimodal_rnn, xb, yb, loss_fn, optimizer)
# xxx = multimodal_rnn(*xb)

In [577]:
# multimodal_rnn.train()
# multimodal_rnn.reset()
# loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
# learning_rate = 1e-4
# opt_params = multimodal_rnn.parameters()

# optimizer = torch.optim.Adam(opt_params, lr=learning_rate)

learn = RNNLearner(multi_db, multimodal_rnn)



In [367]:
# learn.fit(1)

epoch,train_loss,valid_loss,accuracy
,,,


KeyboardInterrupt: 

In [551]:
multimodal_rnn.state_dict()
{k:v.shape for (k,v) in learn.model.state_dict().items()}

{'encoder.weight': torch.Size([5999, 400]),
 'encoder_dp.emb.weight': torch.Size([5999, 400]),
 'multidecoder.decoder.bias': torch.Size([5999]),
 'multidecoder.decoder.weight': torch.Size([5999, 400]),
 'multimode.0.module.bias_hh_l0': torch.Size([4600]),
 'multimode.0.module.bias_ih_l0': torch.Size([4600]),
 'multimode.0.module.weight_hh_l0': torch.Size([4600, 1150]),
 'multimode.0.module.weight_ih_l0': torch.Size([4600, 410]),
 'multimode.0.weight_hh_l0_raw': torch.Size([4600, 1150]),
 'multimode.1.module.bias_hh_l0': torch.Size([4600]),
 'multimode.1.module.bias_ih_l0': torch.Size([4600]),
 'multimode.1.module.weight_hh_l0': torch.Size([4600, 1150]),
 'multimode.1.module.weight_ih_l0': torch.Size([4600, 1150]),
 'multimode.1.weight_hh_l0_raw': torch.Size([4600, 1150]),
 'multimode.2.module.bias_hh_l0': torch.Size([1600]),
 'multimode.2.module.bias_ih_l0': torch.Size([1600]),
 'multimode.2.module.weight_hh_l0': torch.Size([1600, 400]),
 'multimode.2.module.weight_ih_l0': torch.Size([

In [500]:
PRETRAINED_TO_MULTI = {
    '0.encoder.weight': 'encoder.weight',
    '0.encoder_dp.emb.weight': 'encoder_dp.emb.weight',
    '0.rnns.0.weight_hh_l0_raw': 'multimode.0.weight_hh_l0_raw',
    '0.rnns.0.module.weight_ih_l0': 'multimode.0.module.weight_ih_l0',
    '0.rnns.0.module.weight_hh_l0': 'multimode.0.module.weight_hh_l0',
    '0.rnns.0.module.bias_ih_l0': 'multimode.0.module.bias_ih_l0',
    '0.rnns.0.module.bias_hh_l0': 'multimode.0.module.bias_hh_l0',
    '0.rnns.1.weight_hh_l0_raw': 'multimode.1.weight_hh_l0_raw',
    '0.rnns.1.module.weight_ih_l0': 'multimode.1.module.weight_ih_l0',
    '0.rnns.1.module.weight_hh_l0': 'multimode.1.module.weight_hh_l0',
    '0.rnns.1.module.bias_ih_l0': 'multimode.1.module.bias_ih_l0',
    '0.rnns.1.module.bias_hh_l0': 'multimode.1.module.bias_hh_l0', 
    '0.rnns.2.weight_hh_l0_raw': 'multimode.2.weight_hh_l0_raw', 
    '0.rnns.2.module.weight_ih_l0': 'multimode.2.module.weight_ih_l0',
    '0.rnns.2.module.weight_hh_l0': 'multimode.2.module.weight_hh_l0',
    '0.rnns.2.module.bias_ih_l0': 'multimode.2.module.bias_ih_l0',
    '0.rnns.2.module.bias_hh_l0': 'multimode.2.module.bias_hh_l0',
    '1.decoder.weight': 'multidecoder.decoder.weight',
    '1.decoder.bias': 'multidecoder.decoder.bias'
}

In [503]:
for k, v in PRETRAINED_TO_MULTI.items():
    print(k,v)

0.encoder.weight encoder.weight
0.encoder_dp.emb.weight encoder_dp.emb.weight
0.rnns.0.weight_hh_l0_raw multimode.0.weight_hh_l0_raw
0.rnns.0.module.weight_ih_l0 multimode.0.module.weight_ih_l0
0.rnns.0.module.weight_hh_l0 multimode.0.module.weight_hh_l0
0.rnns.0.module.bias_ih_l0 multimode.0.module.bias_ih_l0
0.rnns.0.module.bias_hh_l0 multimode.0.module.bias_hh_l0
0.rnns.1.weight_hh_l0_raw multimode.1.weight_hh_l0_raw
0.rnns.1.module.weight_ih_l0 multimode.1.module.weight_ih_l0
0.rnns.1.module.weight_hh_l0 multimode.1.module.weight_hh_l0
0.rnns.1.module.bias_ih_l0 multimode.1.module.bias_ih_l0
0.rnns.1.module.bias_hh_l0 multimode.1.module.bias_hh_l0
0.rnns.2.weight_hh_l0_raw multimode.2.weight_hh_l0_raw
0.rnns.2.module.weight_ih_l0 multimode.2.module.weight_ih_l0
0.rnns.2.module.weight_hh_l0 multimode.2.module.weight_hh_l0
0.rnns.2.module.bias_ih_l0 multimode.2.module.bias_ih_l0
0.rnns.2.module.bias_hh_l0 multimode.2.module.bias_hh_l0
1.decoder.weight multidecoder.decoder.weight
1.de

In [578]:
def map_weights(learner, pretrained_weights, pretrained_vocab, pretrained_to_multi):
    """
    Special case of loading pretrained weights for multimodal model. 
    Does the following:
        1. Convert encoder/decoder vocab weights
        2. Map the names of the weights from the pretrained wikitext103 to multimodal
        3. For the multimodal weights, random initialize and concatenate
        4. Load Weights
    """
    # 1)
    old_itos = pickle.load(open(pretrained_vocab, 'rb'))
    old_stoi = {v:k for k,v in enumerate(old_itos)}
    wgts = torch.load(pretrained_weights, map_location=lambda storage, loc: storage)
    wgts = convert_weights(wgts, old_stoi, learner.data.train_ds.vocab.itos)
    
    #2)
    for k, v in pretrained_to_multi.items():
        wgts[v] = wgts.pop(k)
        
    #3)
    h1_dim = wgts['multimode.0.module.weight_ih_l0'].shape[0]
    random_init = torch.randn(h1_dim, learner.model.audio_sz)
    h1_z1 = torch.cat([wgts['multimode.0.module.weight_ih_l0'],
                       random_init], dim=1)
    wgts['multimode.0.module.weight_ih_l0'] = h1_z1
    
    #4)
    learner.model.load_state_dict(wgts)
    
#     return random_init

In [579]:
zz = map_weights(learn,
            original_learn.path/original_learn.model_dir/'imdb-model-test.pth',
            original_learn.path/original_learn.model_dir/'imdb-itos.pkl',
            PRETRAINED_TO_MULTI)
zz

In [580]:
learn.fit(1)

Total time: 15:11
epoch  train_loss  valid_loss  accuracy
1      6.305592    6.226836    0.058895  (15:11)



In [550]:
{k:v.shape for (k,v) in zz.items()}

AttributeError: 'NoneType' object has no attribute 'items'

In [518]:
zz['multimode.0.module.weight_ih_l0'].shape

torch.Size([4600, 400])

In [476]:
zz = next(multimodal_rnn.named_parameters())[1]
zz.data = torch.zeros(20000, 400, device=device)

In [478]:
zz = next(multimodal_rnn.named_parameters())[1]
zz

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)

In [342]:
next(iter(multi_data))

(tensor([[  42, 2435,    0,  ...,   16,   21,   27],
         [  39,    9,   21,  ...,  819,    0,   11],
         [4612,    0,    6,  ...,   13,   16,   80],
         ...,
         [   9,    8,    3,  ...,   22,    2,  727],
         [   6,  404,   24,  ...,    5, 1913,   61],
         [ 595,   10,   13,  ..., 1239,  219,   30]]),
 tensor([[[-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          ...,
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113]],
 
         [[-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],
          [-0.8940, -0.9863,  1.0023,  ..., -0.0324, -1.4640,  1.0113],


In [330]:
aa = np.arange(1, (300*10*64)+1).reshape(300*64, 10)
aa = aa.reshape(-1, 64, 10)
aa[0:20, :, :]

array([[[    1,     2,     3,     4, ...,     7,     8,     9,    10],
        [   11,    12,    13,    14, ...,    17,    18,    19,    20],
        [   21,    22,    23,    24, ...,    27,    28,    29,    30],
        [   31,    32,    33,    34, ...,    37,    38,    39,    40],
        ...,
        [  601,   602,   603,   604, ...,   607,   608,   609,   610],
        [  611,   612,   613,   614, ...,   617,   618,   619,   620],
        [  621,   622,   623,   624, ...,   627,   628,   629,   630],
        [  631,   632,   633,   634, ...,   637,   638,   639,   640]],

       [[  641,   642,   643,   644, ...,   647,   648,   649,   650],
        [  651,   652,   653,   654, ...,   657,   658,   659,   660],
        [  661,   662,   663,   664, ...,   667,   668,   669,   670],
        [  671,   672,   673,   674, ...,   677,   678,   679,   680],
        ...,
        [ 1241,  1242,  1243,  1244, ...,  1247,  1248,  1249,  1250],
        [ 1251,  1252,  1253,  1254, ...,  1257, 

In [166]:
# new_array = np.empty((232543, 10))
new_array = np.empty((0, 10))
for song in zip(data_audio.data, tl):
    features, song_length = song
    new_array = np.append(new_array, np.tile(features, (song_length,1)), axis=0)

new_array.shape

(232543, 10)

In [210]:
new_array[:90]

array([[-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       ...,
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [-0.893972, -0.986326,  1.002337,  1.44418 , ..., -0.701897, -0.032365, -1.464049,  1.011268],
       [ 0.35043 ,  1.06222 , -0.450322, -0.047755, ..., -0.18382 , -1.813732,  1.08301 , -0.740351]])

In [276]:
new_array[:3633*64].reshape(64, 10, -1).T[:,:,0]

array([[-8.939716e-01, -4.775463e-02, -1.150684e+00, -2.796405e-01, ...,  8.273797e-01,  9.500533e-01, -8.594624e-01,
        -1.653091e-01],
       [-9.863259e-01, -4.900306e-01, -1.747391e+00, -5.196730e-01, ..., -2.519979e-01,  2.382013e+00,  4.387546e-01,
        -4.724991e-01],
       [ 1.002337e+00, -1.582357e-01,  7.213377e-01,  1.400212e+00, ..., -1.941280e+00,  1.537789e-01,  4.615265e-01,
        -1.079530e-01],
       [ 1.444180e+00, -1.838197e-01, -1.130803e-01,  4.847791e-01, ..., -9.453367e-01,  1.841860e+00,  1.183999e-01,
        -2.690129e-01],
       ...,
       [-7.403514e-01,  3.171212e-01,  5.168427e-01, -1.010780e+00, ...,  1.093102e+00,  6.304512e-01, -5.211421e-01,
        -4.872220e-01],
       [ 3.504297e-01,  1.105425e+00,  4.489977e-01, -1.510797e+00, ...,  2.343389e-01, -8.119963e-01, -2.440054e+00,
         1.993736e-01],
       [ 1.062220e+00,  8.832193e-01, -2.157157e+00,  3.181909e-01, ..., -8.850293e-01,  1.482015e+00,  1.277619e+00,
         1.404804e

In [179]:
np.sum(data_audio.text_length) // 64

3633

In [55]:
dd = np.concatenate(text_data.ids)
nb = dd.shape[0] // 64
dt = np.array(dd[:nb*64]).reshape(64, -1).T

In [70]:
(len(dt)-1) // 70

51

In [231]:
dt[:, 0]

array([  42,   39, 4612,   18, ...,  489,  243,  146,    2])

In [300]:
dt

array([[  42, 2435,    0,   85, ...,   82,   16,   21,   27],
       [  39,    9,   21,   14, ...,    6,  819,    0,   11],
       [4612,    0,    6,   25, ...,   22,   13,   16,   80],
       [  18,   59, 1581,    4, ...,    3,   23, 2692,    0],
       ...,
       [ 489,  100,   12,  849, ...,   10,   50,  905, 1216],
       [ 243,   36,  163,  140, ...,    3,   17,   61,   47],
       [ 146,   99,   17,   13, ...,   19,   93,   30,    6],
       [   2,    2,  381,   32, ...,   10, 1336,    0,  979]])

In [135]:
text_length

NameError: name 'text_length' is not defined

In [137]:
np.sum([10, 20, 30])

60