# Learning PyTorch for Multimodality

In [1]:
import torch

In [2]:
device = torch.device('cpu')

## Basic Model of Sequential Modules

Example from [here](https://github.com/jcjohnson/pytorch-examples#pytorch-optim)

In [3]:
N, D_in, Z_in, H, H2, D_out = 64, 400, 20, 300, 200, 10

x = torch.randn(N, D_in, device=device)
z = torch.randn(N, Z_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(50):
    y_pred = model(x)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 570.465576171875
1 558.5599365234375
2 546.862548828125
3 535.3700561523438
4 524.0850219726562
5 512.9972534179688
6 502.1164245605469
7 491.43743896484375
8 480.9502868652344
9 470.6551513671875
10 460.548095703125
11 450.6246337890625
12 440.8854064941406
13 431.3063049316406
14 421.8977355957031
15 412.66162109375
16 403.59527587890625
17 394.6910400390625
18 385.9486389160156
19 377.3605651855469
20 368.9128723144531
21 360.6111755371094
22 352.4601135253906
23 344.438720703125
24 336.5470886230469
25 328.78759765625
26 321.16632080078125
27 313.6796569824219
28 306.3223876953125
29 299.0881652832031
30 291.9808044433594
31 285.00579833984375
32 278.1521301269531
33 271.42120361328125
34 264.8191223144531
35 258.3395690917969
36 251.96961975097656
37 245.70846557617188
38 239.55307006835938
39 233.5180206298828
40 227.5930938720703
41 221.76150512695312
42 216.03688049316406
43 210.41720581054688
44 204.9068603515625
45 199.4998016357422
46 194.18519592285156
47 188.957153320312

In [53]:
model.state_dict().keys()

odict_keys(['0.weight', '0.bias', '2.weight', '2.bias'])

## Expanding to "Multi-Modal" with Custom Class

So rather than stringing together modules like above, we have to create a custom class for our model. Mainly, this is because we are concatenating multiple inputs.

This still hasn't been tested on real data, and I don't know all the "gotchas" of these modules yet, but this works as expected thus far, and creates weight matrices as expected.

In [4]:
class Multimodal(torch.nn.Module):
    def __init__(self, D_in, H, Z_in, H2, D_out):
        super(Multimodal, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.multi = torch.nn.Linear(H + Z_in, H2)
        self.linear2 = torch.nn.Linear(H2, D_out)
        
    def forward(self, x, z):
        h1 = self.linear1(x)
        h1_z = torch.cat([h1, z], dim=1)
        h2 = self.multi(h1_z)
        out = self.linear2(h2)
        return out

In [5]:
N, D_in, Z_in, H, H2, D_out = 64, 400, 20, 300, 200, 10

x = torch.randn(N, D_in, device=device)
z = torch.randn(N, Z_in, device=device)
y = torch.randn(N, D_out, device=device)

model = Multimodal(D_in, H, Z_in, H2, D_out)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(50):
    y_pred = model(x, z)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 694.5526123046875
1 677.2709350585938
2 660.2911376953125
3 643.60693359375
4 627.211669921875
5 611.0978393554688
6 595.2576293945312
7 579.6826782226562
8 564.3642578125
9 549.2939453125
10 534.463134765625
11 519.8637084960938
12 505.4876403808594
13 491.3275451660156
14 477.3764343261719
15 463.62786865234375
16 450.07611083984375
17 436.7160339355469
18 423.543212890625
19 410.5539245605469
20 397.7452087402344
21 385.1148376464844
22 372.6612854003906
23 360.3837890625
24 348.28228759765625
25 336.357421875
26 324.6104431152344
27 313.04327392578125
28 301.65838623046875
29 290.45880126953125
30 279.44793701171875
31 268.6296691894531
32 258.00823974609375
33 247.5880584716797
34 237.37384033203125
35 227.370361328125
36 217.5824432373047
37 208.01487731933594
38 198.67234802246094
39 189.55938720703125
40 180.68020629882812
41 172.03872680664062
42 163.63848876953125
43 155.48257446289062
44 147.57357788085938
45 139.91355895996094
46 132.50401306152344
47 125.34584045410156
4

For example, the "multi" weights have input weights of length 320 because we take 300 from the output of `linear1` + 20 from the auxillary input (e.g. z) and then we map this to the output of 200

In [6]:
model.state_dict().keys()

odict_keys(['linear1.weight', 'linear1.bias', 'multi.weight', 'multi.bias', 'linear2.weight', 'linear2.bias'])

In [7]:
model.state_dict().get('multi.weight').shape

torch.Size([200, 320])

In [8]:
print(model)

Multimodal(
  (linear1): Linear(in_features=400, out_features=300, bias=True)
  (multi): Linear(in_features=320, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=10, bias=True)
)


## Add Multimodality to FastAI

In [9]:
from fastai import *
from fastai.text import *

Bring in some sample data using fastai imdb

In [237]:
path = untar_data(URLs.IMDB_SAMPLE)
data_lm = TextLMDataBunch.from_csv(path)

for x, y in list(data_lm.train_dl): # just testing with one batch
    x, y
    
z = torch.randn(x.shape[0], x.shape[1], 10,
                device=device, requires_grad=True) # making up 10 "audio features"
z.shape

torch.Size([67, 64, 10])

In [238]:
vocab_sz = 20000
emb_sz = 400
n_hid = 1150
n_layers = 3
pad_token= 1
qrnn = False
bidir = False

dps = np.array([0.25, 0.1, 0.2, 0.02, 0.15])

hidden_p = dps[4]
input_p = dps[0]
embed_p = dps[3]
weight_p = dps[2]

tie_weights = True
output_p = dps[1]
bias = True

audio_sz = 10

# Create a full AWD-LSTM.
rnn_enc = RNNCore(vocab_sz=vocab_sz,
                  emb_sz=emb_sz,
                  n_hid=n_hid,
                  n_layers=n_layers,
                  pad_token=pad_token,
                  qrnn=qrnn,
                  bidir=bidir,
                  hidden_p=hidden_p,
                  input_p=input_p,
                  embed_p=embed_p,
                  weight_p=weight_p)

enc = rnn_enc.encoder if tie_weights else None
model = SequentialRNN(rnn_enc, LinearDecoder(vocab_sz, emb_sz, output_p, tie_encoder=enc, bias=bias))

In [239]:
out = model(x)

In [240]:
class MultiModalRNN(RNNCore):
    def __init__(self, audio_sz, **kwargs):
        super(MultiModalRNN, self).__init__(**kwargs)
        self.rnns = None
        self.audio_sz = audio_sz
        self.multimode = [nn.LSTM(emb_sz + audio_sz if l == 0 else n_hid,
                                  (n_hid if l != n_layers - 1 else emb_sz + audio_sz)//self.ndir,
                                  1, bidirectional=bidir) for l in range(n_layers)]
        self.multimode = [WeightDropout(rnn, weight_p) for rnn in self.multimode]
        self.multimode = torch.nn.ModuleList(self.multimode)
        
    def forward(self, input:LongTensor, input_audio:Tensor)->Tuple[Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
        raw_output = torch.cat([raw_output, input_audio], dim=2)
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.multimode, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs
    
    def _one_hidden(self, l:int)->Tensor:
        "Return one hidden state."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz + self.audio_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        [r.reset() for r in self.multimode if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
    
multimodal_rnn = MultiModalRNN(audio_sz=audio_sz,
                         vocab_sz=vocab_sz,
                         emb_sz=emb_sz,
                         n_hid=n_hid,
                         n_layers=n_layers,
                         pad_token=pad_token,
                         qrnn=qrnn,
                         bidir=bidir,
                         hidden_p=hidden_p,
                         input_p=input_p,
                         embed_p=embed_p,
                         weight_p=weight_p)

enc = multimodal_rnn.encoder if tie_weights else None
model = SequentialRNN(multimodal_rnn,
                      LinearDecoder(vocab_sz,
                                    emb_sz + audio_sz,
                                    output_p,
                                    tie_encoder=enc,
                                    bias=bias)).to(device)
model

SequentialRNN(
  (0): MultiModalRNN(
    (encoder): Embedding(20000, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(20000, 400, padding_idx=1)
    )
    (rnns): None
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
    (multimode): ModuleList(
      (0): WeightDropout(
        (module): LSTM(410, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(1150, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(1150, 410)
      )
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=410, out_features=20000, bias=True)
    (output_dp): RNNDropout()
  )
)

In [241]:
out = multimodal_rnn(x, z)

In [242]:
out[0][2].shape

torch.Size([67, 64, 410])

Thus far, I'm able to get multimodal_rnn to work, but it doesnt work when used with SequentialRNN. Pretty sure this is because `forward` is not registered properly with SequentialRNN.

Below, I attempt to include the decoder directly into the custom module

In [252]:
class MultiModalRNN(RNNCore):
    def __init__(self, audio_sz, output_p, bias, **kwargs):
        super(MultiModalRNN, self).__init__(**kwargs)
        self.rnns = None
        self.audio_sz = audio_sz
        self.multimode = [nn.LSTM(emb_sz + audio_sz if l == 0 else n_hid,
                                  (n_hid if l != n_layers - 1 else emb_sz + audio_sz)//self.ndir,
                                  1, bidirectional=bidir) for l in range(n_layers)]
        self.multimode = [WeightDropout(rnn, weight_p) for rnn in self.multimode]
        self.multimode = torch.nn.ModuleList(self.multimode)
        
        self.multidecoder = LinearDecoder(vocab_sz,
                                          emb_sz + audio_sz,
                                          output_p,
                                          tie_encoder=None,
                                          bias=bias)
        
    def forward(self, input:LongTensor, input_audio:Tensor)->Tuple[Tensor,Tensor,Tensor]:
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        raw_output = self.input_dp(self.encoder_dp(input))
        raw_output = torch.cat([raw_output, input_audio], dim=2)
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.multimode, self.hidden_dps)):
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            new_hidden.append(new_h)
            raw_outputs.append(raw_output)
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
        self.hidden = to_detach(new_hidden)
        
        output = self.multidecoder.output_dp(outputs[-1])
        decoded = self.multidecoder.decoder(output.view(output.size(0)*output.size(1),
                                                        output.size(2)))
        
        return decoded, raw_outputs, outputs
    
    def _one_hidden(self, l:int)->Tensor:
        "Return one hidden state."
        nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz + self.audio_sz)//self.ndir
        return self.weights.new(self.ndir, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states."
        [r.reset() for r in self.multimode if hasattr(r, 'reset')]
        self.weights = next(self.parameters()).data
        if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]
        else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
    
multimodal_rnn = MultiModalRNN(audio_sz=audio_sz,
                              vocab_sz=vocab_sz,
                              emb_sz=emb_sz,
                              n_hid=n_hid,
                              n_layers=n_layers,
                              pad_token=pad_token,
                              qrnn=qrnn,
                              bidir=bidir,
                              hidden_p=hidden_p,
                              input_p=input_p,
                              embed_p=embed_p,
                              weight_p=weight_p,
                              output_p=output_p,
                              bias=bias).to(device)

multimodal_rnn

MultiModalRNN(
  (encoder): Embedding(20000, 400, padding_idx=1)
  (encoder_dp): EmbeddingDropout(
    (emb): Embedding(20000, 400, padding_idx=1)
  )
  (rnns): None
  (input_dp): RNNDropout()
  (hidden_dps): ModuleList(
    (0): RNNDropout()
    (1): RNNDropout()
    (2): RNNDropout()
  )
  (multimode): ModuleList(
    (0): WeightDropout(
      (module): LSTM(410, 1150)
    )
    (1): WeightDropout(
      (module): LSTM(1150, 1150)
    )
    (2): WeightDropout(
      (module): LSTM(1150, 410)
    )
  )
  (multidecoder): LinearDecoder(
    (decoder): Linear(in_features=410, out_features=20000, bias=True)
    (output_dp): RNNDropout()
  )
)

In [253]:
out = multimodal_rnn(x, z)

In [254]:
out[0].shape

torch.Size([4288, 20000])

Now, we should be able to train the data here

In [255]:
multimodal_rnn.train()

loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')

learning_rate = 1e-4
optimizer = torch.optim.Adam(multimodal_rnn.parameters(), lr=learning_rate)

for t in range(50):
    y_pred = multimodal_rnn(x, z)
    
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    
    optimizer.zero_grad()
    
    loss.backward()
    
    optimizer.step()

ValueError: can't optimize a non-leaf Tensor

In [263]:
xx = list(multimodal_rnn.parameters())[0]
xx.is_con

True

In [264]:
for idx, thing in enumerate(multimodal_rnn.parameters()):
    print(idx, thing.is_leaf)

0 True
1 True
2 True
3 False
4 True
5 True
6 True
7 True
8 False
9 True
10 True
11 True
12 True
13 False
14 True
15 True
16 True
17 True


In [275]:
list(multimodal_rnn.parameters())[13]

tensor([[ 0.0504, -0.0437, -0.0204,  ...,  0.0562,  0.0344, -0.0210],
        [ 0.0370,  0.0000, -0.0000,  ..., -0.0072,  0.0238,  0.0026],
        [-0.0064,  0.0480, -0.0527,  ..., -0.0084,  0.0398, -0.0314],
        ...,
        [ 0.0469, -0.0468, -0.0339,  ...,  0.0566, -0.0047,  0.0132],
        [ 0.0168,  0.0000, -0.0549,  ...,  0.0170,  0.0000,  0.0153],
        [-0.0044,  0.0383, -0.0571,  ...,  0.0059, -0.0031,  0.0284]],
       grad_fn=<MulBackward0>)

So we get an error for those that are non-leaf nodes (tensors). Doing some investigating shows those that have a `grad_fn` are the culprits. These are most likely the tensors that are dropouts. Need to look at fastai and see how they handle these tensors...