In [2]:
import torch
import os

We want to take features from XceptionNet and the the Spectrogram features prepared by the audio

In [4]:
# load a spectrogram feature
spect_dir = '/home/jklc9f/data/dfdc/sample/train_spectrograms_part-5/real/name_video/name_video-000-24.pt'
t = torch.load(spect_dir)

Let's check where the relevant information exists in the spectrogram:

In [None]:
# import matplotlib.pyplot as plt

# # t = t.numpy()
# plt.imshow(t.transpose(), aspect='auto', origin='bottom', cmap='jet')

Let's create a dataset object for concatenated xception features and spectrogram features

In [5]:
from torch.utils.data import Dataset

class FrimagenetDataset(Dataset):
    '''
    FrimageNet data set for concatenating XceptionNet Features and Spectrogram features
    '''
    def __init__(self, spectrogram_folder, xception_features_folder):
        """
        Args:
            spectrogram_folder (string): Path to the csv file with annotations.
            xception_features_folder (string): Directory with all the images.
        """
        self.classification = []
        self.encode_map = {
            'real': 1,
            'fake': 0
        }
        self.features = self.__get_feats(spectrogram_folder, xception_features_folder)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.classification[idx]

    def __get_feats(self, spect_directory, xception_directory, seq_size=24, nfirst=25, max_spect_feats=700, max_xcept_feats=2048):
        samples = []
        labels = ['real', 'fake']
        for label in labels:
            xception_vidpaths = sorted([os.path.join(xception_directory, label, vid) for vid in os.listdir(os.path.join(xception_directory, label))])
            spect_vidpaths = sorted([os.path.join(spect_directory, label, vid) for vid in os.listdir(os.path.join(spect_directory, label))])

            for xcept_path, spect_path in zip(xception_vidpaths, spect_vidpaths):
                # loops through the paths to the video labels of xception features and spectrogram features folders
                sorted_vid_xcept = sorted(os.listdir(xcept_path))
                sorted_vid_spect = sorted(os.listdir(spect_path))

                for xcept_feat, spect_feat in zip(sorted_vid_xcept, sorted_vid_spect):
                    # loops throught the individual files in each respective video_id folder for the xception features and spectrogram features
                    if (xcept_feat != spect_feat):
                        # the labels are not identical, so alignment is off. Return error
                        print(f'{xcept_feat} != {spect_feat} ')
                        # raise NonAligned

                    if xcept_feat[-5:] == f'{seq_size}.pt':
                        xcept = torch.load(os.path.join(xcept_path, xcept_feat))[:, :max_xcept_feats]
                        spect = torch.load(os.path.join(spect_path, spect_feat))[:, :max_spect_feats]
                        samples.append(torch.cat((xcept, spect), dim=-1))
                        self.classification.append(torch.tensor(self.encode_map[label]))
        self.classification = torch.stack(self.classification)
        return torch.stack(samples)

And let us build our LSTM

In [6]:
from torch import nn
import torch.nn.functional as F

class FrimageNet(nn.Module):
    def __init__(self, feature_size, num_layers=2, hidden_dim=512, device='cuda'):
        super(FrimageNet, self).__init__()
        self.device = device
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # input dim is 167, output 200
        self.lstm = nn.LSTM(feature_size, hidden_dim,
                            batch_first=True, num_layers=num_layers)
        # fully connected
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.act = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_dim, 2)
        self.softmax = nn.Softmax()

    def forward(self, x, hidden):
#         print(x.device, hidden[0].device)
        y, hidden = self.lstm(x, hidden)    # returns the two outputs
        y = y[:, -1, :]  # get only the last output
        y = self.fc1(y)
        y = self.fc2(y)
        y = F.softmax(y, dim=1)

        return y, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(self.device),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden


Now let's get an example of this dataset

In [8]:
spectrogram_folder = r'J:\reu\code\output\spectrogram_features' 
xception_folder = r'J:\reu\code\output\xception_features'
data = FrimagenetDataset(spectrogram_folder, xception_folder)
data.features.shape

torch.Size([24, 24, 2748])

Awesome! We wanted the xception features, which are of size 2048, to be concatenated with the relevant data from spectrgram features, which, as defined by the FrimagenetDataset class, are the first 700 dimensions.

Now let's test out our LSTM with this data

In [18]:
#initialize model
net = FrimageNet(feature_size=2748) # feature size matches the last dimension of Dataset Features
net

FrimageNet(
  (lstm): LSTM(2748, 512, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=512, out_features=512, bias=True)
  (act): Sigmoid()
  (fc2): Linear(in_features=512, out_features=2, bias=True)
  (softmax): Softmax(dim=None)
)

Now let's create a function to simulate training our model

In [11]:
from torch.utils.data import DataLoader

def train(model, spectrogram_folder, xception_folder, loss_function, optimizer, epochs=100, batch_size=5, device='cuda'):
    training_data = FrimagenetDataset(spectrogram_folder, xception_folder)
    trainloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, drop_last=True)

    hidden = model.init_hidden(batch_size)
    for h in hidden:
        h = h.to(device)    
    
    print_every = 20
    i = 0
    losses = []
    accs = []
    running_loss = 0.0
    running_acc = 0.0

    for epoch in range(epochs):
        for inp, labels in trainloader:  # renamed sequence to inp because inp is a batch of sequences
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            inp = inp.float().to(device)
            labels = labels.to(device)
            
            # Step 2. Run our forward pass.
            tag_scores, h = model(inp, hidden)

            # Step 3. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss = loss_function(tag_scores, labels)
            loss.backward()
            optimizer.step()

            running_acc += torch.mean((tag_scores.argmax(dim=1) == labels).float()).item()

            # print statistics
            running_loss += loss.item()
            if i % print_every == print_every-1:
                print('[%d, %5d] loss: %.3f - acc: %.3f' %
                      (epoch + 1, i + 1, running_loss / print_every, running_acc * 100 / print_every))
                
                losses.append(running_loss / print_every)
                accs.append(running_acc * 100 / print_every)
                
                running_loss = 0.0
                running_acc = 0.0
            i += 1
    return losses, accs

In order to use this function, we need to define a lsot function and optimizer. Let's use Cross Entropy Loss and Adam, respectively

In [17]:
import torch.optim as optim
from torch import nn

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.0001)

Now before we train, let's make sure everything is on the same device (cuda)

In [19]:
device = 'cuda'
loss_function.to(device)
net.cuda()

FrimageNet(
  (lstm): LSTM(2748, 512, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=512, out_features=512, bias=True)
  (act): Sigmoid()
  (fc2): Linear(in_features=512, out_features=2, bias=True)
  (softmax): Softmax(dim=None)
)

In [22]:
train(net, spectrogram_folder, xception_folder, loss_function, optimizer)

[5,    20] loss: 0.693 - acc: 50.000
[10,    40] loss: 0.694 - acc: 51.000
[15,    60] loss: 0.693 - acc: 54.000
[20,    80] loss: 0.693 - acc: 53.000
[25,   100] loss: 0.693 - acc: 50.000
[30,   120] loss: 0.693 - acc: 49.000
[35,   140] loss: 0.693 - acc: 51.000
[40,   160] loss: 0.694 - acc: 50.000
[45,   180] loss: 0.693 - acc: 53.000
[50,   200] loss: 0.693 - acc: 49.000
[55,   220] loss: 0.694 - acc: 45.000
[60,   240] loss: 0.693 - acc: 48.000
[65,   260] loss: 0.694 - acc: 46.000
[70,   280] loss: 0.693 - acc: 48.000
[75,   300] loss: 0.693 - acc: 54.000
[80,   320] loss: 0.694 - acc: 49.000
[85,   340] loss: 0.693 - acc: 50.000
[90,   360] loss: 0.693 - acc: 52.000
[95,   380] loss: 0.693 - acc: 50.000
[100,   400] loss: 0.693 - acc: 51.000


([0.693226546049118,
  0.6935099601745606,
  0.6929204791784287,
  0.6926646858453751,
  0.6930759340524674,
  0.6933588236570358,
  0.6931680500507355,
  0.6935722380876541,
  0.6926412552595138,
  0.6933632642030716,
  0.6935172408819199,
  0.69336057305336,
  0.6936771124601364,
  0.6931857109069824,
  0.6927893221378326,
  0.6935473531484604,
  0.6931888729333877,
  0.6930012255907059,
  0.6932780593633652,
  0.693035489320755],
 [50.00000149011612,
  51.00000157952309,
  54.00000140070915,
  53.0000014603138,
  50.00000134110451,
  49.00000125169754,
  51.00000128149986,
  50.00000096857548,
  53.00000101327896,
  49.0000007301569,
  45.00000096857548,
  48.00000123679638,
  46.000001057982445,
  48.00000123679638,
  54.000001326203346,
  49.00000110268593,
  50.0000012665987,
  52.00000137090683,
  50.00000111758709,
  51.000001430511475])

And just like that, with fake data and a super small sample, we have put together an LSTM!