In [1]:
import torch
from torch.utils.data import DataLoader, Dataset

In [2]:
class KWS_dataset(Dataset):
    """Map-style dataset pairing each input entry with the entry at the same index in the outputs.

    Both containers are stored as given; no copying, conversion, or length
    validation is performed.
    """

    def __init__(self, input_data, output_data):
        """Keep references to the indexable inputs and their matching outputs."""
        self.input_data, self.output_data = input_data, output_data

    def __len__(self):
        """Return the number of entries, as measured on ``input_data``."""
        size = len(self.input_data)
        return size

    def __getitem__(self, index):
        """Return ``(input_data[index], output_data[index])``."""
        # The output entry is looked up first, then the input entry.
        label, features = self.output_data[index], self.input_data[index]
        return features, label

In [3]:
from pathlib import Path

# Raw string: the previous plain literal only worked because "\D", "\M", "\T",
# "\B", "\C" and "\d" are unrecognised escapes that Python keeps verbatim
# (while warning about them on recent versions). The value is unchanged.
# TODO: this is a machine-specific absolute path; make it a configurable DATA_DIR.
path = r"D:\Documents\Data_Science\MDS_CL\Term2\Block6\COLX_585_trends_in_computational_linguistics\Team_project\data_splits"
data_dir = Path(path)

# NOTE(security): torch.load unpickles arbitrary Python objects, so only load
# files from a trusted source.
# NOTE(review): these files presumably hold whole pickled DataLoader objects
# (later cells read `.dataset` from them), which would require KWS_dataset to be
# defined before this cell runs. Recent PyTorch releases default to
# weights_only=True and reject such objects; pass weights_only=False there.
train_loader = torch.load(data_dir / 'en_splits_10.trainloader')
dev_loader = torch.load(data_dir / 'en_splits_10.devloader')
test_loader = torch.load(data_dir / 'en_splits_10.testloader')

In [4]:
# Report the number of examples in the train, dev and test splits, in that order.
for split_loader in (train_loader, dev_loader, test_loader):
    print(len(split_loader.dataset))

8027
993
980


In [5]:
# Pull exactly one batch to inspect its structure. next(iter(...)) raises
# immediately if the loader is empty, instead of silently leaving `exm_input`
# undefined for the cells below.
first_batch = next(iter(train_loader))

print(type(first_batch))
print(len(first_batch))
# Audio features. Per the original author's note: 40 features x 100 frames per
# example, with the batch on the leading axis.
print(first_batch[0].shape)
# Keyword labels, one per example in the batch.
print(first_batch[1].shape)

# Keep the feature tensor around as an example input for the model cells below.
exm_input = first_batch[0]

<class 'list'>
2
torch.Size([1024, 40, 100])
torch.Size([1024])


In [6]:
###
import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision


class MultilingualEmbeddingModel(nn.Module):
    """Keyword classifier: EfficientNet-B0 backbone followed by a dense head.

    Head layout: adaptive average pooling to 2048 features, two 2048-unit
    ReLU layers, a 1024-unit SELU layer, then a ``num_classes``-way softmax.

    Args:
        num_classes: number of output classes (width of the final layer).

    Shapes:
        forward input:  ``(batch, H, W)`` single-channel feature maps; the
            notebook feeds ``(1024, 40, 100)``.
        forward output: ``(batch, num_classes)`` probabilities (rows sum to 1).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Load ImageNet-pretrained EfficientNet-B0. Newer torchvision exposes a
        # weights enum and deprecates `pretrained=True`; fall back to the old
        # keyword when the enum is not available.
        # NOTE(review): the backbone keeps its own classification layer, so the
        # head below consumes the backbone's final outputs rather than its
        # pooled convolutional features. Confirm this is intended.
        weights_enum = getattr(torchvision.models, "EfficientNet_B0_Weights", None)
        if weights_enum is not None:
            self.efficient_b0_model = torchvision.models.efficientnet_b0(
                weights=weights_enum.IMAGENET1K_V1
            )
        else:
            self.efficient_b0_model = torchvision.models.efficientnet_b0(pretrained=True)

        # Adaptive average pooling along the feature axis only, producing 2048
        # values per sample. The previous AdaptiveAvgPool2d((1024, 2048)) also
        # resampled the batch axis to a fixed 1024 rows, which mixed samples and
        # returned the wrong number of rows for any batch size other than 1024.
        self.global_avg_pool = nn.AdaptiveAvgPool1d(2048)

        # Two dense layers of 2048 units with ReLU activations.
        self.linear1 = nn.Linear(2048, 2048)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(2048, 2048)
        self.relu2 = nn.ReLU()

        # Penultimate 1024-unit layer with SELU activation.
        self.linear3 = nn.Linear(2048, 1024)
        self.selu = nn.SELU()

        # Output layer followed by softmax over the class axis.
        self.linear4 = nn.Linear(1024, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, num_classes)`` for ``x``."""
        # (batch, H, W) -> (batch, 1, H, W) -> (batch, 3, H, W): replicate the
        # single channel so the input matches the backbone's 3-channel stem.
        x = x.unsqueeze(1).repeat(1, 3, 1, 1)

        # Backbone output is one feature vector per sample: (batch, features).
        x = self.efficient_b0_model(x)

        # Pool each sample's vector to 2048 values without touching the batch
        # axis: (batch, features) -> (batch, 1, features) -> (batch, 2048).
        x = self.global_avg_pool(x.unsqueeze(1)).squeeze(1)

        # Dense head.
        x = self.relu1(self.linear1(x))
        x = self.relu2(self.linear2(x))
        x = self.selu(self.linear3(x))
        x = self.linear4(x)

        # NOTE(review): this returns probabilities, not logits. If training uses
        # nn.CrossEntropyLoss (which applies log-softmax itself), softmax would
        # be applied twice. Verify against the training loop.
        return self.softmax(x)


In [7]:
# Build the classifier with ten output classes
# (presumably one per keyword in the "en_splits_10" data; confirm).
model = MultilingualEmbeddingModel(num_classes=10)




In [8]:
# Smoke-test the forward pass on the example batch. Gradients are not needed
# just to read the output shape, so skip building the autograd graph.
with torch.no_grad():
    exm_output = model(exm_input)
exm_output.shape


torch.Size([1024, 10])