#### Load dataframe

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import random
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder

In [3]:
RANDOM_SEED = 42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x114616970>

In [4]:
calls_df = pd.read_csv('features_and_spectrograms.csv')
calls_df['log_padded_spectrogram'] = calls_df['log_padded_spectrogram'].apply(lambda x: np.array(json.loads(x)))

In [5]:
calls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7300 entries, 0 to 7299
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   track_ID                7300 non-null   object 
 1   clip_ID                 7300 non-null   object 
 2   goose_ID                7300 non-null   object 
 3   call_type               7300 non-null   object 
 4   waveform                7300 non-null   object 
 5   sr                      7300 non-null   int64  
 6   filepath                7300 non-null   object 
 7   lfccs                   7300 non-null   object 
 8   peak                    7300 non-null   float64
 9   duration                7300 non-null   float64
 10  normalized_log_length   7300 non-null   float64
 11  log_target_duration     7300 non-null   float64
 12  log_padded_spectrogram  7300 non-null   object 
 13  log_padded_lfccs        7300 non-null   object 
 14  f0mean                  6484 non-null   

In [6]:
print(len(calls_df[calls_df.isna().any(axis=1)]))
calls_df = calls_df.dropna()
calls_df = calls_df[["track_ID", "clip_ID", "goose_ID", "call_type", "log_padded_spectrogram"]]
calls_df.info()

816
<class 'pandas.core.frame.DataFrame'>
Index: 6484 entries, 0 to 7299
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   track_ID                6484 non-null   object
 1   clip_ID                 6484 non-null   object
 2   goose_ID                6484 non-null   object
 3   call_type               6484 non-null   object
 4   log_padded_spectrogram  6484 non-null   object
dtypes: object(5)
memory usage: 303.9+ KB


In [7]:
le = LabelEncoder()
le.fit(calls_df["call_type"])
calls_df["encoded_call_type"] = le.transform(calls_df["call_type"])

### Variational Autoencoder

In [15]:
learning_rate = 1e-3
batch_size = 256 # very unbalanced dataset so choosing a bigger batch size - TODO balance
epochs = 60
retrain = True

In [9]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


#### Create data loader

In [10]:
calls_df.iloc[0]["log_padded_spectrogram"].shape

(44, 170)

In [11]:
class SoundDS(Dataset):
    # Custom data loader 
    def __init__(self, calls_df):
        self.df = calls_df
                
    # of items in dataset
    def __len__(self):
        return len(self.df)    
    
    # Get item through index
    def __getitem__(self, idx):
        spectrogram = torch.from_numpy(self.df.iloc[idx]["log_padded_spectrogram"]).to(torch.float32).to(device)
        spectrogram = spectrogram[np.newaxis, ...]
        return spectrogram, self.df.iloc[idx]["encoded_call_type"]

In [12]:
ds = SoundDS(calls_df)

# Random split for train:val - 80:20
num_items = len(ds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(ds, [num_train, num_val])

# Create training and validation data loaders
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [13]:
train_features, train_labels = next(iter(train_dl))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape, type: {train_labels.size()}")

input_size=train_features.size()[1:]
print(input_size[1], input_size[2])

Feature batch shape: torch.Size([256, 1, 44, 170])
Labels batch shape, type: torch.Size([256])
44 170


#### Define architecture

In [34]:
class ConvolutionalAutoencoder(torch.nn.Module):
    def __init__(self):
        super().__init__()

        self.encoder = torch.nn.Sequential(
            # input shape (n, 1, 44, 170)
            torch.nn.Conv2d(1, 8, (4,4), (2,2), (1,2)), # output shape (n, 8, 22, 86)
            torch.nn.BatchNorm2d(8),
            torch.nn.LeakyReLU(),
            torch.nn.Conv2d(8, 4, (4,4), (2,2), (1,1)), # output shape (n, 4, 11, 43)
            torch.nn.BatchNorm2d(4),
            torch.nn.LeakyReLU(),
            torch.nn.Flatten(), # output shape (n, 1280)
            torch.nn.Linear(1892, 512),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(512, 128),
        )

        # latent mean and variance 
        self.mean_layer = torch.nn.Linear(128, 2)
        self.logvar_layer = torch.nn.Linear(128, 2)
        
        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(128, 512),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(512, 1892),
            torch.nn.Unflatten(1, (4, 11, 43)),
            torch.nn.LeakyReLU(),
            torch.nn.BatchNorm2d(4),
            torch.nn.ConvTranspose2d(4, 8, (4,4), (2,2), (1,1)),
            torch.nn.LeakyReLU(),
            torch.nn.BatchNorm2d(8),
            torch.nn.ConvTranspose2d(8, 1, (4,4), (2,2), (1,2)),
            torch.nn.Sigmoid()
        )

    def encode(self, x):
        x = self.encoder(x)
        mean, logvar = self.mean_layer(x), self.logvar_layer(x)
        return mean, logvar

    def decode(self, x):
        return self.decoder(x)
    
    def reparameterization(self, mean, var):
        epsilon = torch.randn_like(var).to(device)      
        z = mean + var*epsilon
        return z

    def forward(self, x):
        mean, log_var = self.encode(x)
        z = self.reparameterization(mean, log_var)
        x_hat = self.decode(z)
        return x_hat, mean, log_var

In [35]:
model = ConvolutionalAutoencoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.L1Loss()
print(model)

ConvolutionalAutoencoder(
  (encoder): Sequential(
    (0): Conv2d(1, 8, kernel_size=(4, 4), stride=(2, 2), padding=(1, 2))
    (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Conv2d(8, 4, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    (4): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=1892, out_features=512, bias=True)
    (8): LeakyReLU(negative_slope=0.01)
    (9): Linear(in_features=512, out_features=128, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=512, out_features=1892, bias=True)
    (3): Unflatten(dim=1, unflattened_size=(4, 11, 43))
    (4): LeakyReLU(negative_slope=0.01)
    (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, a

In [17]:
class EarlyStopper:
    # written by https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [36]:
train_losses = []
val_losses = []

early_stopper = EarlyStopper(patience=5, min_delta=1e-3)

if retrain:
    for epoch in range(epochs):
        total_train_loss = 0.0
        total_val_loss = 0.0
        #-------- train --------
        model.train()
        for batch_idx, (X, y) in enumerate(train_dl):
            # Forward pass
            encoded, decoded = model(X)
            # Compute the loss and perform backpropagation
            loss = loss_fn(decoded, X)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update the running loss
            total_train_loss += loss.item() * X.size(0)
            if early_stopper.early_stop(total_train_loss):             
                break

        # Calculate epoch loss
        epoch_train_loss = total_train_loss / len(train_dl.dataset)
        train_losses.append(epoch_train_loss)


        #------- validate -------
        model.eval()
        for batch_idx, (X, y) in enumerate(val_dl):
            # Forward pass
            encoded, decoded = model(X)

            # Compute the loss and perform backpropagation
            val_loss = loss_fn(decoded, X)

            # Update the running loss
            total_val_loss += val_loss.item() * X.size(0)

        # Calculate epoch loss
        epoch_val_loss = total_val_loss / len(val_dl.dataset)
        val_losses.append(epoch_val_loss)

        print(
            "Epoch {}/{}:\n training loss={:.4f}".format(epoch + 1, epochs, epoch_train_loss),
            "\n validation loss={:.4f}".format(epoch_val_loss)
        )

Epoch 1/60:
 training loss=0.1291 
 validation loss=0.4395
Epoch 2/60:
 training loss=0.1252 
 validation loss=0.4384


KeyboardInterrupt: 