#  "UJ SN2019 Zadanie 2: Nocne Ptasie Wędrówki"

# MLP for imbalanced dataset

## Train and test data preparation

In [1]:
import pathlib
import numpy as np

def load_data(directory, dataset_name):
    p = pathlib.Path(directory)
    if not p.is_dir():
        raise ValueError('Directory: {directory} does not exist. Please, run firstly imbalanced_data.ipynb for creating data')
    return np.load(pathlib.Path(directory + dataset_name + '.npy'))

data_dir = '../../data/imbalanced/splitted/'
X_train = load_data(data_dir, 'X_train')
y_train = load_data(data_dir, 'y_train')
X_validation = load_data(data_dir, 'X_validation')
y_validation = load_data(data_dir, 'y_validation')

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from typing import Sequence

def get_train_and_validation_datasets(train_X, train_y, validation_X, validation_y) -> Sequence[torch.utils.data.TensorDataset]:
    X_train: torch.Tensor = torch.from_numpy(train_X).float()
    X_validation: torch.Tensor = torch.from_numpy(validation_X).float()
        
    y_train: torch.Tensor = torch.from_numpy(train_y.flatten()).long()
    y_validation: torch.Tensor = torch.from_numpy(validation_y.flatten()).long()

    train_dataset = TensorDataset(X_train, y_train)
    validation_dataset = TensorDataset(X_validation, y_validation)
    
    return train_dataset, validation_dataset


train_dataset, validation_dataset = get_train_and_validation_datasets(
    X_train, y_train, X_validation, y_validation)

In [3]:
BATCH_SIZE = 64
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

## How MLP model works on imbalanced data for this problem?

In [4]:
from torch import nn

class MLP(torch.nn.Module):
    
    def __init__(self):
        super(MLP, self).__init__()
        self.linear1 = torch.nn.Linear(5400, 2500)
        self.relu1 = nn.ReLU()
        self.linear2 = torch.nn.Linear(2500, 500)
        self.relu2 = nn.ReLU()
        self.linear3 = torch.nn.Linear(500, 2)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        out = torch.flatten(x, start_dim = 1)
        out = self.relu1(self.linear1(out))
        out = self.relu2(self.linear2(out))
        out = self.relu3(self.linear3(out))
        return out
    
model = MLP()

In [5]:
import torch.optim as optim

optimizer: torch.optim.Optimizer = optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()
epoch: int = 10
    
for e in range(epoch):
    print(f"EPOCH: {e}")
    
    correct_train: int = 0 
    loss_train : int = 0
    for i, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()
        output: torch.Tensor = model(x)
        loss: torch.Tensor = criterion(output, y)
        loss.backward()
        optimizer.step()
        correct_train += float(sum(output.argmax(dim=1) == y))
        loss_train += loss.item()

    print(f"Train accuracy: {correct_train / len(train_dataset)}")
    print(f"Loss: {loss_train / len(train_dataset)}")
        
    with torch.no_grad():
        correct_validation = float(sum(output.argmax(dim=1) == y))
        print(f"Validation accuracy: {correct_validation / len(validation_dataset)}\n")

EPOCH: 0
Train accuracy: 0.8717847249703206
Loss: 0.006891662989815658
Validation accuracy: 0.02585410895660203

EPOCH: 1
Train accuracy: 0.8717847249703206
Loss: 0.005936732164097247
Validation accuracy: 0.02677746999076639

EPOCH: 2
Train accuracy: 0.8717847249703206
Loss: 0.006059842957401162
Validation accuracy: 0.024007386888273315

EPOCH: 3
Train accuracy: 0.8717847249703206
Loss: 0.005919334807790311
Validation accuracy: 0.02585410895660203

EPOCH: 4
Train accuracy: 0.8717847249703206
Loss: 0.005926586159803106
Validation accuracy: 0.024930747922437674

EPOCH: 5
Train accuracy: 0.8717847249703206
Loss: 0.005874053251144204
Validation accuracy: 0.027700831024930747

EPOCH: 6
Train accuracy: 0.8721804511278195
Loss: 0.00592719816883232
Validation accuracy: 0.02585410895660203

EPOCH: 7
Train accuracy: 0.8721804511278195
Loss: 0.005921236619347398
Validation accuracy: 0.024007386888273315

EPOCH: 8
Train accuracy: 0.8725761772853186
Loss: 0.005887972232990695
Validation accuracy: 0

### Result:
As expected, accuracy on validation dataset is really low(!). <br>
Accuracy on training dataset is similar to percents of data representing "bird does not exist" values. <br>
Model hasn't learned how to recognize rarely occuring value.