#  "UJ SN2019 Zadanie 2: Nocne Ptasie Wędrówki"

# ConvNet for balanced dataset

## Train and test data preparation

In [1]:
import numpy as np
SEED = 1024
np.random.seed(SEED)
import pathlib

def load_data(directory, dataset_name):
    p = pathlib.Path(directory)
    if not p.is_dir():
        raise ValueError('Directory: {directory} does not exist. Please, run firstly imbalanced_data.ipynb for creating data')
    return np.load(pathlib.Path(directory + dataset_name + '.npy'))


def load_train_and_validation_data(data_dir):
    X_train = load_data(data_dir, 'X_train')
    y_train = load_data(data_dir, 'y_train')
    X_validation = load_data(data_dir, 'X_validation')
    y_validation = load_data(data_dir, 'y_validation')
    return X_train, y_train, X_validation, y_validation


data_directory = '../../data/balanced/splitted/'

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from typing import Sequence
torch.manual_seed(SEED) 

BATCH_SIZE = 64
    
def get_train_and_validation_dataloaders(train_X, train_y, validation_X, validation_y, batch_size=64) -> Sequence[torch.utils.data.TensorDataset]:
    X_train: torch.Tensor = torch.from_numpy(train_X).float()
    X_validation: torch.Tensor = torch.from_numpy(validation_X).float()
        
    y_train: torch.Tensor = torch.from_numpy(train_y.flatten()).float()
    y_validation: torch.Tensor = torch.from_numpy(validation_y.flatten()).float()

    train_dataset = TensorDataset(X_train, y_train)
    validation_dataset = TensorDataset(X_validation, y_validation)
    
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
  
    return train_dataloader, validation_dataloader


In [3]:
train_X, train_y, validation_X, validation_y = load_train_and_validation_data(data_directory)
train_dataloader, validation_dataloader = get_train_and_validation_dataloaders(train_X, train_y, validation_X, validation_y)

In [4]:
class BirdDetector(torch.nn.Module):
    
    def __init__(self):
        super(BirdDetector, self).__init__()
        self.pool = torch.nn.MaxPool2d(2)
        
        self.conv1 = torch.nn.Conv2d(in_channels=5, out_channels=5, kernel_size=3, padding=1)
        self.bn1 = torch.nn.BatchNorm2d(5)
        
        self.conv2 = torch.nn.Conv2d(in_channels=5, out_channels=3, kernel_size=3, padding=1)
        self.bn2 = torch.nn.BatchNorm2d(3)
        
        self.conv3 = torch.nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, padding=1)
        self.bn3 = torch.nn.BatchNorm2d(3)
        
        self.fc1 = torch.nn.Linear(42, 20)
        self.fc2 = torch.nn.Linear(20, 1)
        
        self.sigmoid = torch.nn.Sigmoid()
            
    def forward(self, x):
        out = self.pool(self.bn1(torch.relu(self.conv1(x))))
        out = self.pool(self.bn2(torch.relu(self.conv2(out))))
        out = self.pool(self.bn3(torch.relu(self.conv3(out))))
        
        out = out.view(out.size(0), -1)
        
        out = self.fc1(out)
        out = self.fc2(out)
        
        return self.sigmoid(out)
    
model = BirdDetector()

In [5]:
saved_models_directory = '../../saved_model/'

p = pathlib.Path(saved_models_directory)
if not p.is_dir():
    print(f'Creating directory: {saved_models_directory} as it does not exist')
    p.mkdir(parents=True, exist_ok=True)

In [6]:
import torch.optim as optim
from sklearn.metrics import roc_auc_score

optimizer: torch.optim.Optimizer = optim.Adam(model.parameters())
criterion = torch.nn.BCELoss()
epoch: int = 200

best_validation_roc_auc = 0
epochs_without_improvement = 0
MAX_POSSIBLE_EPOCHS_WITHOUT_IMPROVEMENT = 50

for e in range(epoch):
    print(f"EPOCH: {e}")
    
    correct_train: int = 0 
    loss_train : int = 0
    for i, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()
        output: torch.Tensor = model(x)
        loss: torch.Tensor = criterion(output.flatten(), y)
        loss.backward()
        optimizer.step()
        correct_in_batch = 0
        out_copy = output.clone().detach()
        for out, label in zip(output,y):
            if out.item() > 0.5 and label==1:
                correct_in_batch += 1
            if out.item() < 0.5 and label==0:
                correct_in_batch += 1
        correct_train += correct_in_batch
        loss_train += loss.item()

    print(f"Train accuracy: {correct_train / len(train_X)}")
    print(f"Loss: {loss_train / len(train_X)}")
    
    with torch.no_grad():
        preds = []
        for i, (x, y) in enumerate(validation_dataloader):
            output: torch.Tensor = model(x)
            preds += output.tolist()
        score = roc_auc_score(validation_y, preds)
        
        if score > best_validation_roc_auc:
            best_validation_roc_auc = score
            torch.save(model.state_dict(), '../../saved_model/model.pt')
            epochs_without_improvements = 0
        else:
            epochs_without_improvement += 1
            
        if epochs_without_improvement == MAX_POSSIBLE_EPOCHS_WITHOUT_IMPROVEMENT:
            break
            
        print(f"Validation ROC_AUC score: {score}\n")

EPOCH: 0
Train accuracy: 0.533373063170441
Loss: 0.010636616402028523
Validation ROC_AUC score: 0.46584403537550456

EPOCH: 1
Train accuracy: 0.5484704012713548
Loss: 0.010373017003624318
Validation ROC_AUC score: 0.5150287760874126

EPOCH: 2
Train accuracy: 0.5580055621771951
Loss: 0.010151262931232656
Validation ROC_AUC score: 0.4649859342181422

EPOCH: 3
Train accuracy: 0.5756853396901073
Loss: 0.010251434492506014
Validation ROC_AUC score: 0.5273568715072503

EPOCH: 4
Train accuracy: 0.5923718712753278
Loss: 0.010172400000458158
Validation ROC_AUC score: 0.5232496126538958

EPOCH: 5
Train accuracy: 0.5993245927691696
Loss: 0.009889654912880408
Validation ROC_AUC score: 0.32865021572587266

EPOCH: 6
Train accuracy: 0.6128327373857767
Loss: 0.009833418238385594
Validation ROC_AUC score: 0.5496725566864911

EPOCH: 7
Train accuracy: 0.6380611839491458
Loss: 0.009563484360502028
Validation ROC_AUC score: 0.5763912233565278

EPOCH: 8
Train accuracy: 0.6537544696066746
Loss: 0.00940289911

In [7]:
print(f"The highest validation roc auc: {best_validation_roc_auc}")

The highest validation roc auc: 0.5763912233565278


In [8]:
test_data_directory = '../../data/imbalanced/'

test_X = load_data(test_data_directory, 'X_test')
X_test = torch.from_numpy(test_X).float()
test_dataset = TensorDataset(X_test)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = BirdDetector()
model.load_state_dict(torch.load('../../saved_model/model.pt'))

test_predictions = []
with torch.no_grad():
    for x in test_dataloader:
        out = model(x[0])
        test_predictions.append(out.squeeze())

In [9]:
import pandas as pd

def get_indices_from_sample_submission():
    sample_submission_csv = pd.read_csv('../../submission/sampleSubmission.csv')
    return sample_submission_csv['sample_id'].tolist()

def create_dataframe_with_predictions(predictions):
    indices_from_sample_submission = get_indices_from_sample_submission()
    submission_df = pd.DataFrame(columns=['sample_id', 'prediction'])
    for i, element in enumerate(predictions):
        submission_df.loc[i] = [str(int(i/10)+1) + '/' +str(i%10)] + [element]
    return submission_df[submission_df['sample_id'].isin(indices_from_sample_submission)]


In [10]:
my_submission_df = create_dataframe_with_predictions(torch.cat(test_predictions).numpy())
my_submission_df.to_csv('../../submission/mySubmission.csv', index=False)