#  "UJ SN2019 Zadanie 2: Nocne Ptasie Wędrówki"

## Balancing data

#### Load the previously prepared training data and labels

In [1]:
import pathlib
import numpy as np
np.random.seed(0) 

def load_data(directory, dataset_name):
    """
    Load a NumPy array stored as ``<directory>/<dataset_name>.npy``.

    :param directory: str
        directory containing the saved arrays (a trailing slash is optional)
    :param dataset_name: str
        file name without the ``.npy`` extension
    :return: the object returned by ``np.load`` for that file
    :raises ValueError: if ``directory`` is not an existing directory
    """
    p = pathlib.Path(directory)
    if not p.is_dir():
        # f-string so the offending path is actually interpolated into the message.
        raise ValueError(f'Directory: {directory} does not exist. Please, run firstly imbalanced_data.ipynb for creating data')
    # Join with pathlib so the result does not depend on `directory` ending with a slash.
    return np.load(p / f'{dataset_name}.npy')


# Location of the arrays that imbalanced_data.ipynb is expected to have written.
data_dir = 'data/imbalanced/'
X_train = load_data(directory=data_dir, dataset_name='X_train')
y_train = load_data(directory=data_dir, dataset_name='y_train')

In [2]:
def check_amount_of_data(data):
    """
    Print how many labels equal 1 ('bird exists') and how many equal 0.

    :param data: array-like
        labels of any shape; every element is compared against 1 and 0
    """
    labels = np.asarray(data)
    # np.count_nonzero counts in vectorized code; the built-in sum() would
    # iterate over the boolean array element by element in Python.
    print(f"Number of samples for 'bird exists': {np.count_nonzero(labels == 1)}")
    print(f"Number of samples for 'bird does not exists': {np.count_nonzero(labels == 0)}")

#### Compare the number of samples for 'bird exists' and 'bird does not exist'

In [3]:
# Class counts of the labels loaded from data/imbalanced/.
check_amount_of_data(y_train)

Number of samples for 'bird exists': 464
Number of samples for 'bird does not exists': 3146


### Perform random over-sampling to even out the class distribution

In [4]:
# Remember the original shapes before the arrays are flattened for over-sampling.
# For X, drop the leading (sample) axis and keep the per-sample shape.
initial_x_train_shape = X_train.shape[1:]
# NOTE(review): initial_y_train_shape is not read again in the visible cells.
initial_y_train_shape = y_train.shape

In [5]:
from imblearn.over_sampling import RandomOverSampler

# Over-sampler with a fixed seed so the resampling is repeatable.
random_over_sampler = RandomOverSampler(random_state=0)
# fit_resample is given one flat feature row per sample and a flat label vector.
flat_features = X_train.reshape(len(X_train), -1)
flat_labels = y_train.reshape(-1)
X_resampled, y_resampled = random_over_sampler.fit_resample(flat_features, flat_labels)

In [6]:
# Undo the flattening: restore the per-sample feature shape recorded earlier
# and turn the labels back into a single column.
X_resampled = np.reshape(X_resampled, (-1, *initial_x_train_shape))
y_resampled = np.reshape(y_resampled, (-1, 1))

#### Check the number of samples for 'bird exists' and 'bird does not exist' after over-sampling

In [7]:
# Class counts after random over-sampling.
check_amount_of_data(y_resampled)

Number of samples for 'bird exists': 3146
Number of samples for 'bird does not exists': 3146


In [8]:
import pathlib

def create_not_existing_directory(directory: str):
    """
    Make sure that ``directory`` exists.

    An already existing directory is left untouched. Otherwise a notice is
    printed and the directory is created together with any missing parents.
    :param directory: str
        path of the directory to create
    """
    target = pathlib.Path(directory)
    if target.is_dir():
        # Nothing to do: the directory is already there.
        return
    print(f'Creating directory: {directory} as it does not exist')
    target.mkdir(parents=True, exist_ok=True)

def save_datasets(data_file_mapping, directory='data/'):
    """
    Save every array of ``data_file_mapping`` inside ``directory``.

    :param data_file_mapping: dict
        maps a file name (without extension) to the array to store
    :param directory: str
        output directory, created when missing (a trailing slash is optional)
    """
    create_not_existing_directory(directory)
    target_dir = pathlib.Path(directory)
    for filename, data in data_file_mapping.items():
        # Join via pathlib: plain string concatenation wrote to
        # '<directory><filename>' whenever `directory` had no trailing slash.
        # np.save adds the '.npy' extension when the name lacks it.
        np.save(target_dir / filename, data)

In [9]:
# Persist the over-sampled arrays; each key is used as the file name.
train_test_dataset_filename_mapping = dict(
    X_resampled=X_resampled,
    y_resampled=y_resampled,
)

save_datasets(train_test_dataset_filename_mapping, directory='data/balanced/')

Creating directory: data/balanced/ as it does not exist


In [10]:
from sklearn.model_selection import StratifiedKFold

stratified_k_fold = StratifiedKFold(n_splits=5)

# Only one train/validation split is persisted. The previous loop rebuilt the
# four arrays for every fold and overwrote them each time, so just the last
# fold survived; pick that fold's indices directly and slice the data once.
*_, (train_index, validation_index) = stratified_k_fold.split(X_resampled, y_resampled)

# NOTE: X_train / y_train are rebound here. From this point on they hold the
# training part of the balanced split, not the arrays loaded from data/imbalanced/.
# NOTE(review): the split happens after over-sampling, so copies of one original
# sample may end up in both parts -- consider splitting before over-sampling.
X_train, X_validation = X_resampled[train_index], X_resampled[validation_index]
y_train, y_validation = y_resampled[train_index], y_resampled[validation_index]

dataset_filename_mapping = {
    'X_train': X_train,
    'y_train': y_train,
    'X_validation': X_validation,
    'y_validation': y_validation,
}

#### Check the number of samples for 'bird exists' and 'bird does not exist' in the train and validation datasets

- Train dataset

In [11]:
# y_train now refers to the training part of the stratified split made above,
# not to the labels loaded at the top of the notebook.
check_amount_of_data(y_train)

Number of samples for 'bird exists': 2517
Number of samples for 'bird does not exists': 2517


- Validation dataset

In [12]:
# Class counts of the validation part of the same split.
check_amount_of_data(y_validation)

Number of samples for 'bird exists': 629
Number of samples for 'bird does not exists': 629


In [13]:
# Persist the split arrays collected in dataset_filename_mapping under data/balanced/splitted/.
save_datasets(dataset_filename_mapping, directory='data/balanced/splitted/')

Creating directory: data/balanced/splitted/ as it does not exist
