In [None]:
import pandas as pd

In [None]:
from dotenv import dotenv_values
from pathlib import Path

# Load key/value settings from the local `.config` dotenv file.
config = dotenv_values('.config')

# Root data directory, taken from the DATA_PATH entry of that file.
data_path = Path(config['DATA_PATH'])

# The `processed/v1` subdirectory of the data root holds the dataset files.
dataset_path = data_path / 'processed' / 'v1'

In [None]:
# Target fraction of all rows to reserve for the test set.
TEST_SPLIT = 0.2

# Seed passed to every random sampling step so results are repeatable.
RANDOM_STATE = 47

In [None]:
# Read the full landmark dataset from the processed dataset directory.
landmark_csv = dataset_path.joinpath('Landmark Dataset.csv')
landmark_ds = pd.read_csv(landmark_csv)

# Report the share of rows whose IS_Y flag is set, as a percentage.
print(f'Positive class: {landmark_ds.IS_Y.sum() / len(landmark_ds.index) * 100:.2f}%')

In [None]:
# Total number of rows available for splitting.
num_of_rows = landmark_ds.shape[0]

# Test size: TEST_SPLIT of the rows, rounded to an integer and then
# lowered to the nearest even number (always even).
num_of_test = round(num_of_rows * TEST_SPLIT) // 2 * 2

# Every row not reserved for testing goes to training.
num_of_train = num_of_rows - num_of_test

# Display the resulting sizes as (train, test, total).
num_of_train, num_of_test, num_of_rows

In [None]:
# Shuffle all rows and renumber them 0..n-1.
# NOTE: `landmark_ds` is rebound to its own shuffled copy, so running this
# cell a second time shuffles the already-shuffled frame again.
landmark_ds = (
    landmark_ds
    .sample(frac=1., random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Separate the two classes once; IS_Y is used directly as a boolean mask.
positives = landmark_ds.loc[landmark_ds.IS_Y]
negatives = landmark_ds.loc[~landmark_ds.IS_Y]

# Draw a class-balanced test set: half of it from each class.
half_test = num_of_test // 2
test_pos = positives.sample(half_test, random_state=RANDOM_STATE)
test_neg = negatives.sample(half_test, random_state=RANDOM_STATE)
print(test_pos.index.sort_values())

# Whatever was not drawn for testing stays in the training partitions.
train_pos = positives.loc[~positives.index.isin(test_pos.index)]
train_neg = negatives.loc[~negatives.index.isin(test_neg.index)]

In [None]:
# Rows drawn for testing must not reappear in the training partitions.
assert set(train_pos.index).isdisjoint(test_pos.index)
assert set(train_neg.index).isdisjoint(test_neg.index)

# The four partitions together must account for every row of the dataset.
assert sum(len(part) for part in (train_pos, train_neg, test_pos, test_neg)) == num_of_rows

In [None]:
# Stack both classes of the training split, then shuffle so the classes
# are interleaved, and renumber the rows.
train_df = pd.concat([train_pos, train_neg])
train_df = train_df.sample(frac=1., random_state=RANDOM_STATE).reset_index(drop=True)

# Same treatment for the test split.
test_df = pd.concat([test_pos, test_neg])
test_df = test_df.sample(frac=1., random_state=RANDOM_STATE).reset_index(drop=True)

# Report the positive-class share of each split as a percentage.
print(f'Positive train: {train_df.IS_Y.sum() / len(train_df.index) * 100:.2f}%')
print(f'Positive test: {test_df.IS_Y.sum() / len(test_df.index) * 100:.2f}%')

In [None]:
# The final frames must have exactly the sizes planned for each split.
assert train_df.shape[0] == num_of_train
assert test_df.shape[0] == num_of_test

In [None]:
# Write both splits into the same directory the source dataset was read from.
# The row index was reset to a plain positional index when the splits were
# built, so it carries no information and is left out of the files.
# `index=False` is the documented boolean form; `index=None` only behaved
# the same because None is falsy.
train_df.to_csv(dataset_path / 'landmark_train.csv', index=False)
test_df.to_csv(dataset_path / 'landmark_test.csv', index=False)