In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from dotenv import dotenv_values

In [2]:
# Settings are read from the local `.config` dotenv file; its DATA_PATH entry
# is the root directory under which the processed v2 dataset is located.
config = dotenv_values('.config')
data_path = Path(config['DATA_PATH'])
dataset_path = data_path.joinpath('processed/v2')

In [3]:
# Notebook-wide tunables.
TEST_SPLIT = 0.2     # presumably the fraction of rows held out for testing
RANDOM_STATE = 47    # presumably the seed for reproducible shuffling

In [4]:
# Load the v2 landmark table, then report the sum of the IS_Y column as a
# percentage of the row count (the positive-class rate if IS_Y is a 0/1 flag).
landmark_ds = pd.read_csv(dataset_path / 'landmark_ds_v2.csv')
positive_pct = landmark_ds.IS_Y.sum() / len(landmark_ds.index) * 100
print(f'Positive class: {positive_pct:.2f}%')

Positive class: 9.83%


In [5]:
def get_dataset_partitions_pd(df, train_split=0.8, val_split=0.1, test_split=0.1, seed=None):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition. It is not modified.
    train_split, val_split, test_split : float
        Fraction of the rows assigned to each partition. The three values
        must sum to 1 (checked with a floating-point tolerance).
    seed : int or None
        Forwarded to ``DataFrame.sample`` as ``random_state``. Pass a fixed
        value to get the same shuffle on every run.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_ds, val_ds, test_ds)``: consecutive slices of the shuffled
        frame, keeping the original index labels.

    Raises
    ------
    AssertionError
        If the three fractions do not sum to 1.
    """
    # Tolerant comparison: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0
    # in binary floating point, so `== 1` would reject valid input.
    assert np.isclose(train_split + test_split + val_split, 1.0), \
        'train_split, val_split and test_split must sum to 1'

    n_rows = len(df)

    def _cut(fraction):
        # Truncate to a row position. The tiny offset keeps a product that is
        # mathematically a whole number from landing just below it and losing
        # a row to float rounding.
        return int(fraction * n_rows + 1e-9)

    train_end = _cut(train_split)
    # The validation partition ends where train + validation ends. Using the
    # train fraction here as well would make the validation slice empty.
    val_end = _cut(train_split + val_split)

    # Specify seed to always have the same split distribution between runs
    df_sample = df.sample(frac=1, random_state=seed)

    # Positional slicing instead of np.split, which is meant for ndarrays.
    train_ds = df_sample.iloc[:train_end]
    val_ds = df_sample.iloc[train_end:val_end]
    test_ds = df_sample.iloc[val_end:]

    return train_ds, val_ds, test_ds

In [6]:
# Hold out TEST_SPLIT of the rows for testing; no validation partition is
# requested, so the middle return value is expected to be empty.
# NOTE(review): seed=102 differs from RANDOM_STATE in the config cell. It is
# kept so the existing split is reproduced; confirm which seed is intended.
train_ds, _val_ds, test_ds = get_dataset_partitions_pd(
    landmark_ds, train_split=1 - TEST_SPLIT, val_split=0, test_split=TEST_SPLIT, seed=102)

# Every row must land in exactly one of the two partitions that get saved.
assert len(train_ds) + len(test_ds) == len(landmark_ds)


In [7]:
# Persist both partitions next to the source dataset. index=False is the
# documented way to leave the (shuffled) row index out of the CSV files.
train_ds.to_csv(dataset_path/'landmark_ds_train.csv', index=False)
test_ds.to_csv(dataset_path/'landmark_ds_test.csv', index=False)