# Splitting

## Setup

In [1]:
from glob import glob
from pathlib import Path

import pandas as pd
import numpy as np

import warnings

In [2]:
# Single source of truth for reproducibility: the same value seeds NumPy's
# global RNG here and is passed as `random_state` when the data is shuffled,
# so changing it in one place keeps every random step in sync.
seed = 789
np.random.seed(seed)

In [3]:
# Output locations: one sub-directory of the current working directory per
# split. Creation uses exist_ok=True, so re-running this cell is harmless.
_cwd = Path.cwd()
train_dir = _cwd / 'train'
val_dir = _cwd / 'val'
test_dir = _cwd / 'test'

for _split_dir in (train_dir, val_dir, test_dir):
    _split_dir.mkdir(exist_ok=True)

## Load and Split Data

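Shuffles each CSV file in `./raw` and splits its rows 60/20/20 into train/validation/test sets, written to the `train/`, `val/`, and `test/` directories.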

In [7]:
# For every CSV in ./raw: shuffle its rows reproducibly, cut them 60/20/20
# into train/val/test, and write each part to its own output directory.
# sorted() makes the processing order deterministic (glob order is not).
for filename in sorted(glob('./raw/*.csv')):
    pth = Path(filename)

    # Read and shuffle: sample(frac=1) returns every row in random order,
    # and random_state=seed makes that order repeatable across runs.
    df = pd.read_csv(pth).sample(frac=1, random_state=seed)

    # Split by position with plain .iloc slicing. This replaces
    # np.split(df, ...), which goes through the deprecated
    # DataFrame.swapaxes and only worked with its warning silenced.
    # The cut points are computed exactly as before, so the rows assigned
    # to each split are unchanged.
    train_end = int(.6 * len(df))
    val_end = int(.8 * len(df))
    train = df.iloc[:train_end]
    val = df.iloc[train_end:val_end]
    test = df.iloc[val_end:]

    # Export as <stem>_<split>.csv; index=False drops the shuffled row
    # index so it is not written as an extra column.
    name = pth.stem
    train.to_csv(train_dir / f'{name}_train.csv', index=False)
    val.to_csv(val_dir / f'{name}_val.csv', index=False)
    test.to_csv(test_dir / f'{name}_test.csv', index=False)
    