# Split data into training, validation, and test sets

Here we will split the data into training, validation, and test sets in preparation for fitting models in the next sections.

## Read in data

Read in our cleaned data.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the cleaned data set, relative to this notebook.
cleaned_csv_path = '../data/SpecPhotoAllKeplerEtAl2015/dr10_cleaned.csv'

# Load the CSV into a DataFrame and preview its first rows as the cell output.
cleaned_data = pd.read_csv(cleaned_csv_path)
cleaned_data.head()



Unnamed: 0,is_wd,g,u-g,g-r,r-i,i-z,extinction_r
0,False,17.84416,-0.40949,-0.40399,-0.30941,-0.31604,0.260842
1,False,18.97837,-0.03504,-0.35938,-0.23878,-0.32267,0.37324
2,False,19.10553,-0.26039,-0.34127,-0.28533,-0.28499,0.289759
3,False,17.12208,0.04238,0.20833,0.45028,-0.02654,0.358432
4,False,17.69995,-0.34368,-0.4128,-0.26424,-0.27372,0.349579


## Split into training, validation, and test sets

Split our data into 60% training data, 20% validation data and 20% testing data.

In [2]:
# Separate the target column from the predictor columns.
labels = cleaned_data['is_wd']
# NOTE(review): the data preview lists an 'Unnamed: 0' column, which looks like a
# row index carried over from the CSV rather than a measurement -- confirm. It is
# dropped when present so that it cannot leak into the model inputs; when the
# column does not exist, errors='ignore' makes this a no-op.
features = cleaned_data.drop(columns=['is_wd', 'Unnamed: 0'], errors='ignore')

# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42

# Two-stage split: hold out 40% of the rows, then halve that hold-out into
# validation and test, giving 60% / 20% / 20% overall.
# Stratifying on the label keeps the proportion of is_wd=True rows the same in
# every subset instead of leaving it to chance.
x_train, x_test_and_val, y_train, y_test_and_val = train_test_split(
    features,
    labels,
    test_size=0.4,
    random_state=SPLIT_SEED,
    stratify=labels,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test_and_val,
    y_test_and_val,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=y_test_and_val,
)

In [3]:
# Sanity check: print each subset's share of the full data set, rounded to two
# decimals, in the order train, validation, test.
total_rows = len(labels)
for subset in (y_train, y_val, y_test):
    fraction = len(subset) / total_rows
    print(round(fraction, 2))

0.6
0.2
0.2


## Store split data

In [4]:
# Each subset paired with the CSV file it is written to: features first, then
# labels, each in train / validation / test order.
split_outputs = [
    (x_train, '../data/SpecPhotoAllKeplerEtAl2015/dr10_train_features.csv'),
    (x_val, '../data/SpecPhotoAllKeplerEtAl2015/dr10_val_features.csv'),
    (x_test, '../data/SpecPhotoAllKeplerEtAl2015/dr10_test_features.csv'),
    (y_train, '../data/SpecPhotoAllKeplerEtAl2015/dr10_train_labels.csv'),
    (y_val, '../data/SpecPhotoAllKeplerEtAl2015/dr10_val_labels.csv'),
    (y_test, '../data/SpecPhotoAllKeplerEtAl2015/dr10_test_labels.csv'),
]

# Write every subset without its row index.
for subset, csv_path in split_outputs:
    subset.to_csv(csv_path, index=False)