In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from collections import Counter
import os

In [2]:
# Survey identifier. NOTE(review): not referenced by any other cell visible here.
survey = 'ZTF'

Read the data

In [4]:
# Absolute, machine-specific location of the metadata CSV.
# NOTE(review): consider a relative or configurable data directory so the
# notebook can run on another machine.
path = '/home/iebecker/Desktop/iebecker_stuff/Work/ProcessZTF/notebooks/data/metadata_ztf.csv'
# Load the metadata table; later cells read its 'Class' column and every
# column whose name contains 'N_'.
df = pd.read_csv(path)

Define the selection thresholds: the minimum and maximum number of objects kept per class, and the minimum number of observations required in each band.

In [4]:
# Selection thresholds.
max_L = 10000  # upper bound on the objects kept per class (read by sample_classes)
min_L = 500    # classes with fewer objects than this are discarded (read by sample_classes)
min_N = 9      # every band column must be strictly greater than this value
max_N = 600    # NOTE(review): not referenced by any cell visible here

# Columns whose name contains 'N_' -- presumably the per-band observation
# counts; verify against the metadata schema.
bands = [col for col in df.columns if 'N_' in col]

# Keep only the rows where every band column exceeds min_N.
b = np.ones(df.shape[0], dtype=bool)
for band in bands:
    b = np.logical_and(b, df[band] > min_N)

# reset_index(drop=True) discards the old index outright. The previous
# reset_index().drop('index', axis=1) removed the wrong column whenever the
# table already had a column named 'index' (pandas then names the new column
# 'level_0' and the real 'index' column was the one dropped).
df = df[b].copy().reset_index(drop=True)


20% -> for testing
70% -> for training
10% -> for validation


In [5]:

def sample_classes(data, min_count=None, max_count=None, random_state=None):
    """Keep well-populated classes and cap how many objects each contributes.

    Classes are visited in order of first appearance in ``data.Class``. A
    class with fewer than ``min_count`` rows is dropped entirely; any other
    class contributes ``min(max_count, class size)`` rows drawn at random
    without replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``Class`` column.
    min_count : int, optional
        Minimum number of rows a class needs in order to be kept. Defaults to
        the notebook-level ``min_L``.
    max_count : int, optional
        Maximum number of rows kept per class. Defaults to the notebook-level
        ``max_L``.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` leaves the
        draw unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of every kept class, concatenated class by class
        with their original index labels. If no class reaches ``min_count``
        an empty frame with the same columns as ``data`` is returned.
    """
    # Fall back to the notebook-level thresholds when none are passed.
    if min_count is None:
        min_count = min_L
    if max_count is None:
        max_count = max_L

    dfs = []
    for label in data.Class.unique():
        # Objects of the class
        sel = data[data.Class == label]

        # Skip classes that do not have enough light curves
        if sel.shape[0] < min_count:
            continue

        # Cap the class at max_count and draw a random sample of that size
        num = min(max_count, sel.shape[0])
        dfs.append(sel.sample(num, replace=False, axis=0, random_state=random_state))

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not dfs:
        return data.iloc[0:0].copy()

    # Join the dataframes of each class together
    return pd.concat(dfs)

In [6]:
# Drop the under-populated classes and cap the size of the remaining ones.
# This rebinds df, and the draw is random, so re-running the cell resamples.
df = sample_classes(df)

In [7]:
# Number of objects kept per class after sampling.
Counter(df.Class)

Counter({'E': 10000,
         'RRL': 10000,
         'QSO': 10000,
         'LPV': 9313,
         'AGN': 2417,
         'Blazar': 1089,
         'YSO': 797,
         'CV/Nova': 762,
         'SNIa': 507})

Create the folds. We first hold out a stratified 20% of the data as the test set, so the train split contains 80% of the total. The train split is then divided into **7** stratified folds.

In each fold, 1/7 (≈14.3%) of the train split is used to validate and the remaining 6/7 (≈85.7%) is used to train.

From the total, the test set contains 20%, the training set contains ≈68.6% and the validation set ≈11.4%.

In [8]:
# Stratified K-fold splitter with 7 shuffled folds.
# NOTE(review): no random_state is given, so fold membership changes between runs.
kfolds = StratifiedKFold(n_splits=7, shuffle=True)


In [9]:
# Output directory: holds test.csv plus one sub-directory per fold.
path_folds = './Folds'
if not os.path.exists(path_folds):
    os.mkdir(path_folds)

# Hold out a stratified 20% test set; the remaining 80% feeds the folds.
df_temp, df_test = train_test_split(df, train_size=0.8, stratify=df.Class)
# reset_index() keeps the previous index as a regular column and gives both
# frames a fresh 0..n-1 index, which the fold indices below rely on.
df_temp = df_temp.reset_index()
df_test = df_test.reset_index()

path_test = os.path.join(path_folds, 'test.csv')
df_test.to_csv(path_test, index=False, index_label=False)

fold_splits = kfolds.split(df_temp.index.values, df_temp.Class.values)
for fold_number, (train_index, val_index) in enumerate(fold_splits, start=1):
    # One directory per fold, numbered from 1.
    path_folds_ = os.path.join(path_folds, 'Fold_' + str(fold_number))
    if not os.path.exists(path_folds_):
        os.mkdir(path_folds_)

    path_train = os.path.join(path_folds_, 'train.csv')
    path_val = os.path.join(path_folds_, 'val.csv')

    # Index labels equal row positions here because the index was reset above.
    df_train = df_temp.loc[train_index]
    df_val = df_temp.loc[val_index]

    df_train.to_csv(path_train, index=False, index_label=False)
    df_val.to_csv(path_val, index=False, index_label=False)