In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
from collections import Counter
import os

In [2]:
# Survey label for this notebook (not referenced again in the cells shown here).
survey = "Gaia"

Read the data

In [11]:
# Catalogue file to load into the working frame.
path = '/home/iebecker/Desktop/iebecker_stuff/Disks/Data/Paper_2/Prepare_dataset/Gaia/V5/Dataset_Gaia_Phys_V5.dat'
df = pd.read_csv(path)

# Rewrite every '/home/' occurrence in the light-curve paths so that they
# point at the local disk location.
df['Path'] = df['Path'].str.replace('/home/', '/home/iebecker/Desktop/iebecker_stuff/Disks/')
df.head()

Unnamed: 0,ID,Path,N,N_b,N_r,Class,T_eff,e_T_eff,E_T_eff,Lum,...,E_Rad,logg,e_logg,E_logg,Mass,e_Mass,E_Mass,rho,e_rho,E_rho
0,3985923473972534400,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,8,4,4,DSCT_SXPHE,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,3986570197263160320,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,10,5,5,RRAB,6481.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,3986754189367115264,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,12,6,6,RRAB,7381.3335,7173.793,7558.6665,2.83599,...,-1.0,4.5185,-1.0,-1.0,1.55,-1.0,-1.0,1.0603,-1.0,-1.0
3,3987237630885709312,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,12,7,5,RRAB,7011.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,3987697089307190016,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,8,4,4,RRAB,7191.5,6808.0,8884.0,2.30248,...,-1.0,4.6352,-1.0,-1.0,1.58,-1.0,-1.0,1.5719,-1.0,-1.0


Filter the light curves by minimum length: keep only the objects that have more than `min_N` observations in every band.

In [14]:
# Thresholds used when selecting light curves and classes.
min_N = 10      # per-band observation count must be strictly greater than this
max_N = 1000    # not used in the cells shown here

max_L = 40000   # cap on the number of objects kept per class
min_L = 1000    # classes with fewer objects than this are dropped

# Per-band observation-count columns (every column whose name contains 'N_').
bands = [col for col in df.columns if 'N_' in col]

# A row is kept only when it exceeds min_N in all of those columns.
b = (df[bands] > min_N).all(axis=1)

# Apply the mask and renumber the rows from zero.
df = df[b].copy().reset_index(drop=True)

Sample at most 40k elements per class; classes with fewer than 1,000 elements are discarded.

- 20% -> for testing
- approximately 70% -> for training
- approximately 10% -> for validation


In [15]:

def sample_classes(data, min_count=None, max_count=None, random_state=None):
    """Draw a per-class subsample of ``data``.

    Classes with fewer than ``min_count`` rows are dropped entirely; every
    remaining class contributes a random sample (without replacement) of at
    most ``max_count`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``Class`` column holding the label of each row.
    min_count : int, optional
        Minimum number of rows a class needs in order to be kept. Defaults to
        the notebook-level ``min_L``.
    max_count : int, optional
        Maximum number of rows kept per class. Defaults to the notebook-level
        ``max_L``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for the row sampling. ``None`` (the default) keeps
        the previous behaviour of drawing a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in the order in which the classes
        first appear in ``data``; original index labels are preserved. An
        empty frame with the same columns is returned when no class
        qualifies (including when ``data`` itself is empty).
    """
    # Fall back to the notebook-level thresholds only when not given, so the
    # function can also be used (and tested) without those globals.
    if min_count is None:
        min_count = min_L
    if max_count is None:
        max_count = max_L

    # Turn an integer seed into a single generator shared by all classes;
    # passing the bare integer to every .sample() call would restart the same
    # random stream for each class.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    samples = []
    for label in data['Class'].unique():
        members = data[data['Class'] == label]

        # Skip classes that are too small to be useful.
        if len(members) < min_count:
            continue

        # Keep everything when the class is smaller than the cap.
        n_keep = min(max_count, len(members))
        samples.append(
            members.sample(n=n_keep, replace=False, axis=0, random_state=random_state)
        )

    # pd.concat raises on an empty list; return an empty frame instead.
    if not samples:
        return data.iloc[0:0].copy()
    return pd.concat(samples)

In [16]:
# Replace the filtered catalogue with its per-class subsample.
df = sample_classes(data=df)

In [17]:
# Number of objects kept for each class after sampling.
Counter(df['Class'])

Counter({'RRAB': 40000,
         'MIRA_SR': 40000,
         'RRC': 17254,
         'CEP': 5831,
         'DSCT_SXPHE': 3399,
         'T2CEP': 1181})

Create the folds. First, a stratified 20% of the data is held out as the test set, so the remaining train split contains 80% of the total. This train split is then divided with a stratified K-fold using **7** folds.

In each fold, 1/7 (≈14.3%) of this train split is used to validate and the remaining 6/7 (≈85.7%) is used to train.

Relative to the total, the test set contains 20%, the training set contains ≈68.6% and the validation set ≈11.4%.

In [18]:
# Stratified K-fold splitter used for the train/validation folds. The fixed
# seed makes the shuffled fold assignment reproducible across kernel restarts.
kfolds = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

In [19]:
path_folds = './Folds'
# exist_ok avoids the separate existence check and makes re-runs harmless.
os.makedirs(path_folds, exist_ok=True)

# First split test: hold out a stratified 20% of the objects. The fixed seed
# keeps the test set identical across kernel restarts.
df_temp, df_test = train_test_split(df, stratify=df.Class, train_size=0.8, random_state=42)
# Renumber the rows from zero; the previous labels are kept as an 'index'
# column and are written to the CSV files.
df_temp = df_temp.reset_index()
df_test = df_test.reset_index()

path_test = os.path.join(path_folds, 'test.csv')
df_test.to_csv(path_test, index=False, index_label=False)

for n, (train_index, val_index) in enumerate(kfolds.split(df_temp.index.values, df_temp.Class.values)):
    # split() yields positional indices, so select the rows by position
    # rather than by label.
    df_train = df_temp.iloc[train_index]
    df_val = df_temp.iloc[val_index]

    # One directory per fold, numbered from 1.
    path_folds_ = os.path.join(path_folds, 'Fold_' + str(n + 1))
    os.makedirs(path_folds_, exist_ok=True)

    path_train = os.path.join(path_folds_, 'train.csv')
    path_val = os.path.join(path_folds_, 'val.csv')

    df_train.to_csv(path_train, index=False, index_label=False)
    df_val.to_csv(path_val, index=False, index_label=False)

In [20]:
# path_train / path_val still hold the files of the last fold written above.
data_train, data_val, data_test = (
    pd.read_csv(csv_path) for csv_path in (path_train, path_val, path_test)
)
data_train.head()

Unnamed: 0,index,ID,Path,N,N_b,N_r,Class,T_eff,e_T_eff,E_T_eff,...,E_Rad,logg,e_logg,E_logg,Mass,e_Mass,E_Mass,rho,e_rho,E_rho
0,62599,6863003013366410752,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,27,14,13,RRAB,6336.75,5950.0,6911.0,...,-1.0,3.6941,-1.0,-1.0,1.39,-1.0,-1.0,0.0649,-1.0,-1.0
1,143685,6815011353741855104,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,24,12,12,RRAB,6627.67,6302.0,7013.0,...,-1.0,4.1192,-1.0,-1.0,1.34,-1.0,-1.0,0.2872,-1.0,-1.0
2,19263,4667949299731626496,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,26,14,12,RRAB,6146.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,147988,1218995148346916480,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,53,26,27,MIRA_SR,-1.0,-1.0,-1.0,...,110.59825,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,2303,4294499904783165696,/home/iebecker/Desktop/iebecker_stuff/Disks/Da...,36,18,18,MIRA_SR,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [21]:
def class_proportion(df):
    """Return the fraction of rows belonging to each class.

    The result maps every label found in ``df.Class`` to its share of the
    rows of ``df``, rounded to three decimals. Labels appear in the order in
    which they are first encountered.
    """
    n_rows = df.shape[0]
    fractions = {}
    for label, count in Counter(df.Class).items():
        fractions[label] = np.round(count / n_rows, 3)
    return fractions

In [22]:
# Class fractions in the training split read back above.
class_proportion(df=data_train)

{'RRAB': 0.372,
 'MIRA_SR': 0.372,
 'RRC': 0.16,
 'CEP': 0.054,
 'DSCT_SXPHE': 0.032,
 'T2CEP': 0.011}

In [23]:
# Class fractions in the held-out test split.
class_proportion(df=data_test)

{'RRC': 0.16,
 'MIRA_SR': 0.372,
 'RRAB': 0.372,
 'DSCT_SXPHE': 0.032,
 'T2CEP': 0.011,
 'CEP': 0.054}

In [24]:
# Class fractions in the validation split read back above.
class_proportion(df=data_val)

{'MIRA_SR': 0.372,
 'CEP': 0.054,
 'RRAB': 0.372,
 'RRC': 0.16,
 'DSCT_SXPHE': 0.032,
 'T2CEP': 0.011}