In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
class Train_validation_test_split:
    """Split the segments of one recording night into train/validation/test sets.

    The segment CSV must provide the columns ``Segment``, ``ID``, ``Call`` and
    ``Night``. Segments of the selected night are grouped per ID and then per
    call; each call's segments are shuffled and divided roughly 60/20/20.

    Parameters
    ----------
    path_seg_data : str
        Path to the original segment CSV file.
    path_trn_val_tst_data : str
        Prefix the output file names are appended to (normally a directory
        path ending with a path separator).
    seed : int or None, optional
        Seed for the per-call shuffling. With ``None`` (default) the global
        ``np.random`` state is used, exactly as before; with an integer every
        call to ``trn_val_test_split`` yields the same split.
    """

    def __init__(self, path_seg_data, path_trn_val_tst_data, seed=None):
        self.path_seg_data = path_seg_data  # path to the original segment data
        self.path_trn_val_tst_data = path_trn_val_tst_data  # where the trn/val/tst csv files are saved
        self.seed = seed  # None -> global numpy RNG, int -> reproducible split
        self.df = self.Csv_data()

    def Csv_data(self):
        """Read the segment csv file into a data frame."""
        return pd.read_csv(self.path_seg_data)

    def Image_2_ID(self, df):
        """Map every segment name in ``df`` to its ID (last row wins on duplicates)."""
        return dict(zip(df["Segment"], df["ID"]))

    def ID_2_elements(self, elt_2_ID_dict):
        """Invert an element -> key mapping into key -> list of elements.

        Keys and elements keep the order in which they first appear.
        """
        ID_2_elts = {}
        for elt, ID in elt_2_ID_dict.items():
            ID_2_elts.setdefault(ID, []).append(elt)
        return ID_2_elts

    def Image_2_call(self, df):
        """Map every segment name in ``df`` to its call (last row wins on duplicates)."""
        return dict(zip(df["Segment"], df["Call"]))

    def trn_val_test_split(self, night=1):
        """Split the segments of ``night`` into train, validation and test lists.

        Segments are grouped per ID and, within each ID, per call. The segments
        of each call are sorted, shuffled, and cut at ``int(n * 6 / 10)`` and
        ``int(n * 8 / 10)``: the first part goes to training, the second to
        validation, and the rest to test. Because of the truncation, a call
        with a single segment contributes only to the test set.

        Parameters
        ----------
        night : int, optional
            Value of the ``Night`` column to split (default 1).

        Returns
        -------
        tuple of (list, list, list)
            Segment names for training, validation and test.
        """
        trn, val, tst = [], [], []

        # Seeded generator when a seed was given, the global numpy RNG otherwise.
        seed = getattr(self, "seed", None)
        rng = np.random if seed is None else np.random.RandomState(seed)

        # select the segments collected on the requested night
        df = self.df[self.df.Night == night]
        img_to_call = self.Image_2_call(df)

        # arrange segments per ID
        id_to_imgs = self.ID_2_elements(self.Image_2_ID(df))

        for imgs in id_to_imgs.values():
            # arrange the segments of this ID per call
            call_to_imgs = self.ID_2_elements({img: img_to_call[img] for img in imgs})

            for call_imgs in call_to_imgs.values():
                # sort first so the shuffle result depends only on the RNG state
                call_imgs = sorted(call_imgs)
                idx = np.arange(len(call_imgs))
                rng.shuffle(idx)
                shuffled = [call_imgs[i] for i in idx]

                # put 60, 20, and 20% resp. in trn, val, and test set
                p_60 = int(len(shuffled) * 6 / 10)
                p_80 = int(len(shuffled) * 8 / 10)
                trn += shuffled[:p_60]
                val += shuffled[p_60:p_80]
                tst += shuffled[p_80:]

        return trn, val, tst

    def save_train_val_test_segment_data(self, night=1):
        """Compute the split for ``night`` and write it to three csv files.

        Writes ``n<night>_train.csv``, ``n<night>_validation.csv`` and
        ``n<night>_test.csv``, each name appended to ``path_trn_val_tst_data``.
        The output directory is created when it does not exist.

        Parameters
        ----------
        night : int, optional
            Value of the ``Night`` column to split and save (default 1).
        """
        train, val, test = self.trn_val_test_split(night)

        # Only rows of the selected night may end up in the output: a segment
        # name reused on another night must not leak into these files.
        df_night = self.df[self.df.Night == night]

        prefix = self.path_trn_val_tst_data + "n{}_".format(night)
        out_dir = os.path.dirname(prefix)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df_night[df_night.Segment.isin(train)].to_csv(prefix + "train.csv", index=False)
        df_night[df_night.Segment.isin(val)].to_csv(prefix + "validation.csv", index=False)
        df_night[df_night.Segment.isin(test)].to_csv(prefix + "test.csv", index=False)
        print()
        print("saving train, validation, and test segment data has been completed")


In [5]:
# Input: csv file with the segment data that Train_validation_test_split reads.
p_seg = "../segment_index_extraction/segment_data.csv"
# Output location for the split csv files. File names are appended to this
# string directly, so the trailing "/" is required.
p_t_v_t = "train_val_test_segment_data/"

In [6]:
# Instantiating the splitter reads the csv at p_seg into tvt.df.
tvt = Train_validation_test_split(p_seg, p_t_v_t)

In [7]:
# Split the night-1 segments and write n1_train.csv, n1_validation.csv and
# n1_test.csv under p_t_v_t.
tvt.save_train_val_test_segment_data()


saving train, validation, and test segment data has been completed


In [8]:
# Nights 2 and 3 are not split: all rows of each night are written to that
# night's test csv file.
df = tvt.df
d2 = df.loc[df["Night"] == 2]
d3 = df.loc[df["Night"] == 3]
d2.to_csv(p_t_v_t + "n2_test.csv", index=False)
d3.to_csv(p_t_v_t + "n3_test.csv", index=False)