In [8]:
import numpy as np
import pandas as pd
import os
from lap import lapjv
from tqdm import tqdm

In [None]:
class Train_validation_test_split:
    """Split the segments of one recording night into train/validation/test lists.

    The CSV at ``path`` must provide the columns ``Segment``, ``ID``, ``Call``
    and ``Night``.  Segments of the selected night are grouped per ID, then per
    call within each ID, and every call is split 60% / 20% / 20%.

    Parameters
    ----------
    path : str
        Location of the CSV file describing the segments.
    night : optional
        Value of the ``Night`` column to keep.  Defaults to 2, which is the
        night the original code selected.
    seed : int, optional
        When given, the per-call shuffles use a dedicated
        ``np.random.RandomState(seed)`` so the split is reproducible.  When
        omitted, the global ``np.random`` state is used (original behaviour).
    """

    def __init__(self, path, night=2, seed=None):
        self.path = path
        self.night = night
        self.seed = seed

    def Csv_data(self):
        """Read the CSV file as a data frame."""
        return pd.read_csv(self.path)

    def Image_2_ID(self, df):
        """Map each value of the ``Segment`` column to its ``ID``."""
        return dict(zip(df["Segment"], df["ID"]))

    def ID_2_elements(self, elt_2_ID_dict):
        """Invert an element -> group mapping into group -> list of elements."""
        ID_2_elts = {}
        for elt, ID in elt_2_ID_dict.items():
            ID_2_elts.setdefault(ID, []).append(elt)
        return ID_2_elts

    def Image_2_call(self, df):
        """Map each value of the ``Segment`` column to its ``Call``."""
        return dict(zip(df["Segment"], df["Call"]))

    def Execution(self):
        """Build the train, validation and test lists for ``self.night``.

        Segments are arranged per ID and then per call.  Within each call the
        segments are sorted, shuffled, and cut so that the first 60% go to the
        training list, the next 20% to validation and the remaining 20% to test.

        Returns
        -------
        tuple of (list, list, list)
            ``(trn, val, tst)`` lists of segment names.
        """
        trn, val, tst = [], [], []

        df = self.Csv_data()
        # Keep only the segments recorded on the requested night.
        df = df[df.Night == self.night]

        Img_2_cal = self.Image_2_call(df)
        ID2Ims = self.ID_2_elements(self.Image_2_ID(df))

        # Use an isolated generator only when a seed was requested; otherwise
        # keep drawing from the global numpy state as before.
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)

        for lstIms in ID2Ims.values():
            # Regroup the segments of this ID per call.
            call2Ims = self.ID_2_elements({img: Img_2_cal[img] for img in lstIms})

            for lst in call2Ims.values():
                # Sort first so the outcome depends only on the random state,
                # not on the row order of the CSV.
                lst = sorted(lst)
                idx = np.arange(len(lst))
                rng.shuffle(idx)
                lst = list(np.array(lst)[idx])

                # Integer cut points at 60% and 80% of the call.
                p_60 = (len(lst) * 6) // 10
                p_80 = (len(lst) * 8) // 10
                trn += lst[:p_60]
                val += lst[p_60:p_80]
                tst += lst[p_80:]

        return trn, val, tst


In [None]:
# Build the splitter for the segment CSV.
# NOTE(review): `path` is only assigned in later cells of this notebook as
# shown here; on a fresh kernel this cell would raise NameError unless it is
# defined elsewhere -- confirm the intended file.
tvt = Train_validation_test_split(path)

In [None]:
# Execution() returns the training, validation and test segment lists.
Ts, Vs, Tes = tvt.Execution()

In [None]:
# Load the full segment table (every row, before any selection).
df = pd.read_csv("segment_2_characts.cvs")

In [None]:
#df

In [None]:
# Keep only the rows whose Segment is in the training list.
Ts_df = df[df.Segment.isin(Ts)]

In [None]:
Ts_df

In [None]:
# Persist the training rows; Vs and Tes are not written in the cells shown here.
Ts_df.to_csv("all_tr_val_tst/n2_train.csv", index = False)

In [None]:
# Rebinds the notebook-level `path` to the full segment table.
path = "segment_2_characts.cvs"

In [None]:
n = pd.read_csv(path)

In [None]:
# Select every row recorded on night 3.
dn = n[n.Night==3]

In [None]:
# All night-3 rows are written as one test file (no train/validation split here).
dn.to_csv("all_tr_val_tst/n3_test.csv", index = False)

In [None]:
# List the split files written so far.
ls all_tr_val_tst/

In [None]:
# Validation files of night 1 and night 2.
pk = "all_tr_val_tst/n1_validation.csv"
pe = "all_tr_val_tst/n2_validation.csv"

In [None]:
dk = pd.read_csv(pk)
de = pd.read_csv(pe)

In [None]:
# Stack both validation tables; the original row indices are kept as-is.
ek = pd.concat([dk,de])

In [None]:
# Save step is disabled; the combined file is read again further down.
#ek.to_csv("all_tr_val_tst/n1_n2_validation.csv", index = False)

In [None]:
ls all_tr_val_tst/

In [None]:
# Rebinds `path` again, now to the combined validation file.
path = "all_tr_val_tst/n1_n2_validation.csv"

In [None]:
import random

In [None]:
class Nonmatches:
    """Pair every segment of a CSV file with a segment that has a different ID.

    The pairing is obtained by solving a linear assignment problem on a random
    cost matrix in which every same-ID cell is made prohibitively expensive.

    Parameters
    ----------
    path : str
        CSV file providing the columns ``Segment`` and ``ID``.
    """

    # Cost written into every forbidden cell (same ID, including the diagonal).
    # Random costs lie in [0, 1), so this dominates any allowed assignment.
    _FORBIDDEN_COST = 1e5

    def __init__(self, path):
        self.path = path

    def read_csv_data(self):
        """Return the ``Segment`` and ``ID`` columns of the CSV as arrays."""
        df = pd.read_csv(self.path)
        return df.Segment.values, df.ID.values

    def sampling(self):
        """Draw one random mismatch partner for every segment.

        Returns
        -------
        tuple
            ``(left_segments, right_segments, left_IDs, right_IDs)``; position
            ``j`` pairs ``right_segments[j]`` (file order) with the segment the
            solver assigned to it.

        Notes
        -----
        The number of pairs whose IDs are equal is printed.  It is non-zero
        only when the solver had to use a forbidden cell.
        """
        samples, IDs = self.read_csv_data()
        # Plain ndarrays so that 2-D broadcasting and fancy indexing below work
        # whatever array type pandas hands back.
        samples, IDs = np.asarray(samples), np.asarray(IDs)
        n = IDs.shape[0]

        score = np.random.random_sample(size=(n, n))
        # Forbid every same-ID pairing.  The diagonal is included on purpose:
        # without it a segment could be matched with itself.
        same_id = IDs[:, None] == IDs[None, :]
        score[same_id] = self._FORBIDDEN_COST

        # The last element returned by lapjv is used as the assignment vector.
        x = lapjv(score)[-1]
        y = np.arange(x.shape[0])

        LY, RY = IDs[x], IDs[y]
        print("Number of matches = ", int(np.count_nonzero(LY == RY)))
        return samples[x], samples[y], LY, RY

    
    
    

In [None]:
# `path` is whatever the most recently executed cell bound it to; in the order
# shown here that is the combined n1+n2 validation file.
n = Nonmatches(path)

In [None]:
# Paired segments and their IDs, as returned by Nonmatches.sampling().
s1,s2,ID1,ID2 = n.sampling()

In [None]:
dic = {"sample_1": s1, "sample_2": s2, "ID_sample_1":ID1, "ID_sample_2":ID2}

In [None]:
# Note: this rebinds `df`, which earlier cells used for the full segment table.
df = pd.DataFrame(dic)

In [None]:
df

In [None]:
# Output directory for the pair files.
p1 = "trn_val_test_pairs/"

In [None]:
df.to_csv(p1+"n1_n2_validation_mismatches.csv", index = False)

In [None]:
# Read the file back to check what was written.
pd.read_csv(p1+"n1_n2_validation_mismatches.csv")

In [166]:
# Combined night-1 + night-2 validation file, used below to build match pairs.
pa = "all_tr_val_tst/n1_n2_validation.csv"

In [167]:
import random

In [168]:
class matches:
    """Create same-ID ("match") pairs of segments.

    For every ID the list of its segments is shuffled and paired with a copy
    of itself rotated by one position, so that no segment is paired with
    itself.  IDs that own a single segment cannot form such a pair and are
    left out.

    Parameters
    ----------
    path : str
        CSV file providing the columns ``Segment`` and ``ID``.
    """

    def __init__(self, path):
        self.path = path
        # segment -> ID lookup, read once at construction time
        self.seg2id = self.read_csv_data()

    def read_csv_data(self):
        """Read the CSV and return a dict mapping each segment to its ID."""
        df = pd.read_csv(self.path)
        return dict(zip(df["Segment"], df["ID"]))

    def ID_to_samples(self):
        """Group the segments per ID: returns ``{ID: [segment, ...]}``."""
        ID2segs = {}
        # Reuse the mapping cached by __init__ instead of reading the file again.
        for seg, ID in self.seg2id.items():
            ID2segs.setdefault(ID, []).append(seg)
        return ID2segs

    def matching(self):
        """Return two aligned arrays ``(L, R)`` of segments sharing an ID.

        ``L`` is the shuffled segment list of each ID; ``R`` is the same list
        rotated by one, e.g. ``[1 2 3 4 5]`` against ``[5 1 2 3 4]``.
        """
        ml, mr = [], []

        for lst in self.ID_to_samples().values():
            if len(lst) < 2:
                # A lone segment could only be paired with itself, which is
                # exactly what the check below forbids; skip such IDs.
                continue
            np.random.shuffle(lst)
            ml += lst
            mr += [lst[-1]] + lst[:-1]

        L, R = np.array(ml), np.array(mr)
        # Invariant: no position holds the same segment on both sides.
        assert not np.any(L == R)

        return L, R

    def samples2IDs(self):
        """Return ``(seg_L, seg_R, ID_L, ID_R, labels)`` for the match pairs.

        The first four items are tuples aligned position by position;
        ``labels`` is an array of ones with one entry per pair.
        """
        L, R = self.matching()
        # Build the tuples directly so that an empty result is handled too.
        seg_L, seg_R = tuple(L), tuple(R)
        ID_L = tuple(self.seg2id[samp] for samp in L)
        ID_R = tuple(self.seg2id[samp] for samp in R)
        return seg_L, seg_R, ID_L, ID_R, np.ones(len(L),)
                    
            
        
        

In [169]:
m = matches(pa)

In [170]:
# a, b: paired segments; c, d: their IDs; e: the array of labels (all ones).
a,b,c,d,e = m.samples2IDs()

In [171]:
D = {"sample_1": a, "sample_2": b, "ID_sample_1":c, "ID_sample_2":d, "label":e}

In [172]:
# E holds the match pairs as a DataFrame (capital E; lowercase e is the label array).
E = pd.DataFrame(D)

In [173]:
E

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,LAB23_132,LAB23_50,LAB,LAB,1.0
1,LAB23_135,LAB23_132,LAB,LAB,1.0
2,LAB21_135,LAB23_135,LAB,LAB,1.0
3,LAB21_89,LAB21_135,LAB,LAB,1.0
4,LAB23_164,LAB21_89,LAB,LAB,1.0
...,...,...,...,...,...
8763,BLL11_366,BLL21_85,BLL,BLL,1.0
8764,BLL21_88,BLL11_366,BLL,BLL,1.0
8765,BLL21_170,BLL21_88,BLL,BLL,1.0
8766,BLL11_229,BLL21_170,BLL,BLL,1.0


In [174]:
# Previously saved validation pair file, reused below for its label-0 rows.
p = "within_across_nights_pairs/n1_n2_validation_pairs.csv"

In [175]:
ls within_across_nights_pairs/

n1_n2_test_pairs.csv        n1_test_pairs.csv        n2_test_pairs.csv
n1_n2_train_pairs.csv       n1_train_pairs.csv       n3_test_pairs.csv
n1_n2_validation_pairs.csv  n1_validation_pairs.csv
n1_n3_test_pairs.csv        n2_n3_test_pairs.csv


In [176]:
t = pd.read_csv(p)

In [177]:
# Keep only the rows labelled 0 (mismatch pairs) of the saved file.
K = t[t.label == 0]

In [178]:
K

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,PKS21_119,LPL21_41,PKS,LPL,0
1,LAB11_329,WGB11_160,LAB,WGB,0
2,SPK21_258,PSP11_131,SPK,PSP,0
4,LLP11_251,BAS21_185,LLP,BAS,0
6,LPS11_378,SKL21_185,LPS,SKL,0
...,...,...,...,...,...
17530,LLB21_438,PGP11_147,LLB,PGP,0
17531,LKL11_105,SPK12_130,LKL,SPK,0
17532,KAK11_193,KBK11_222,KAK,KBK,0
17533,SAB12_72,BLS21_214,SAB,BLS,0


In [179]:
# Stack the fresh match pairs (E) on top of the saved mismatch pairs (K).
# Both keep their own row indices, so index values repeat until the reset below.
df = pd.concat([E,K])

In [180]:
df

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,LAB23_132,LAB23_50,LAB,LAB,1.0
1,LAB23_135,LAB23_132,LAB,LAB,1.0
2,LAB21_135,LAB23_135,LAB,LAB,1.0
3,LAB21_89,LAB21_135,LAB,LAB,1.0
4,LAB23_164,LAB21_89,LAB,LAB,1.0
...,...,...,...,...,...
17530,LLB21_438,PGP11_147,LLB,PGP,0.0
17531,LKL11_105,SPK12_130,LKL,SPK,0.0
17532,KAK11_193,KBK11_222,KAK,KBK,0.0
17533,SAB12_72,BLS21_214,SAB,BLS,0.0


In [181]:
# Shuffle all rows (frac=1 samples every row) and renumber the index from 0.
# No random_state is passed, so the order differs from run to run.
F = df.sample(frac=1).reset_index(drop=True)

In [182]:
F

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,BKB11_145,PYP21_420,BKB,PYP,0.0
1,BLB21_147,BLB11_382,BLB,BLB,1.0
2,LLK21_180,BLS11_95,LLK,BLS,0.0
3,APP21_175,KKA21_218,APP,KKA,0.0
4,PSP21_333,PSP11_81,PSP,PSP,1.0
...,...,...,...,...,...
17531,LAL21_440,LAL21_270,LAL,LAL,1.0
17532,PKS11_331,PKS12_148,PKS,PKS,1.0
17533,WLL11_415,WLL11_83,WLL,WLL,1.0
17534,SKL21_48,WLL11_405,SKL,WLL,0.0


In [183]:
# NOTE(review): this writes under "pairs/", while the mismatch rows above were
# read from "within_across_nights_pairs/" -- confirm the intended directory.
F.to_csv("pairs/n1_n2_validation_pairs.csv", index = False)

In [184]:
# Displays the *train* pair file, not the validation file written just above.
pd.read_csv("pairs/n1_n2_train_pairs.csv")

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,PKS21_315,PKS11_92,PKS,PKS,1.0
1,WLL11_406,KKP13_298,WLL,KKP,0.0
2,BAS23_197,SAB11_261,BAS,SAB,0.0
3,PSS21_94,RBR13_163,PSS,RBR,0.0
4,SBA11_184,SBA11_70,SBA,SBA,1.0
...,...,...,...,...,...
52537,ASS21_9,ASS21_153,ASS,ASS,1.0
52538,SPK12_158,SPK21_111,SPK,SPK,1.0
52539,BAS11_64,BAS22_300,BAS,BAS,1.0
52540,KAA11_238,KAA11_200,KAA,KAA,1.0


In [None]:
# Mismatch pairs written earlier by the Nonmatches cells.
k = pd.read_csv("trn_val_test_pairs/n1_n2_validation_mismatches.csv")

In [None]:
# NOTE(review): in the numbered cells above, lowercase `e` is the label array
# returned by samples2IDs() and the match DataFrame is `E`; this cell presumably
# expects `e` to be a DataFrame of match pairs -- confirm before re-running.
dek = pd.concat([e,k])

In [None]:
dek

In [None]:
# Shuffle every row and renumber the index.
f = dek.sample(frac=1).reset_index(drop=True)

In [None]:
f

In [None]:
# Recompute the label from the IDs: 1 when both sides share an ID, else 0.
ID1 = f.ID_sample_1.values
ID2 = f.ID_sample_2.values
Y = np.where(ID1==ID2,1,0)

In [None]:
f["label"] = Y

In [None]:
f

In [None]:
ls within_across_nights_pairs/

In [None]:
# Save step is disabled; the next cell reloads the file from disk instead.
#f.to_csv("within_across_nights_pairs/n1_n2_validation_pairs.csv", index = False)

In [None]:
# Rebinds `f` to the stored file, discarding the frame built above.
f = pd.read_csv("within_across_nights_pairs/n1_n2_validation_pairs.csv")

In [None]:
f

In [None]:
class Across_nights_pairs:
    """Create a balanced set of pairs, half matches and half mismatches.

    The left component of each pair is drawn from ``samples_1`` and the right
    component from ``samples_2`` (e.g. segments from two different nights).

    Parameters
    ----------
    samples_1, samples_2 : array-like
        Sample names of the two sets to pair.
    IDs_1, IDs_2 : array-like
        IDs aligned with ``samples_1`` and ``samples_2`` respectively.
    number_pairs : int
        Maximum number of matches and of mismatches to draw.
    """

    def __init__(self, samples_1, samples_2, IDs_1, IDs_2,number_pairs):
        self.samples_1 = samples_1
        self.samples_2 = samples_2
        self.IDs_1 = IDs_1
        self.IDs_2 = IDs_2
        self.number_pairs = number_pairs

    def pairings(self, lst1, lst2):
        """Generate all possible pairings from two lists of samples.

        Returns two aligned arrays ordered like ``for i in lst1 for j in lst2``.
        Empty inputs yield two empty arrays.
        """
        left, right = np.asarray(lst1), np.asarray(lst2)
        # repeat/tile build the Cartesian product without a Python-level list.
        return np.repeat(left, len(right)), np.tile(right, len(left))

    def sample2ID(self, samples, IDs):
        """Map each sample to its ID."""
        return {samples[i]: IDs[i] for i in range(len(IDs))}

    def assignID(self, sam2ID, lst):
        """Assign IDs to a list of samples given a dict mapping each sample to its ID."""
        return np.array([sam2ID[sample] for sample in lst])

    def pairs(self, sampL, sampR, L, R, n):
        """Sample up to ``n`` matches and up to ``n`` mismatches.

        Parameters
        ----------
        sampL, sampR : ndarray
            Aligned sample arrays (one candidate pair per position).
        L, R : ndarray
            IDs aligned with ``sampL`` and ``sampR``.
        n : int
            Number of pairs to draw from each class; fewer are returned when
            a class holds fewer than ``n`` candidates.

        Returns
        -------
        tuple
            ``(samples_left, samples_right, IDs_left, IDs_right, labels)`` in
            shuffled order, with label 1.0 for a match and 0.0 for a mismatch.
        """
        same = L == R
        pos = np.flatnonzero(same)
        neg = np.flatnonzero(~same)

        # Random subset of each class.
        po = np.random.permutation(pos)[:n]
        ne = np.random.permutation(neg)[:n]

        # Stack the *sampled* negatives (not every negative) so that the
        # indices stay aligned with the labels and the draw is really random.
        csegs = np.hstack([po, ne])
        cys = np.hstack([np.ones(po.shape[0],), np.zeros(ne.shape[0],)])

        # Interleave matches and mismatches.
        order = np.random.permutation(cys.shape[0])
        new_idx = csegs[order]

        return sampL[new_idx], sampR[new_idx], L[new_idx], R[new_idx], cys[order]

    def Execution(self):
        """Build every cross pairing, attach the IDs and sample the balanced set."""
        l1, l2 = self.pairings(self.samples_1, self.samples_2)
        sam1_2_IDs = self.sample2ID(self.samples_1, self.IDs_1)
        sam2_2_IDs = self.sample2ID(self.samples_2, self.IDs_2)
        l1_IDs = self.assignID(sam1_2_IDs, l1)
        l2_IDs = self.assignID(sam2_2_IDs, l2)
        L, R, IDL, IDR, Y = self.pairs(l1, l2, l1_IDs, l2_IDs, self.number_pairs)
        return L, R, IDL, IDR, Y
        

In [None]:
# Test files of night 2 and night 3; `p1` was previously an output directory.
p1 = "all_tr_val_tst/n2_test.csv"
p2 = "all_tr_val_tst/n3_test.csv"

In [None]:
df1 = pd.read_csv(p1)
df2 = pd.read_csv(p2)

In [None]:
# Segment names and IDs of each night, plus the number of pairs per class.
# Note: `n` was earlier bound to a Nonmatches instance / a DataFrame.
sam1 = df1.Segment.values
sam2 = df2.Segment.values
ID1 = df1.ID.values
ID2 = df2.ID.values
n = 5000 

In [None]:
pairs = Across_nights_pairs(sam1,sam2,ID1,ID2,n)

In [None]:
# Execution() enumerates every night-2 x night-3 pairing before sampling.
samL, samR, IDs_L, IDs_R, Y = pairs.Execution()

In [None]:
D = {"sample_1": samL, "sample_2": samR, "ID_sample_1":IDs_L, "ID_sample_2":IDs_R, "label":Y}

In [None]:
s = pd.DataFrame(D)

In [None]:
s

In [None]:
# Save step is disabled.
# NOTE(review): the target directory is "within_night/" although these pairs
# combine two different nights -- confirm the intended location.
#s.to_csv("within_night/n2_n3_test_pairs.csv", index = False)

In [None]:
ls within_across_nights_pairs/

In [None]:
# Reload a saved pair file for a sanity check.
df = pd.read_csv("within_across_nights_pairs/n1_test_pairs.csv")

In [None]:
df

In [None]:
sample_1 = df.sample_1.values
sample_2 = df.sample_2.values

In [None]:
# Positions where both sides of a pair are the very same segment.
idx = np.where(sample_1==sample_2)[0]

In [None]:
# The offending segments, if any (empty arrays when idx is empty).
a = sample_1[idx]
b = sample_2[idx]

In [None]:
# Scratch check of np.random.shuffle on a small array (shuffles in place).
lst = np.arange(10)

In [None]:
np.random.shuffle(lst)

In [None]:
lst

In [2]:
# Rebinds `path` to the pair directory (note the trailing slash used for concatenation).
path = "within_across_nights_pairs/"

In [4]:
df = pd.read_csv(path+"n1_n2_test_pairs.csv")