In [1]:
import numpy as np
import pandas as pd
import os
from lap import lapjv
from tqdm import tqdm

In [2]:
import random

In [51]:
class Nonmatches:
    """Pair every sample of a segment CSV with a sample of a different ID.

    The CSV at ``path`` must provide a ``Segment`` column (sample names) and
    an ``ID`` column (identity label of each sample). Pairs are obtained by
    solving a linear assignment problem over a random cost matrix in which
    same-ID pairings (including a sample with itself) carry a very high cost.
    """

    # Cost given to forbidden pairings; far above the random costs in [0, 1).
    SAME_ID_COST = 1e5

    def __init__(self, path):
        self.path = path

    def read_csv_data(self):
        """Read the CSV and return ``(segments, IDs)`` as two aligned arrays."""
        df = pd.read_csv(self.path)
        return df.Segment.values, df.ID.values

    @staticmethod
    def _cost_matrix(IDs):
        """Return an (n, n) random cost matrix with same-ID cells penalised.

        Entry ``[i, j]`` is a uniform random number in [0, 1) when
        ``IDs[i] != IDs[j]`` and ``SAME_ID_COST`` otherwise. The diagonal is
        always penalised so that a sample is never cheap to pair with itself.
        """
        IDs = np.asarray(IDs)
        n = IDs.shape[0]
        score = np.random.random_sample(size=(n, n))
        # Broadcasted comparison marks every same-ID cell, in both triangles.
        score[IDs[:, None] == IDs[None, :]] = Nonmatches.SAME_ID_COST
        # Explicit diagonal fill covers IDs that do not compare equal to
        # themselves (e.g. NaN), which the mask above would miss.
        np.fill_diagonal(score, Nonmatches.SAME_ID_COST)
        return score

    def sampling(self):
        """Sample one partner per sample, avoiding same-ID partners if possible.

        Returns
        -------
        tuple
            ``(left_samples, right_samples, left_IDs, right_IDs)``; position
            ``k`` of each array describes the k-th pair.

        Prints the number of returned pairs that still share an ID. This is
        non-zero when the data leaves no alternative (for instance when the
        file contains a single ID), so callers should check it before
        labelling the pairs as mismatches.
        """
        samples, IDs = self.read_csv_data()

        score = self._cost_matrix(IDs)
        # The last element of lapjv's result is used as the assignment
        # vector pairing position k with sample x[k].
        x = lapjv(score)[-1]
        y = np.arange(x.shape[0])
        LY, RY = IDs[x], IDs[y]
        d = np.where(LY == RY)[0]

        print("Number of matches = ", len(d))
        return samples[x], samples[y], LY, RY

    
    
    

In [52]:
# Segment CSV used as input by the Nonmatches and matches objects built in later cells.
path = "../githubdata/train_val_test_segment_data/n3_test.csv"

In [53]:
# Sampler of non-matching pairs over `path`.
# NOTE(review): `n` is reassigned to the integer 5000 in a later cell, so
# running cells out of order can leave this name pointing at the wrong object.
n = Nonmatches(path)

In [54]:
# sampling() returns (left samples, right samples, left IDs, right IDs).
# NOTE(review): ID1 and ID2 are overwritten in a later cell with
# df1.ID.values / df2.ID.values.
s1,s2,ID1,ID2 = n.sampling()

Number of matches =  420


In [55]:
# Shape of the left-sample array, i.e. how many pairs sampling() produced.
s1.shape

(420,)

In [32]:
# Column layout of the non-match pair table; every pair gets the label 0.0.
dic = {"sample_1": s1, "sample_2": s2, "ID_sample_1":ID1, "ID_sample_2":ID2, "label":[0.0]*len(s1)}

In [33]:
# Non-match pairs as a DataFrame (concatenated with the match table E further down).
K = pd.DataFrame(dic)

In [34]:
# Display the non-match pair table.
K

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,LPL21_409,BKB21_0,LPL,BKB,0.0
1,LPL21_399,BKB21_1,LPL,BKB,0.0
2,LPL21_99,BKB21_2,LPL,BKB,0.0
3,LPL21_151,BKB21_3,LPL,BKB,0.0
4,LPL21_216,BKB21_4,LPL,BKB,0.0
...,...,...,...,...,...
771,BKB21_275,LPL21_414,BKB,LPL,0.0
772,BKB21_241,LPL21_415,BKB,LPL,0.0
773,BKB21_361,LPL21_416,BKB,LPL,0.0
774,BKB21_219,LPL21_417,BKB,LPL,0.0


In [35]:
import random

In [56]:
class matches:
    """Create matching (same-ID) pairs from a segment CSV.

    The samples of each ID are shuffled and paired with a copy of the same
    list rotated by one position, so a sample is never paired with itself.
    The CSV at ``path`` must provide ``Segment`` and ``ID`` columns.
    """

    def __init__(self, path):
        self.path = path
        # Segment name -> ID, read once and reused by every method.
        self.seg2id = self.read_csv_data()

    def read_csv_data(self):
        """Read the CSV and return a dict mapping each segment to its ID.

        If a segment name occurs more than once, the last row wins.
        """
        df = pd.read_csv(self.path)
        return dict(zip(df["Segment"], df["ID"]))

    def ID_to_samples(self):
        """Return a dict mapping each ID to a fresh list of its segments.

        Built from ``self.seg2id`` rather than re-reading the file, so the
        grouping always agrees with the lookups done in ``samples2IDs``.
        """
        ID2segs = {}
        for seg, ID in self.seg2id.items():
            ID2segs.setdefault(ID, []).append(seg)
        return ID2segs

    def matching(self):
        """Return two aligned arrays ``(L, R)`` of same-ID sample pairs.

        IDs with fewer than two samples are skipped: the only possible
        pairing for them would be a sample with itself.
        """
        ml, mr = [], []

        for lst in self.ID_to_samples().values():
            if len(lst) < 2:
                continue
            np.random.shuffle(lst)
            # form two lists like these [1 2 3 4 5] and [5 1 2 3 4]
            # shift one position and put the last element of list 2 at first position
            ml += lst
            mr += lst[-1:] + lst[:-1]

        L, R = np.array(ml), np.array(mr)
        assert not np.any(L == R), "a sample was paired with itself"

        return L, R

    def samples2IDs(self):
        """Return ``(seg_L, seg_R, ID_L, ID_R, labels)`` for the matched pairs.

        The first four items are tuples aligned by position; ``labels`` is an
        array of ones with one entry per pair. All five are empty when no ID
        has at least two samples.
        """
        L, R = self.matching()
        ID_L = tuple(self.seg2id[samp] for samp in L)
        ID_R = tuple(self.seg2id[samp] for samp in R)
        return tuple(L), tuple(R), ID_L, ID_R, np.ones(len(L),)
                    
            
        
        

In [57]:
# Match-pair builder over the same CSV `path`.
m = matches(path)

In [58]:
# samples2IDs() returns (left samples, right samples, left IDs, right IDs, all-ones labels).
a,b,c,d,e = m.samples2IDs()

In [59]:
# Column layout of the match pair table; `e` holds the all-ones labels.
# NOTE(review): the name D is reused for the across-nights table in a later cell.
D = {"sample_1": a, "sample_2": b, "ID_sample_1":c, "ID_sample_2":d, "label":e}

In [60]:
# Match pairs as a DataFrame (concatenated with the non-match table K below).
E = pd.DataFrame(D)

In [61]:
# Display the match pair table.
E

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,LPL31_74,LPL31_395,LPL,LPL,1.0
1,LPL31_256,LPL31_74,LPL,LPL,1.0
2,LPL31_305,LPL31_256,LPL,LPL,1.0
3,LPL31_157,LPL31_305,LPL,LPL,1.0
4,LPL31_25,LPL31_157,LPL,LPL,1.0
...,...,...,...,...,...
415,LPL31_93,LPL31_163,LPL,LPL,1.0
416,LPL31_109,LPL31_93,LPL,LPL,1.0
417,LPL31_255,LPL31_109,LPL,LPL,1.0
418,LPL31_99,LPL31_255,LPL,LPL,1.0


In [42]:
# Stack match (E) and non-match (K) pairs. The original row indices are kept,
# so index values repeat until the index is reset in the shuffle cell below.
df = pd.concat([E,K])

In [43]:
# Display the combined pair table.
df

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,BKB21_65,BKB21_75,BKB,BKB,1.0
1,BKB21_357,BKB21_65,BKB,BKB,1.0
2,BKB21_266,BKB21_357,BKB,BKB,1.0
3,BKB21_293,BKB21_266,BKB,BKB,1.0
4,BKB21_334,BKB21_293,BKB,BKB,1.0
...,...,...,...,...,...
771,BKB21_275,LPL21_414,BKB,LPL,0.0
772,BKB21_241,LPL21_415,BKB,LPL,0.0
773,BKB21_361,LPL21_416,BKB,LPL,0.0
774,BKB21_219,LPL21_417,BKB,LPL,0.0


In [44]:
# Shuffle all rows (frac=1 draws every row) and renumber the index from 0.
# NOTE(review): no random_state is passed, so the row order differs on every run.
F = df.sample(frac=1).reset_index(drop=True)

In [45]:
# Display the shuffled pair table.
F

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,BKB21_28,LPL21_28,BKB,LPL,0.0
1,LPL21_290,LPL21_198,LPL,LPL,1.0
2,LPL21_232,BKB21_215,LPL,BKB,0.0
3,BKB21_29,BKB21_59,BKB,BKB,1.0
4,LPL21_130,LPL21_109,LPL,LPL,1.0
...,...,...,...,...,...
1578,BKB21_243,BKB21_132,BKB,BKB,1.0
1579,LPL21_72,BKB21_328,LPL,BKB,0.0
1580,BKB21_40,LPL21_333,BKB,LPL,0.0
1581,BKB21_38,LPL21_1,BKB,LPL,0.0


In [62]:
#F.to_csv("pairs/n3_test_pairs.csv", index = False)

In [49]:
#pd.read_csv("pairs/n1_train_pairs.csv")

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,BKB12_149,BKB12_64,BKB,BKB,1.0
1,BKB12_158,RBB13_80,BKB,RBB,0.0
2,LPL11_102,RBB11_161,LPL,RBB,0.0
3,LPL11_208,RBB11_1,LPL,RBB,0.0
4,RBB13_162,RBB13_350,RBB,RBB,1.0
...,...,...,...,...,...
1755,RBB13_81,BKB12_159,RBB,BKB,0.0
1756,BKB12_1,BKB12_341,BKB,BKB,1.0
1757,BKB12_121,RBB11_327,BKB,RBB,0.0
1758,BKB12_73,BKB12_48,BKB,BKB,1.0


In [96]:
class Across_nights_pairs:
    """Create a balanced set of across-night pairs: half matches, half mismatches.

    The two components of every pair come from the two different sample
    collections passed in (intended to be recordings of different nights).

    Parameters
    ----------
    samples_1, samples_2 : sequence
        Sample names of the first and second collection.
    IDs_1, IDs_2 : sequence
        ID of each sample, aligned with ``samples_1`` / ``samples_2``.
    number_pairs : int
        Maximum number of matches and of mismatches to draw.
    """

    def __init__(self, samples_1, samples_2, IDs_1, IDs_2,number_pairs):
        self.samples_1 = samples_1
        self.samples_2 = samples_2
        self.IDs_1 = IDs_1
        self.IDs_2 = IDs_2
        self.number_pairs = number_pairs

    def pairings(self, lst1,lst2):
        """Generate all possible pairings from two lists of samples.

        Returns two aligned arrays of length ``len(lst1) * len(lst2)``: every
        element of ``lst1`` repeated once per element of ``lst2``, and
        ``lst2`` cycled once per element of ``lst1``. Both arrays are empty
        when either input is empty.
        """
        left = np.array(list(lst1))
        right = np.array(list(lst2))
        return np.repeat(left, len(right)), np.tile(right, len(left))

    def sample2ID(self, samples, IDs):
        """Map each sample to its ID."""
        return {samples[i]:IDs[i] for i in range(len(IDs))}

    def assignID(self, sam2ID, lst):
        """Assign IDs to a list of samples given a dict mapping each sample to its ID."""
        return np.array([sam2ID[sample] for sample in lst])

    def pairs(self, sampL, sampR, L, R, n):
        """Draw up to ``n`` matches and up to ``n`` mismatches at random.

        ``sampL`` / ``sampR`` are aligned arrays of paired samples and ``L`` /
        ``R`` their IDs. A pair is a match when its two IDs are equal.

        Returns ``(left_samples, right_samples, left_IDs, right_IDs, labels)``
        in one common random order, where ``labels`` is 1.0 for matches and
        0.0 for mismatches. Fewer than ``n`` pairs of a kind are returned when
        fewer exist, in which case the result is not balanced.
        """
        pos = np.where(L==R)[0]
        po = np.random.permutation(pos)[:n]
        y_p = np.ones(po.shape[0],)

        neg = np.where(L!=R)[0]
        ne = np.random.permutation(neg)[:n]
        y_n = np.zeros(ne.shape[0],)

        # Stack the *sampled* mismatches (ne), not the full mismatch list:
        # indices and labels must have the same length and order.
        csegs = np.hstack([po, ne])
        cys = np.hstack([y_p, y_n])

        # One shared permutation keeps every pair aligned with its label.
        idx3 = np.random.permutation(cys.shape[0])
        new_idx = csegs[idx3]

        return sampL[new_idx], sampR[new_idx], L[new_idx], R[new_idx], cys[idx3]

    def Execution(self):
        """Build all cross pairings, look up their IDs and sample the pairs.

        Returns the five arrays produced by ``pairs``.
        """
        l1, l2 = self.pairings(self.samples_1, self.samples_2)
        sam1_2_IDs = self.sample2ID(self.samples_1, self.IDs_1)
        sam2_2_IDs = self.sample2ID(self.samples_2, self.IDs_2)
        l1_IDs = self.assignID(sam1_2_IDs, l1)
        l2_IDs = self.assignID(sam2_2_IDs, l2)
        L, R, IDL, IDR, Y = self.pairs(l1, l2, l1_IDs, l2_IDs, self.number_pairs)
        return L, R, IDL, IDR, Y
        

In [106]:
# Segment CSVs of the two collections to pair across
# (nights 1 and 2, going by the file names — TODO confirm).
p1 = "../githubdata/train_val_test_segment_data/n1_test.csv"
p2 = "../githubdata/train_val_test_segment_data/n2_test.csv"

In [107]:
# Load both segment tables.
df1 = pd.read_csv(p1)
df2 = pd.read_csv(p2)

In [108]:
# Segment names and ID labels of each table, as arrays.
sam1 = df1.Segment.values
sam2 = df2.Segment.values
# NOTE(review): ID1 / ID2 replace the values an earlier cell unpacked from n.sampling().
ID1 = df1.ID.values
ID2 = df2.ID.values
# Number of matches and of mismatches requested from Across_nights_pairs.
# NOTE(review): rebinds `n`, which an earlier cell used for the Nonmatches instance.
n = 5000 

In [109]:
# Pair generator across the two tables, asking for `n` pairs of each kind.
pairs = Across_nights_pairs(sam1,sam2,ID1,ID2,n)

In [110]:
# Execution() returns (left samples, right samples, left IDs, right IDs, labels).
samL, samR, IDs_L, IDs_R, Y = pairs.Execution()

In [111]:
# Column layout of the across-nights pair table.
# NOTE(review): reuses the name D from the match-table cell above.
D = {"sample_1": samL, "sample_2": samR, "ID_sample_1":IDs_L, "ID_sample_2":IDs_R, "label":Y}

In [112]:
# Across-nights pairs as a DataFrame.
s = pd.DataFrame(D)

In [113]:
# Display the across-nights pair table.
s

Unnamed: 0,sample_1,sample_2,ID_sample_1,ID_sample_2,label
0,RBB11_9,LPL31_106,RBB,LPL,0.0
1,RBB11_39,LPL31_362,RBB,LPL,0.0
2,RBB11_39,LPL31_391,RBB,LPL,0.0
3,RBB11_46,LPL31_45,RBB,LPL,0.0
4,LPL11_114,LPL31_215,LPL,LPL,1.0
...,...,...,...,...,...
9995,LPL11_211,LPL31_364,LPL,LPL,1.0
9996,LPL11_152,LPL31_173,LPL,LPL,1.0
9997,LPL11_27,LPL31_78,LPL,LPL,1.0
9998,LPL11_158,LPL31_67,LPL,LPL,1.0


In [105]:
#s.to_csv("pairs/n1_n2_test_pairs.csv", index = False)