In [10]:
from typing import Sequence, Callable
from collections import defaultdict
import operator
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [4]:
# Load the encoded function table; later cells read its `apn` and `nf` columns.
v = pd.read_csv('data/functions_encoded.csv')
# Load the encoded labels (not referenced again in the cells shown here).
labels = pd.read_csv('./data/labels_encoded.csv')

In [6]:
def mysample(v, sample_size, random_state=None):
    """Return the ``apn``/``nf`` rows of ``v`` for a random subset of apns.

    Parameters
    ----------
    v : pandas.DataFrame
        Table with at least the columns ``apn`` and ``nf``.
    sample_size : int
        Number of distinct ``apn`` values to draw (without replacement).
    random_state : optional
        Seed for a private ``random.Random`` generator, making the draw
        reproducible. When ``None`` (the default) the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        All rows of ``v`` whose ``apn`` was drawn, restricted to the columns
        ``apn`` and ``nf``.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``sample_size`` exceeds the
        number of distinct ``apn`` values (or is negative).
    """
    rng = random if random_state is None else random.Random(random_state)
    chosen = rng.sample(list(v.apn.unique()), k=sample_size)
    return v[v.apn.isin(chosen)][['apn', 'nf']]

In [7]:
# Rows (`apn`, `nf`) belonging to 100 randomly chosen apns.
smp = mysample(v, 100)
# For every sampled apn, the set of its `nf` values; later cells pass this to `adf` as the lookup table.
funcs_smp = smp.groupby(by='apn')['nf'].apply(set)

In [8]:
# Per-apn set of `nf` values built from the full table `v` (not from the sample).
funcs = v.groupby(by='apn')['nf'].apply(set)


def adf(apid1: int, apid2: int, funcs) -> float:
    """Distance between two apps based on their function sets.

    ``funcs`` maps an app id to a set; the result is the square root of the
    number of elements that occur in exactly one of the two sets.
    """
    first, second = funcs[apid1], funcs[apid2]
    only_in_one = first.symmetric_difference(second)
    return np.sqrt(len(only_in_one))

In [9]:
def create_net(gamma: float, apns: Sequence[int], distance=lambda x, y: adf(x, y, funcs_smp)) -> Sequence[int]:
    """Greedily select a net: a subset of ``apns`` whose members are pairwise
    more than ``gamma`` apart.

    The points are visited in random order (``np.random.shuffle``); a point is
    added to the net unless some already selected point lies within ``gamma``
    (``distance <= gamma``) of it.

    Parameters
    ----------
    gamma : float
        Distance threshold.
    apns : sequence of app ids
        Candidate points. The argument itself is left untouched: the shuffle
        is applied to a private copy, so arrays or lists owned by the caller
        keep their order and immutable sequences are accepted too.
    distance : callable ``(a, b) -> float``
        Defaults to ``adf`` evaluated on the module-level ``funcs_smp``.

    Returns
    -------
    list
        The selected net points, in the order in which they were picked.
    """
    order = list(apns)  # copy first: np.random.shuffle works in place
    np.random.shuffle(order)

    net = []
    for candidate in order:
        if not any(distance(candidate, center) <= gamma for center in net):
            net.append(candidate)
    return net

def calculate_distances(net, distance: Callable) -> Sequence[float]:
    """Return ``distance(a, b)`` for every ordered pair of unequal net points.

    Both orders of a pair are visited, so each unordered pair contributes two
    entries. Pairs whose elements compare equal are skipped.
    """
    return [
        distance(source, target)
        for source in net
        for target in net
        if not source == target
    ]



def create_aggregating_net(gamma: float, apns: Sequence[int], distance=lambda x, y: adf(x, y, funcs_smp)):
    """Build a net that also records which points each net point absorbed.

    Points are processed in the given order (no shuffling). A point within
    ``gamma`` (``distance <= gamma``) of an existing net point is appended to
    the aggregate list of the first such net point; otherwise it becomes a new
    net point with an empty aggregate list.

    Returns a ``defaultdict(list)`` mapping net point -> absorbed points.
    """
    aggregated = defaultdict(list)

    for candidate in apns:
        for center in aggregated:
            if distance(candidate, center) <= gamma:
                # the first matching center wins; later centers are not checked
                aggregated[center].append(candidate)
                break
        else:
            # no center close enough: the candidate opens a new cell
            aggregated[candidate] = []

    return aggregated

def app_k_nearest(k, apps, new_app, distance):
    """Return the first ``k`` entries of ``apps`` ordered by increasing
    ``distance(app, new_app)``; ties keep their original relative order."""
    ranked = list(apps)
    ranked.sort(key=lambda candidate: distance(candidate, new_app))
    return ranked[:k]

In [11]:
# Split the sampled apns into two halves (test_size=.5).
a1, a2 = train_test_split(smp.apn.unique(), test_size=.5)
# Sample rows belonging to each half.
s1 = smp[smp.apn.isin(a1)]
s2 = smp[smp.apn.isin(a2)]

In [12]:
# One aggregating net per half, with gamma=30 and the adf distance over the sampled function sets.
n1 = create_aggregating_net(gamma=30, apns=a1, distance=lambda x,y: adf(x,y, funcs_smp) )
n2 = create_aggregating_net(gamma=30, apns=a2, distance=lambda x,y: adf(x,y, funcs_smp) )

## Merging methods

In [18]:
# Takes the keys of both networks; when a key occurs in both, its aggregates are merged.
def naive_merge(n1, n2):
    """Merge two aggregating nets by taking the union of their keys.

    Every key of ``n1`` or ``n2`` becomes a key of the result (keys of ``n1``
    first, then the keys that only ``n2`` has). Its value is the list of
    aggregates recorded for that key in ``n1`` followed by those recorded in
    ``n2``, with duplicates removed and first-occurrence order kept.

    All returned lists are new objects, so changing the result never changes
    ``n1`` or ``n2``. Distances are not consulted, so keys of the result may
    lie closer to each other than the gamma used to build the inputs.
    """
    collected = {}
    for source in (n1, n2):
        for key, members in source.items():
            collected.setdefault(key, []).extend(members)
    # dict.fromkeys removes duplicates while keeping insertion order
    return {key: list(dict.fromkeys(members)) for key, members in collected.items()}

# Keeps only keys that are at least gamma apart; a key of n2 that is closer than gamma to an
# already kept key is folded, together with its aggregates, into that key.
def key_based_merg(n1, n2, distance, gamma):
    """Merge ``n2`` into a copy of ``n1`` key by key.

    The result starts with all keys of ``n1``. Each key of ``n2`` is compared
    with the keys kept so far, in insertion order:

    * if one of them is closer than ``gamma`` (``distance < gamma``), the
      ``n2`` key is absorbed by the first such key: the key itself (unless it
      is that very key) and its aggregates are appended to the keeper's list,
      so no point of ``n2`` is lost;
    * otherwise the ``n2`` key is added as a new key with a copy of its
      aggregates.

    ``n1`` and ``n2`` are not modified: every list in the result is a fresh
    copy. Absorbed aggregates are not re-checked against the keeper, so they
    may be farther than ``gamma`` from it.

    Returns a plain ``dict`` mapping key -> list of aggregated points.
    """
    merged = {key: list(members) for key, members in n1.items()}

    for key, members in n2.items():
        for center in merged:
            if distance(key, center) < gamma:
                if key != center:
                    # keep the demoted key as an ordinary aggregated point
                    merged[center].append(key)
                merged[center].extend(members)
                break
        else:
            merged.setdefault(key, []).extend(members)

    return merged

# Builds a net over the keys of both networks and uses its keys as the keys of the merged network
# (similar to the key-based merge).
def net_based_merge(n1, n2, distance, gamma):
    """Merge two aggregating nets by building a net over their keys.

    ``create_aggregating_net`` is run on the keys of ``n1`` followed by the
    keys of ``n2``. Each key of that key-level net becomes a key of the result
    and collects:

    * every key it absorbed (so demoted keys stay in the result as aggregated
      points) together with the aggregates that key had in ``n1`` / ``n2``;
    * its own aggregates from ``n1`` / ``n2``.

    A key that occurs in both inputs is passed to the key-level net only once,
    so its aggregates are not added twice. Aggregates are not re-checked
    against their new key and may be farther than ``gamma`` from it.

    Returns a ``defaultdict(list)``; ``n1`` and ``n2`` are not modified.
    """
    # de-duplicate shared keys while keeping first-seen order
    all_keys = list(dict.fromkeys([*n1.keys(), *n2.keys()]))
    key_net = create_aggregating_net(gamma=gamma,
                                     apns=all_keys,
                                     distance=distance)

    sources = (n1, n2)
    merged = defaultdict(list)
    for center, absorbed in key_net.items():
        members = merged[center]  # creates the entry even when nothing is added
        for old_key in absorbed:
            members.append(old_key)
            for source in sources:
                if old_key in source:
                    members.extend(source[old_key])
        for source in sources:
            if center in source:
                members.extend(source[center])

    return merged

## Merge Quality Measurements

In [17]:
# Checks that all pairwise key distances are >= gamma (should be the case).
def are_keys_correct(net, distance, gamma):
    """Return ``False`` as soon as two distinct keys of ``net`` are closer
    than ``gamma`` to each other, ``True`` otherwise.

    Each unordered pair of keys is examined once, in key order.
    """
    keys = list(net.keys())
    return not any(
        distance(first, second) < gamma
        for position, first in enumerate(keys)
        for second in keys[position + 1:]
    )

# Checks that every aggregated point is at most gamma away from its key (should be the case).
def are_distances_correct(net, gamma, distance):
    """Return ``True`` when no aggregated point is farther than ``gamma``
    from the key it is stored under, ``False`` otherwise.

    The scan stops at the first violation, and each distance is evaluated only
    once.

    Note that the positional order here is ``(net, gamma, distance)``, which
    differs from ``are_keys_correct(net, distance, gamma)``; callers in this
    notebook pass both by keyword.
    """
    return not any(
        distance(member, key) > gamma
        for key, members in net.items()
        for member in members
    )

def get_elements(net):
    """Flatten a net into one list: for every key, first its aggregated
    points and then the key itself, in the net's iteration order."""
    flattened = []
    for center, members in net.items():
        flattened.extend(members)
        flattened.append(center)
    return flattened

## Examples

In [23]:
# Distance used in the examples below: adf evaluated on the sampled per-apn function sets.
distance = lambda x,y: adf(x,y, funcs_smp)

In [33]:
# Merge the two nets with the naive (key-union) strategy.
n_merged = naive_merge(n1, n2)

In [26]:
# Distances between keys might be too small.
# Distances within a key will be OK (as they stem from a correct network).
are_keys_correct(n_merged, distance=distance, gamma=30), are_distances_correct(n_merged, distance=distance, gamma=30)

(False, True)

In [27]:
# Merge the two nets with the key-based strategy (gamma=30).
k_merged = key_based_merg(n1, n2, distance, 30)

In [30]:
# Keys will be at proper distances from each other.
# Within one key there might be some aggregates that are too far away.
are_keys_correct(k_merged, distance=distance, gamma=30), are_distances_correct(k_merged, distance=distance, gamma=30)

(True, True)

In [32]:
# Merge the two nets with the net-based strategy (gamma=30).
net_merged = net_based_merge(n1, n2, distance, 30)

In [36]:
# Keys will be correct (they form a correct network).
# Aggregates might not be correct.
are_keys_correct(net_merged, distance=distance, gamma=30), are_distances_correct(net_merged, distance=distance, gamma=30)

(True, True)

## Loop

In [38]:
from tqdm.notebook import tqdm

In [39]:
def get_stats(net, distance, gamma):
    """Summarise a (merged) net as a dict.

    Keys of the returned dict:
      'kc'    -- result of are_keys_correct for this net
      'dc'    -- result of are_distances_correct for this net
      'kl'    -- number of keys in the net
      'gamma' -- the gamma that was checked against
    """
    return {
        'kc': are_keys_correct(net, distance=distance, gamma=gamma),
        'dc': are_distances_correct(net, distance=distance, gamma=gamma),
        'kl': len(net.keys()),
        'gamma': gamma,
    }

In [40]:
# One list of per-run statistics (dicts produced by get_stats) for each merge strategy.
naive_stats = list()
k_stats = list()
net_stats = list()

gamma = 30
# adf distance evaluated on the sampled per-apn function sets
distance = lambda x,y: adf(x,y, funcs_smp)

# Repeat the experiment on 10 fresh 50/50 splits of the sampled apns.
# NOTE(review): no random_state is passed to train_test_split, so the splits
# presumably differ between executions -- confirm whether a seed is wanted.
for run in tqdm(range(10)):
    a1, a2 = train_test_split(smp.apn.unique(), test_size=.5)

    # build one aggregating net per half
    n1 = create_aggregating_net(gamma=gamma, apns=a1, distance=lambda x,y: adf(x,y, funcs_smp) )
    n2 = create_aggregating_net(gamma=gamma, apns=a2, distance=lambda x,y: adf(x,y, funcs_smp) )

    # merge the same pair of nets with each strategy and record its statistics
    n_merged = naive_merge(n1, n2)
    naive_stats.append(get_stats(n_merged, distance=distance, gamma=gamma))
    
    k_merged = key_based_merg(n1, n2, distance, gamma)
    k_stats.append(get_stats(k_merged, distance=distance, gamma=gamma))
    
    net_merged = net_based_merge(n1, n2, distance, gamma)
    net_stats.append(get_stats(net_merged, distance=distance, gamma=gamma))
    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [42]:
# Per-run statistics of the net-based merge (one row per run).
pd.DataFrame(net_stats)

Unnamed: 0,kc,dc,kl,gamma
0,True,True,49,30
1,True,True,51,30
2,True,False,53,30
3,True,False,52,30
4,True,True,49,30
5,True,True,49,30
6,True,True,49,30
7,True,False,50,30
8,True,True,49,30
9,True,True,49,30


In [43]:
# Per-run statistics of the key-based merge (one row per run).
pd.DataFrame(k_stats)

Unnamed: 0,kc,dc,kl,gamma
0,True,True,49,30
1,True,True,51,30
2,True,False,53,30
3,True,False,52,30
4,True,True,49,30
5,True,True,49,30
6,True,True,49,30
7,True,False,50,30
8,True,True,49,30
9,True,True,49,30
