In [28]:
import numpy as np
import networkx as nx 
from collections import defaultdict
import time
import statistics
import pprint
# from pykeops.numpy import LazyTensor as LazyTensor_np

def dist(x,y):
    return np.sum((x-y)**2)

class graph:
    def __init__(self):
        self.graph=defaultdict(set)
        self.ddict=defaultdict(float)
        self.explored_edges=defaultdict(set)
    def add_key(self,a,x):
        self.keys[a].add(x)
    def add_edge(self,a,b):
        self.graph[str(a)].add(b)
        self.graph[str(b)].add(a)
    def replace_edge_list(self,a,x):
        self.graph[str(a)]=set(x)
    def del_edge(self,a,b):
        self.graph[str(a)].remove(b)
        self.graph[str(b)].remove(a)
    def edge_list(self):
        edge=set()
        for key in set(self.graph.keys()):
            tmp=set([(int(key),i) for i in self.graph[key]])
            edge=edge.union(tmp)
        return edge
    def get_edge(self,a):
        return self.graph[str(a)]
    def visualize(self): 
        G = nx.Graph() 
        G.add_edges_from(self.edge_list()) 
        nx.draw_networkx(G) 
        plt.show() 

class tree:
    def __init__(self,x):
        self.min_size=5
        self.leaves=[]
        self.sizes=[]
#         self.min_size=x.shape[0]/10
        self.tree=self.make_tree(x)

    def make_tree(self,x):
        node=dict()
        node['left']=[]
        node['right']=[]
        node['elements']=[]
        node['isleaf']=False
        if x.shape[0] > self.min_size:
            v,threshold=self.choose_rule(x)
            distances=np.dot(x,v) # create array of distances
            left_bool=distances<=threshold # create boolean array where entries are true if distance <= threshold
            right_bool=np.invert(left_bool)
#             left_bool=np.empty(0,dtype=bool)
#             for i in range(len(x)):
#                 if np.dot(x[i],v) <= threshold:
#                     left_bool=np.append(left_bool,True)
#                 else:
#                     left_bool=np.append(left_bool,False)
#             right_bool=np.invert(left_bool)
            node['left']=self.make_tree(x[left_bool,:])
            node['right']=self.make_tree(x[right_bool,:])
#         else:
        elif x.shape[0] != 0:
            node['elements']=x.copy()
            node['isleaf']=True
            self.leaves.append(x)
            self.sizes.append(len(x))
        return node

    def choose_rule(self,x):
        dim=x.shape[1]
        v=np.random.random(dim)
        v/=np.linalg.norm(v) # find random unit vector
        
        index=np.random.randint(x.shape[0]) # select random point in set x
        
        # find distance from this random point to furthest point in set x
        x_1=x[index]
#         x_1=x[:,None,:]
#         x_2=x[None,:,:]
        distances=((x_1-x)**2).sum(-1)
        max_distance=np.max(distances)
#         max_distance=0
#         d_list=[]
#         for i in range(x.shape[0]):
#             distance=dist(x[i],x[index])
#             d_list.append(distance)
#             if distance > max_distance:
#                 max_distance=distance
#                 y=x[i]
        max=6*np.sqrt(max_distance)/np.sqrt(dim)
        d=np.random.uniform(-max,max)
        median=statistics.median(distances)
        threshold=median+d
        return v, threshold
    
class forest:
    def __init__(self,x,trees=10):
        self.forest=[]
        self.tree_num=trees
        self.make_trees()
        
    def make_trees(self):
        for i in range(self.tree_num):
            t=tree(x)
            self.forest.append(t)
    
    def find_leaf(self,x): # returns KNN based on all trees.
        leaf_union=np.array([[]])
        for tree in self.forest:
            for leaf in tree.leaves:
                if x in leaf:
                    leaf_union=np.append(leaf_union,leaf,axis=0)
                    break
        distances=((leaf_union-x)**2).sum(-1)
        indices=np.argmin(distances,axis=1)
        indices=indices[:5]
        return leaf_union[indices]

#construct NN graph
def construct_graph(x,k=3,count=3,init=3,RP=False):
    def combi(x):
        combi=set()
        for i in x:
            for j in x:
                if (j,i) not in combi and i!=j:
                    combi.add((i,j))
        return combi  
    g=graph()
    l=len(x)
    if RP == False:
        for i in range(len(x)):
            for r in range(init):
                while True:
                    j=int(l * np.random.random())
                    if j!=i:
                        break
                g.add_edge(i,j)
                d=dist(x[i],x[j])
                g.ddict[(i,j)]=d
                g.ddict[(j,i)]=d
                g.explored_edges[i].add(j)
                g.explored_edges[j].add(i)  
    else: # use RP tree for graph initialisation
        t=tree(x)
        i_index=0
        j_index=0
        for leaf in t.leaves:
            if len(leaf)!=1:
                for i in leaf:
                    for j in leaf:
                        if i.all()!=j.all():
                            g.add_edge(i_index,j_index)
                            d=dist(i,j)
                            g.ddict[(i_index,j_index)]=d
                            g.ddict[(j_index,i_index)]=d
                            g.explored_edges[i_index].add(j_index)
                            g.explored_edges[j_index].add(i_index)
                        j_index+=1
                    i_index+=1

  #start update here
    for i in range(count):
        neighbours_set=set()

        for index in range(l):
            el=g.get_edge(index)
            neighbours=set([j for j in combi(el) if j[1] not in g.get_edge(j[0])])
            neighbours_set=neighbours_set.union(neighbours)   

        if neighbours_set==set():      
            break

        for pair in list(neighbours_set):
            d=dist(x[pair[0]],x[pair[1]])
            g.ddict[pair]=d
            g.ddict[(pair[1],pair[0])]=d
            g.explored_edges[pair[0]].add(pair[1])
            g.explored_edges[pair[1]].add(pair[0])

    
    #recalculate all neighbours
    for index in range(l):
        nodes_update=list(g.explored_edges[index])
      
        dist_nodes=[g.ddict[(i,index)] for i in nodes_update] #the distances of these pairs
        d,final_nodes=[list(j) for j in list(zip(*sorted(zip(dist_nodes,nodes_update))[:k]))] #sort these pairs and take the top k pairs

        final_nodes=set(final_nodes)
        g.replace_edge_list(index,final_nodes)       
    
    return g   

def argmin(x,y,g,k=5):
    indices=[]
    d=[]
    visited=set()
    def add_index(i):
        indices.append(i)
        d.append(dist(y,x[i]))
        add_index(np.random.randint(len(x)))
    while True:
        i=None
        for index in indices:
            #select this node for expansion
            if index not in visited:
                #print(index)
                i=index
                break
        if i is None:
            return indices
        for new_node in g.get_edge(i):
            if new_node not in indices:
                add_index(new_node)    
        visited.add(i)
        d,indices=[list(i) for i in list(zip(*sorted(zip(d,indices))[:k]))]  

In [29]:
#testing large dataset
size=1000
x = np.random.randn(size, 2)
y = np.random.randn(size, 2)

def get_k_argmin(x,y,k=3):
    argmin=np.zeros([len(x),k])
    for i in range(len(y)):
        d=[dist(j,y[i]) for j in x]
        argmin[i]=np.argsort(d)[:k]
    return np.squeeze(argmin)

In [38]:
# x=np.array([[7,8],[9,10],[11,12],[1,2],[3,4],[5,6]])
t=tree(x)
print(t.leaves)
print(t.sizes)
# print(sum(t.sizes)/len(t.sizes))
# print(len(t.sizes))

[array([[1, 2],
       [3, 4]]), array([[ 7,  8],
       [ 9, 10],
       [11, 12],
       [ 5,  6]])]
[2, 4]


In [82]:
#test nndescent
nndescent=[]
start=time.time()
g=construct_graph(x,8,init=8,count=3)
print('construct NN graph time taken:',time.time()-start)
start=time.time()
for i in y:
    nndescent.append(argmin(x,i,g,3)[0])
print('NN descent search time taken',time.time()-start)
nndescent=np.squeeze(np.array(nndescent))

construct NN graph time taken: 21.847113609313965


IndexError: list index out of range

In [83]:
#test nndescent with random projection tree
nndescent=[]
start=time.time()
g=construct_graph(x,8,init=8,count=3,RP=True)
print('construct NN graph time taken:',time.time()-start)
start=time.time()
for i in y:
    nndescent.append(argmin(x,i,g,3)[0])
print('NN descent search time taken',time.time()-start)
nndescent=np.squeeze(np.array(nndescent))

ValueError: not enough values to unpack (expected 2, got 0)

In [None]:
#test argmin and argsort
npargmin=[]
npargsort=[]
start=time.time()
for j in y:
    d=[dist(i,j) for i in x]
    npargmin.append(np.argmin(d))

print('argmin search time taken',time.time()-start)
npargmin=np.array(npargmin) 
start=time.time()
for j in y:
    d=[dist(i,j) for i in x]
    npargsort.append(np.argsort(d)[0])

print('argsort search time taken',time.time()-start)
npargsort=np.array(npargsort) 

In [19]:
x=np.array([[1,2,3,4],[5,6,7,8],[9,10,11,12]])
print(x)
x=np.append(x,[[13,14,15,16]],axis=0)
print(x)
print(x.shape[0])
print(np.random.randint((10000)))
print(len(x))
y=np.empty(0)
print(y)
y=np.append(y,[1])
print(y)

left_bool = (x[:,3] < 10)
print(left_bool)
print(type(left_bool))
print(x[left_bool,:])

test=np.empty(0,dtype=bool)
print(test)

b=[[1,2,3],[4,5,6],[7,8,9]]
a={}

pprint.pprint(a)
print(a)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]
4
7037
4
[]
[1.]
[ True  True False False]
<class 'numpy.ndarray'>
[[1 2 3 4]
 [5 6 7 8]]
[]
{'1': {'2': array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]]),
       '4': {'5'}},
 '2': {'2': {3}, '4': {'5'}}}
{'1': {'2': array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]]), '4': {'5'}}, '2': {'2': {3}, '4': {'5'}}}


In [11]:
import numpy as np

size=2
x = np.random.randn(size, 2)
y = np.random.randn(size, 2)

print(x)
print(y)

index = 0
x_1=x[index]
print(x_1)

x_1=x[:,None,:]
x_2=x[None,:,:]
distances=((x_1-x_2)**2).sum(-1)
print(distances)
max_distance=np.max(distances)
print(max_distance)

[[-0.17928501  0.09548515]
 [-0.65578304  0.86442423]]
[[-0.16278965  2.30711348]
 [ 1.99754948 -2.91815543]]
[-0.17928501  0.09548515]
[[0.         0.81831768]
 [0.81831768 0.        ]]
0.8183176812288301


In [60]:
a=np.array(([1,2]))
b=np.array(([[3,4],[5,6],[7,8]]))
print(b.shape)
c=np.dot(b,a)
print(c)
d=c>1
print(d)
distances=((a-b)**2).sum(-1)
print(distances)

print([1,2] in a)
# e=np.append(b,a,axis=0)
e=np.append([[3,4],[5,6]],[[1,2],[7,8],[9,10]],axis=0)
distances=((a-e)**2).sum(-1)
print(distances)
indices=np.argmin(distances,axis=0)
print(indices)
print(distances[indices[:5]])

(3, 2)
[[ 3  8]
 [ 5 12]
 [ 7 16]]
[[ True  True]
 [ True  True]
 [ True  True]]
[ 8 32 72]
True
[  8  32   0  72 128]
2


IndexError: invalid index to scalar variable.

In [222]:
import torch

a=torch.tensor(([1,2]))
b=torch.tensor(([[3,4],[5,6],[7,8]]))
print(b.shape)
c=(b*a).sum(-1)
print(c)
d=c>8
print(d)
f=(d*(-1)).type(torch.bool)
print(f,type(f))
distances=((a-b)**2).sum(-1)
print(distances)

print(torch.tensor([1,2]) in a)
# e=np.append(b,a,axis=0)
e=torch.cat((torch.tensor([]),torch.tensor([[1,2],[7,8],[9,10],[11,12],[3,4]])),0)
e=e[e!=torch.tensor([1,2])]
e=torch.reshape(e,(-1,2))
print('e',e)
print('sliced',e[(1,2),:])
print('shape',a.shape[0])
distances=((a-e)**2).sum(-1)
print('distances',distances)
indices=distances.topk(k=3, largest=False).values
print('indices',indices)
# print(torch.index_select(distances, 0, indices))

torch.Size([3, 2])
tensor([11, 17, 23])
tensor([True, True, True])
tensor([True, True, True]) <class 'torch.Tensor'>
tensor([ 8, 32, 72])
True
e tensor([[ 7.,  8.],
        [ 9., 10.],
        [11., 12.],
        [ 3.,  4.]])
sliced tensor([[ 9., 10.],
        [11., 12.]])
shape 2
distances tensor([ 72., 128., 200.,   8.])
indices tensor([  8.,  72., 128.])


In [117]:
max = 5
print((torch.rand() * 2 - 1) * max)

tensor([[ 2.9288,  3.0501, -0.3409,  1.9870,  3.1102],
        [ 0.1568,  1.2949, -0.8454,  0.9983, -2.6546],
        [ 2.5131, -0.6652, -0.2224,  2.8916,  2.3770],
        [ 4.5459, -2.5440,  1.1498,  1.6462, -3.0504],
        [-0.4603, -4.6296,  0.0622,  0.0927,  2.0834]])


In [150]:
x = torch.randn(3, 4)
print("Original Matrix:\n",x)
indices = torch.tensor([0, 2, 1])
print("Indexed Matrix:\n",torch.index_select(x, 0, indices))

Original Matrix:
 tensor([[-1.0561,  0.5606,  1.1953,  0.3313],
        [ 0.4535,  0.6016, -0.1120,  1.0139],
        [-0.1144, -0.7559,  0.4429,  1.1954]])
Indexed Matrix:
 tensor([[-1.0561,  0.5606,  1.1953,  0.3313],
        [-0.1144, -0.7559,  0.4429,  1.1954],
        [ 0.4535,  0.6016, -0.1120,  1.0139]])


In [206]:
a=torch.randint(0,10,(1,2)).type(torch.float)
print(a)
a=a/torch.norm(a,2)
print(a)

x = torch.tensor([[1,2,3],[4,5,6]])
print(x.shape[0])
index = torch.randint(0,x.shape[0]+1,(1,)) # select random point in set x
print(index)

tensor([[4., 0.]])
tensor([[1., 0.]])
2
tensor([2])


In [211]:
print(torch.FloatTensor(1,).uniform_(-100, 100))

tensor([-65.6179, -31.7254, -25.6140, -94.6345, -93.0680, -56.7784, -36.0555,
        -48.5385,  25.4823, -15.4595])


In [229]:
a = torch.tensor((1,2,3,4))
b = torch.tensor((4,5,6))
c = torch.cat((a,b),0)
c = torch.unique(c).type(torch.LongTensor)
print(c,c.type())

tensor([1, 2, 3, 4, 5, 6]) torch.LongTensor


In [None]:
# Trees - SUPER inefficient

class tree:
  def __init__(self, x, k = 5):
    '''
    x - dataset (torch.tensor)
    k - no. of nearest neighbours = max no. of elements in leaf
    Creates a tree with elements of dataset x split into leaves in self.leaves
    All elements within the same leaf node are KNN of each other
    (e.g. self.leaves = [[x1,x2,x3],[x4,x5,x6]] -> [x1,x2,x3] is a leaf, x1, x2 and x3 are KNN)
    '''
    self.max_size = k
    self.leaves = []
    self.sizes = []
#    self.max_size=x.shape[0]/10
    self.tree = self.make_tree(x)

  def make_tree(self,x):
    '''
    recursively splits inputs dataset x into subsets.
    returns leaf and places elements into self.leaves, when no. of elements in input < max_size
    x - dataset for splitting into leaf nodes
    '''
    node = dict()
    node['left'] = []
    node['right'] = []
    node['elements'] = []
    node['isleaf'] = False
    if x.shape[0] > self.max_size: # if input size is larger than max leaf size, do the recursion
      v,threshold = self.choose_rule(x)
      distances = (x * v).sum(-1) # create array of dot product each datapoint and random vector v
#       print('distances',distances,'threshold',threshold)
      left_bool = distances <= threshold # create boolean array where entries are true if distance <= threshold
      right_bool = ~left_bool
#       print(left_bool,right_bool)
      node['left'] = self.make_tree(x[left_bool,:])
      node['right'] = self.make_tree(x[right_bool,:])
#     else:
    elif x.shape[0] != 0: # if 0 < input size < max leaf size, place all input elements into self.leaves
      node['elements'] = x.detach().clone() 
      node['isleaf'] = True
      self.leaves.append(x)
      self.sizes.append(len(x))
    return node

  def choose_rule(self,x):
    dim = x.shape[1]
    v = torch.rand(dim) # create random vector
    v = v / torch.norm(v,2) # normalize vector

    index = torch.randint(0,x.shape[0],(1,)) # select random point in set x
    
    # find distance from this random point to furthest point in set x
    x_1 = x[index]
#         x_1=x[:,None,:]
#         x_2=x[None,:,:]
    distances = ((x_1 - x) ** 2).sum(-1)
    distances = torch.sqrt(distances)
    max_distance = torch.max(distances)
    
#     max_range = max_distance
    max_range = 6 * max_distance / math.sqrt(dim)
    d = torch.FloatTensor(1,).uniform_(-max_range, max_range) # select d uniformly within range
#     print('distances',distances,'max_distance',max_distance,'d',d)
#     print('d',d,'max',max_range)
    median = statistics.median(distances)
    threshold = median + d
#     print('median',median,'threshold',threshold)
    return v, threshold
    
class forest:
  def __init__(self, x, trees = 10, k = 5):
    '''
    creates forest class that contains trees within self.forest
    x - input dataset
    trees - number of trees in forest
    k - number of nearest neighbours = max size of leaf in each tree
    '''
    self.forest = []
    self.tree_num = trees
    self.k = k
    self.make_trees(x)

  def make_trees(self, x):
    '''
    creates tree_num trees and appends trees to self.forest
    '''    
    for i in range(self.tree_num):
      t = tree(x, self.k)
      self.forest.append(t)

  def find_leaf(self, query):
    '''
    returns KNN based on all trees
    step 1: find which leaf the query point sits in, for each tree in the forest
    step 2: combine all elements from these leaves
    step 3: find the k nearest neighbours out of this combined leaves set
    
    query - datapoint for which we want to find the KNN
    note: query must belong in the dataset that the trees were made from
    '''
#     dim = query.shape[0]
#     leaf_union = set()
#     for tree in self.forest:
#       for leaf in tree.leaves:
#         if query in leaf:
#           for element in leaf:
#             # convert tensor to list, then to tuple, to be recognised by set. set used to remove duplicates
#             leaf_union.add(tuple(element.tolist()))
#           break
#     leaf_union.remove(tuple(query.tolist())) # remove query from set
#     leaf_union = list(leaf_union)
#     leaf_union = torch.tensor(leaf_union) # convert tuple to list, then to tensor
#     print('leaf_union',leaf_union)
#     distances = ((leaf_union - query) ** 2).sum(-1)
#     print('distances',distances,distances.shape[0])
#     indices = distances.topk(k = self.k, largest = False).indices # find k nearest neighbours
#     print(leaf_union[indices,:])
#     return leaf_union[indices,:]

    dim = query.shape[0]
    leaf_union = torch.tensor(())
    for tree in self.forest:
      for leaf in tree.leaves:
        if query in leaf:
          leaf_union = torch.cat((leaf_union, leaf), 0)
          break
    leaf_union = torch.unique(leaf_union, dim = 0)
    leaf_union = leaf_union.tolist()
    leaf_union.remove(query.tolist()) # convert to list to remove query itself
    leaf_union = torch.tensor(leaf_union)
    print('leaf_union',leaf_union)
    print('size',leaf_union.shape[0])
#     leaf_union = leaf_union[leaf_union != query] # remove query point from the leaf union
#     print('leaf_union',leaf_union)
#     leaf_union = torch.reshape(leaf_union,(-1,dim)) # reshape into dimension
    distances = ((leaf_union - query) ** 2).sum(-1)
#     distances = distances[distances != 0] # remove zeros
    print('distances',distances,distances.shape[0])
    indices = distances.topk(k = self.k, largest = False).indices # find k nearest neighbours
    print(leaf_union[indices,:])
#     leaf_union = torch.index_select(leaf_union, 1, indices) # select k nearest neighbours via index
    return leaf_union[indices,:]


In [None]:
data = torch.Tensor([[1.0,1.0], [2.0,1.0], [3.0,1.0], [4.0,1.0],
                     [1.0,2.0], [2.0,2.0], [3.0,2.0], [4.0,2.0]])  
data = torch.randn(size=[10000,4])
# print(data)  

# t=tree(data)
# # print('leaves',t.leaves)
# print('sizes',t.sizes)

# f=forest(data,k=3,trees=10)
# for tree in f.forest:
# #   print(tree.leaves)
#   print(tree.sizes)

# f.find_leaf(torch.tensor([1.0,2.0]))