In [1]:
import csv
import heapq as hq
import os
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf

import script_config as sc

data_folder      = sc._config_data_folder
hops             = sc._config_hops
max_list_size    = sc._config_relation_list_size_neighborhood

In [2]:
users     = pd.read_csv(data_folder+"filtered_users.csv",delimiter=',').values 
relations = pd.read_csv(data_folder+"filtered_relations.csv",delimiter=',').values 

In [3]:
# BFS search to find neighborhood of radius "hops"
def find_neighborhood(user):
    
    # Looking for elements in numpy array (better than lists)
    def element_in_narray(narray,row): 
        count = np.where(narray==row)
        if len(count[0]) is 0: 
            return False
        else:
            return True
        
    # Function Global Variables
    current_id         = "{:07d}".format(user[0])
    current_hops       = 0
    relations_idx      = 0 # Index of next free spot in retained_relations
    neighbors_idx      = 0 # Index of next free spot in visited_neighbors
    
    queue_head          = 0 # Index of queue head 
    queue_tail          = 1 # Index of next free spot in queue 
    
    
    # Data Structures     
    queue              = np.ndarray(max_list_size,dtype='i4')
    visited_neighbors  = np.ndarray(max_list_size,dtype='i4')
    retained_relations = np.ndarray(max_list_size,dtype='i4, i4, i4, i4, i4')
    queue[0]           = int(current_id)

    # Loop until queue is empty 
    while( queue_head != queue_tail ):
        
        current_id = "{:07d}".format(queue[queue_head])
        queue_head += 1 
        
        # Treat incoming edges and outgoing edges equally
        relations_1  = relations[np.where(relations[:,2] == int(current_id))]
        relations_2  = relations[np.where(relations[:,3] == int(current_id))]
        neigh_ids    = np.union1d(relations_1[:,3],relations_2[:,2])
        
        # Cutoff Condition
        if current_hops + 1 <= hops: 
            
            for neigh_id in neigh_ids: 
                
                # Check that node has not been visited 
                #    and has not been marked to be visited
                if ( not element_in_narray(visited_neighbors[:neighbors_idx],int(neigh_id)))\
                   and ( not element_in_narray(queue[queue_head:queue_tail],int(neigh_id))): 
                    
                    if queue_tail == max_list_size: 
                        raise MemoryError("Increase _config_list_size_neighborhood_creation \
                           from config.py")
                        
                    # Mark node to be visited 
                    queue[queue_tail] = int(neigh_id)
                    queue_tail += 1
               

            
            for relation_set in [relations_1,relations_2] :
                for relation in relation_set:
                    
                    # Memory Checking
                    if relations_idx == max_list_size:
                        raise MemoryError("Increase _config_list_size_neighborhood_creation \
                           from config.py")
                        
                    relation_tuple = (int(relation[0]),int(relation[1]),int(relation[2]),
                                          int(relation[3]),int(relation[4]))
                    
                    # Only add relations with visited neighbors (directionally agnostic)
                    if  element_in_narray(visited_neighbors[:neighbors_idx],relation_tuple[2]) or \
                        element_in_narray(visited_neighbors[:neighbors_idx],relation_tuple[3]): 
                    
                        # Retain Relation if not already done 
                        if not element_in_narray(retained_relations[:relations_idx],\
                                                 np.array([relation_tuple],dtype='i4,i4,i4,i4,i4')):
                            retained_relations[relations_idx] = relation_tuple
                            relations_idx += 1
        
        # Memory Checking
        if neighbors_idx == max_list_size:
            raise MemoryError("Increase _config_list_size_neighborhood_creation \
                           from config.py")
        
        # Mark node as visited 
        visited_neighbors[neighbors_idx] = int(current_id)
        neighbors_idx += 1
        
    return visited_neighbors[:neighbors_idx], retained_relations[:relations_idx]

In [4]:
!mkdir /nvme/drive_1/NTDS_Final/local_list/
!rm -r /nvme/drive_1/NTDS_Final/local_list/filtered
!mkdir /nvme/drive_1/NTDS_Final/local_list/filtered

mkdir: cannot create directory ‘/nvme/drive_1/NTDS_Final/local_list/’: File exists


In [5]:
i        = 0 
max_size = 0
for user in users: 
    neighs, rels = find_neighborhood(user)
    np.savez_compressed(data_folder+"local_list/filtered/"+str(user[0]),\
                        local_neighbors=neighs,local_relations=rels)
    i+=1
    
    if len(neighs)>max_size: 
        max_size = len(neighs)
    print("\r"+str(i) +" neighborhoods processed / Max Size: "+str(max_size), sep=' ', end='', flush=True)

2879 neighborhoods processed / Max Size: 62173

KeyboardInterrupt: 