In [2]:
from pygraphblas import *
#from _pygraphblas import lib
import pygraphblas.descriptor
import csv
import sys
import logging
import glob
import operator
import sys
sys.path.append("..")
from loader.data_loader import DataLoader
from algorithms.search import naive_bfs_levels



In [3]:
# Setup logger
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))
log = logging.getLogger(__name__)
log.propagate = False
log.addHandler(handler)
log.setLevel(logging.INFO)

### Load data

In [4]:
data_dir = '../../csvs/o1k/'
data_format = 'csv'
loader = DataLoader(data_dir, data_format)

#vertices, mappings, matrices = loader.load_all_csvs()

person = loader.load_vertex('person')
comment = loader.load_vertex('comment')

replyOf = loader.load_edge('replyOf', comment, comment)
knows = loader.load_edge('knows', person, person)
hasCreator = loader.load_edge('hasCreator', comment, person)



2020-06-08 00:09:50,076 INFO  Loading ../../csvs/o1k/person.csv with headers: ['id:ID(Person)', 'firstName:STRING', 'lastName:STRING', 'gender:STRING', 'birthday:DATE', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING']
2020-06-08 00:09:50,089 INFO  Loading ../../csvs/o1k/comment.csv with headers: ['id:ID(Comment)', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING', 'content:STRING']
2020-06-08 00:09:53,121 INFO  Loading ../../csvs/o1k/comment_replyOf_comment.csv with headers: [':START_ID(Comment)', ':END_ID(Comment)']
2020-06-08 00:09:54,779 INFO  Loading ../../csvs/o1k/person_knows_person.csv with headers: [':START_ID(Person)', ':END_ID(Person)']
2020-06-08 00:09:54,912 INFO  Loading ../../csvs/o1k/comment_hasCreator_person.csv with headers: [':START_ID(Comment)', ':END_ID(Person)']


### Queries

In [5]:
# Query 1
def shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, num_of_interactions, person1_id, person2_id):

    person1_id_remapped = person.index2id[person1_id]
    person2_id_remapped = person.index2id[person2_id] 

    hasCreatorTransposed = hasCreator.transpose()

    personA_to_comment2 = hasCreatorTransposed @ replyOf
    
    person_to_person = personA_to_comment2.mxm(hasCreator, mask=knows)
    
    person_to_person_filtered = person_to_person.select(lib.GxB_GT_THUNK, num_of_interactions)
    
    overlay_graph = person_to_person_filtered.pattern()
    if num_of_interactions == -1:
        overlay_graph = knows
        
    levels = naive_bfs_levels(overlay_graph, person1_id_remapped)
    
    
    result = levels[person2_id_remapped] - 1 # Get hop count
    
    return result
    

In [20]:
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, 1, 786, 799)
print(f'RESULT: {x}', x==4)
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, 1, 422, 736)
print(f'RESULT: {x}', x==-1)
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, 1, 858, 587)
print(f'RESULT: {x}', x==4)
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, -1, 266, 106)
print(f'RESULT: {x}', x==3)
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, 0, 58, 402)
print(f'RESULT: {x}', x==3)
x = shortest_distance_over_frequent_communication_paths(person, replyOf, knows, hasCreator, 105, 608, 3) 
print(f'RESULT: {x}', x==-1)

NoValue: b''

In [21]:
# Optimized version: do not create overlay graph but investigate investigate KNOWS edges on-the-fly.
def shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, num_of_interactions, person1_id, person2_id):
    person1_id_remapped = person.index2id[person1_id]
    person2_id_remapped = person.index2id[person2_id] 

    numpersons = len(person.id2index)
    frontier = Vector.from_lists([person1_id_remapped], [True], numpersons)
    visited = frontier

    for level in range(1, numpersons):
        #print("===== " + str(level) + " =====")
        #print("frontier persons: " + str(frontierPersonIndices))

        frontierPersonIndices = frontier.to_lists()[0]
        if num_of_interactions >= 0:
            sel = Matrix.from_lists(frontierPersonIndices, frontierPersonIndices, [1]*len(frontierPersonIndices), numpersons, numpersons)
            #sel.mxm(hasCreator, desc=descriptor.tooo) # does not work due to DimensionMismatch
            K1 = sel.mxm(hasCreator.transpose()).mxm(replyOf            ).mxm(hasCreator, mask=knows).select(lib.GxB_GT_THUNK, num_of_interactions)
            K2 = sel.mxm(hasCreator.transpose()).mxm(replyOf.transpose()).mxm(hasCreator, mask=knows).select(lib.GxB_GT_THUNK, num_of_interactions)
            K = K1*K2
        else:
            K = knows

        next = frontier.vxm(K, mask=visited, desc=descriptor.ooco)

        if person2_id_remapped in next.to_lists()[0]:
            return level
        if next.nvals == 0:
            return -1

        visited = visited + next
        frontier = next

x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, 1, 786, 799)
print(f'RESULT: {x}', x==4)
x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, 1, 422, 736)
print(f'RESULT: {x}', x==-1)
x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, 1, 858, 587)
print(f'RESULT: {x}', x==4)
x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, -1, 266, 106)
print(f'RESULT: {x}', x==3)
x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, 0, 58, 402)
print(f'RESULT: {x}', x==3)
x = shortest_distance_over_frequent_communication_paths_opt(person, replyOf, knows, hasCreator, 105, 608, 3) 
print(f'RESULT: {x}', x==-1)


RESULT: -1 False
RESULT: 3 True
RESULT: -1 False
RESULT: -1 True
