In [1]:
from pygraphblas import *
import pygraphblas.descriptor
import csv
import logging
import os
import sys



In [2]:
# Setup logger
# A dedicated, non-propagating logger so messages are printed exactly once,
# in the timestamped format below, regardless of the root logger's setup.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))
log = logging.getLogger(__name__)
log.propagate = False
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler per run and duplicate each line.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
log.addHandler(handler)
log.setLevel(logging.INFO)

In [3]:
#Load data from CSV format
class DataLoader:
    """Loads '|'-delimited CSV node and edge files located under a base path.

    Node files yield the list of original ids plus an id -> dense index
    mapping; edge files are turned into a ``Matrix`` whose rows/columns are
    those dense indices.
    """

    def __init__(self, path):
        """Remember the base directory that file names are resolved against.

        Args:
            path: Directory containing the CSV files. A trailing separator is
                accepted but not required.
        """
        self.path = path

    @staticmethod
    def _require_headers(headers, minimum, filename):
        """Return ``headers`` or raise if the file has fewer than ``minimum`` columns."""
        if not headers or len(headers) < minimum:
            raise ValueError(
                f'{filename}: expected a header row with at least {minimum} '
                f'column(s), got {headers}')
        return headers

    def load_node(self, filename):
        """Read a node file and index its ids.

        The first column of the header row is taken as the id column and each
        value in it is parsed with ``int``.

        Args:
            filename: File name relative to the loader's base path.

        Returns:
            ``(original_ids, id_mapping)`` where ``original_ids`` lists the ids
            in file order and ``id_mapping`` maps each id to its position in
            that list (for a repeated id the last position wins).

        Raises:
            ValueError: If the file has no header row.
        """
        filename = os.path.join(self.path, filename)
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|', quotechar='"')
            headers = reader.fieldnames
            log.info(f'Loading {filename} with headers: {headers}')
            node_key = self._require_headers(headers, 1, filename)[0]
            original_ids = [int(row[node_key]) for row in reader]

        id_mapping = {original_id: index for index, original_id in enumerate(original_ids)}
        return original_ids, id_mapping

    def load_edge(self, filename, start_mapping, end_mapping, typ=INT64, drop_dangling_edges=False):
        """Read an edge file into a matrix with value 1 for every edge.

        The first two header columns are taken as the start and end id
        columns. Ids are parsed with ``int`` and translated to row/column
        indices through the given mappings.

        Args:
            filename: File name relative to the loader's base path.
            start_mapping: id -> row index mapping for the start nodes.
            end_mapping: id -> column index mapping for the end nodes.
            typ: Element type passed through to ``Matrix.from_lists``.
            drop_dangling_edges: If true, edges whose start or end id is
                missing from its mapping are skipped.

        Returns:
            A ``len(start_mapping)`` x ``len(end_mapping)`` matrix built by
            ``Matrix.from_lists``.

        Raises:
            ValueError: If the header row has fewer than two columns.
            KeyError: If an edge references an unknown id and
                ``drop_dangling_edges`` is false.
        """
        filename = os.path.join(self.path, filename)
        row_ids = []
        col_ids = []
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|', quotechar='"')
            headers = reader.fieldnames
            log.info(f'Loading {filename} with headers: {headers}')
            start_key, end_key = self._require_headers(headers, 2, filename)[:2]
            for row in reader:
                start_id = int(row[start_key])
                end_id = int(row[end_key])
                if drop_dangling_edges and (start_id not in start_mapping or end_id not in end_mapping):
                    continue
                # With drop_dangling_edges off, an unknown id raises KeyError here.
                row_ids.append(start_mapping[start_id])
                col_ids.append(end_mapping[end_id])

        # The file is closed at this point; only the collected indices are needed.
        return Matrix.from_lists(
            row_ids,
            col_ids,
            [1] * len(row_ids),
            nrows=len(start_mapping),
            ncols=len(end_mapping),
            typ=typ)

### Load data

In [4]:
# Base location of the CSV export handed to the loader.
path = 'sf1k-converted/'
loader = DataLoader(path)

# vertices: label -> original ids in file order
# mapping:  label -> {original id: dense index}
# matrices: edge name -> adjacency matrix indexed by those dense indices
vertices, mapping, matrices = {}, {}, {}

# Node files come first because the edge loader needs their id mappings.
vertices['Comment'], mapping['Comment'] = loader.load_node('comment.csv')
vertices['Person'], mapping['Person'] = loader.load_node('person.csv')

# Keyword arguments are evaluated left to right, so the files are read in the
# order replyOf, hasCreator, knows.
matrices.update(
    replyOf=loader.load_edge('comment_replyOf_comment.csv', mapping['Comment'], mapping['Comment']),
    hasCreator=loader.load_edge('comment_hasCreator_person.csv', mapping['Comment'], mapping['Person']),
    knows=loader.load_edge('person_knows_person.csv', mapping['Person'], mapping['Person']),
)




2020-03-30 06:47:10,658 INFO  Loading sf1k-converted/comment.csv with headers: ['id:ID(Comment)', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING', 'content:STRING']
2020-03-30 06:47:18,744 INFO  Loading sf1k-converted/person.csv with headers: ['id:ID(Person)', 'firstName:STRING', 'lastName:STRING', 'gender:STRING', 'birthday:DATE', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING']
2020-03-30 06:47:18,763 INFO  Loading sf1k-converted/comment_replyOf_comment.csv with headers: [':START_ID(Comment)', ':END_ID(Comment)']
2020-03-30 06:47:22,014 INFO  Loading sf1k-converted/comment_hasCreator_person.csv with headers: [':START_ID(Comment)', ':END_ID(Person)']
2020-03-30 06:47:26,942 INFO  Loading sf1k-converted/person_knows_person.csv with headers: [':START_ID(Person)', ':END_ID(Person)']


### Queries

In [31]:
# Query 1
def shortest_distance_over_frequent_communication_paths(matrices, threshhold):
    """Build the boolean person-to-person graph of frequent communication.

    This is the first half of Query 1; the shortest-path search (BFS) over the
    returned graph is not implemented yet.

    Args:
        matrices: Mapping with the keys 'hasCreator' (Comment x Person),
            'replyOf' (Comment x Comment) and 'knows' (Person x Person), as
            produced by the loading cell of this notebook.
        threshhold: Pairs are kept only when their combined reply count is
            strictly greater than this value.

    Returns:
        The structure-only (``pattern()``) matrix of the person pairs that
        know each other and whose combined reply count exceeds the threshold.
    """
    has_creator = matrices['hasCreator']
    reply_of = matrices['replyOf']

    # (Person x Comment) @ (Comment x Comment): for every person, the comments
    # their own comments reply to. Assuming the default plus-times semiring and
    # the all-ones edge values, entries are reply counts.
    person_to_replied_comment = has_creator.transpose() @ reply_of

    # Map the replied-to comments to their authors; the 'knows' mask keeps only
    # pairs of persons that know each other.
    reply_counts = person_to_replied_comment.mxm(has_creator, mask=matrices['knows'])

    # Combine both directions of each pair.
    # NOTE(review): `+` sums the two directions, so the threshold applies to the
    # combined count. If each direction has to pass the threshold on its own,
    # an element-wise intersection is needed instead - confirm against the
    # query definition.
    mutual_counts = reply_counts + reply_counts.transpose()

    # "Frequent" means above the threshold; selecting '<' here kept exactly the
    # pairs that communicate too rarely.
    frequent_pairs = mutual_counts.select('>', threshhold)

    # Only the structure matters from here on: every element becomes boolean.
    overlay_graph = frequent_pairs.pattern()
    # TODO: BFS

    return overlay_graph
    
    
    
    
    
    

In [34]:
# Run Query 1 on the loaded matrices with a threshold of 10.
x = shortest_distance_over_frequent_communication_paths(matrices, threshhold=10)

In [35]:
# Display the Query 1 result matrix as text.
# NOTE(review): this renders the whole matrix and the saved output is extremely
# wide - consider showing a summary or a small slice instead.
x.to_string()

'    0 1 2 3 4 5 6 7 8 91011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643

'    0 1 2 3 4 5 6 7 8 91011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643