In [1]:
from pygraphblas import *
import pygraphblas.descriptor
import csv
import sys
import logging
import glob
import operator
from data_loader import DataLoader


In [2]:
# Setup logger
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))
log = logging.getLogger(__name__)
# Do not hand records to the root logger as well, otherwise they may be emitted twice.
log.propagate = False
# Loggers are process-wide singletons: re-running this cell in the same kernel
# would otherwise stack one more StreamHandler on `log` and every message would
# be printed once per run. Only attach the handler if none is installed yet.
if not log.handlers:
    log.addHandler(handler)
log.setLevel(logging.INFO)

### Load data

In [3]:
# Directory with the converted CSV files, relative to the notebook's working directory.
data_dir = 'sf1k-converted/'
loader = DataLoader(data_dir)

# load_all_csvs() yields three values. Judging by how the queries in this
# notebook use them (TODO confirm against data_loader):
#   vertices[type][index] -> original id of the vertex stored at that matrix index
#   mappings[type][id]    -> matrix index of the vertex with that original id
#   matrices[name]        -> pygraphblas matrix for the edge type `name`
vertices, mappings, matrices = loader.load_all_csvs()

2020-04-07 10:36:35,113 INFO  Loading nodes...
2020-04-07 10:36:35,124 INFO  Loading sf1k-converted/place.csv with headers: ['id:ID(Place)', 'name:STRING', 'url:STRING', ':LABEL']
2020-04-07 10:36:35,136 INFO  Loading sf1k-converted/comment.csv with headers: ['id:ID(Comment)', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING', 'content:STRING']
2020-04-07 10:36:42,533 INFO  Loading sf1k-converted/organisation.csv with headers: ['id:ID(Organisation)', ':LABEL', 'name:STRING', 'url:STRING']
2020-04-07 10:36:42,551 INFO  Loading sf1k-converted/person.csv with headers: ['id:ID(Person)', 'firstName:STRING', 'lastName:STRING', 'gender:STRING', 'birthday:DATE', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING']
2020-04-07 10:36:42,566 INFO  Loading sf1k-converted/forum.csv with headers: ['id:ID(Forum)', 'title:STRING', 'creationDate:DATETIME']
2020-04-07 10:36:42,654 INFO  Loading sf1k-converted/tag.csv with headers: ['id:ID(Tag)', 'name:STRING', 'url:STRING']

### Queries

In [5]:
# Query 1
def shortest_distance_over_frequent_communication_paths(matrices, threshhold):
    """Build the "frequent communication" graph that Query 1 searches.

    Two persons are connected when they know each other and *each* of them
    has written more than `threshhold` comments in reply to comments of the
    other one.

    Args:
        matrices: dict of pygraphblas matrices; the keys 'hasCreator',
            'replyOf' and 'knows' are read.
        threshhold: a pair is kept only if the reply count exceeds this
            value in both directions.

    Returns:
        The pattern (structure only) of the filtered person-to-person matrix.
        The shortest-path search itself is not implemented yet.
    """
    # NOTE(review): assumes hasCreator is (comment x person) and replyOf is
    # (reply x parent comment) -- confirm against data_loader.
    hasCreator = matrices['hasCreator']

    # person A -> comments that A's comments are replies to
    personA_to_comment2 = hasCreator.transpose() @ matrices['replyOf']
    # person A -> person B whose comments A replied to, restricted to pairs
    # present in 'knows'
    person_to_person = personA_to_comment2.mxm(hasCreator, mask=matrices['knows'])

    # "Frequent" means MORE than the threshold. The previous '<' kept exactly
    # the pairs that should have been discarded.
    frequent = person_to_person.select('>', threshhold)
    # Element-wise multiplication keeps only entries present in both operands,
    # i.e. pairs where the reverse direction passed the filter too. Adding the
    # two directions would let a one-sided conversation count as mutual.
    # NOTE(review): pairs with zero replies have no stored entry, so a negative
    # threshold does not bring back plain 'knows' edges -- confirm whether
    # that case is needed.
    person_to_person_mutual_filtered = frequent * frequent.transpose()
    # TODO: BFS

    return person_to_person_mutual_filtered.pattern()  # every element will be boolean
    
    
    
    
    
    

In [6]:
# Run Query 1 with a threshold of 10 and keep the resulting matrix in `x` for inspection.
x = shortest_distance_over_frequent_communication_paths(matrices, 10)

In [None]:
# Show the Query 1 result as text (presumably a rendering of the whole matrix --
# may be very large on real data; TODO confirm).
x.to_string()

In [5]:
#Query 3
def RelevantPeopleInPlace(place: int):
    """Collect the people associated with a place.

    A person is relevant when they are located in the place, or work at /
    study at an organisation located in it. "The place" covers the place
    itself plus everything reached from it in one or two steps along the
    transposed 'isPartOf' relation.

    Reads the notebook-level `matrices` and `mappings`.

    Args:
        place: original id of the place (a key of mappings['place']).

    Returns:
        A vector over persons marking the relevant ones.
    """
    partOfT = matrices['isPartOf'].transpose()

    # One-hot vector selecting the requested place
    seed = Vector.from_type(BOOL, partOfT.nrows)
    seed[mappings['place'][place]] = True

    # The place itself, then one and two levels along the transposed relation
    oneLevel = seed.vxm(partOfT)
    twoLevels = oneLevel.vxm(partOfT)
    relevantPlaces = seed + oneLevel + twoLevels

    # Persons and organisations located in any of those places
    locatedPeople = matrices['personIsLocatedIn'].mxv(relevantPlaces)
    organisations = matrices['organisationIsLocatedIn'].mxv(relevantPlaces)

    # Persons attached to those organisations through work or study
    with semiring.LOR_LAND_BOOL:
        workers = matrices['workAt'].mxv(organisations)
        students = matrices['studyAt'].mxv(organisations)

    # Union of the three groups
    with binaryop.PLUS_BOOL:
        everyone = workers + students + locatedPeople
    return everyone

In [88]:
def HHopKnows(h,vec):
    """Return everything reachable from `vec` within `h` steps of 'knows'.

    The result is the element-wise sum of vec*K, vec*K^2, ..., vec*K^h, where
    K is the notebook-level matrices['knows'].

    The previous version called an undefined name (`function`) for the
    recursive step and, because `h` had already been counted down to 1 by the
    loop, would only ever have combined hop `h` with hop 1. It also built K^h
    with matrix-matrix products; here only vector-matrix products are used.

    Args:
        h: maximum number of hops. Values below 2 give the direct neighbours
            only, as before.
        vec: start vector (one entry per person).

    Returns:
        A new vector; `vec` is not modified.
    """
    knows = matrices['knows']
    frontier = vec.vxm(knows)       # exactly one hop away
    reached = frontier
    for _ in range(h - 1):
        frontier = frontier.vxm(knows)   # one hop further
        reached = reached + frontier     # `+` builds a new vector
    return reached

In [89]:
def query3(k,h,p):
    """Print the top-k pairs of relevant people sharing the most interest tags.

    For every person relevant to place `p` (see RelevantPeopleInPlace) the
    other relevant people within `h` 'knows' hops are scored by the number of
    interest tags they share. Each unordered pair is reported once. Output
    lines have the form `person1|person2|count`, ordered by count descending,
    then person1 ascending, then person2 ascending.

    Reads the notebook-level `matrices`, `vertices` and `mappings`.

    Args:
        k: number of pairs to print.
        h: maximum 'knows' distance between the two persons of a pair.
        p: original id of the place.
    """
    relevantPeopleVector = RelevantPeopleInPlace(p)

    # Visit people in ascending order of their ORIGINAL ids: map the matrix
    # indices back to ids, sort, then map to indices again.
    originalIds = sorted(vertices['person'][idx] for idx in relevantPeopleVector.to_lists()[0])
    personIndices = [mappings['person'][originalId] for originalId in originalIds]

    hasInterest = matrices['hasInterest']
    # Loop-invariant: previously recomputed for every tag of every person.
    hasInterestTransposed = hasInterest.transpose()

    resultList = []
    seenPairs = set()   # (person1, person2) id pairs already in resultList

    for personIdx in personIndices:
        personVector = Vector.from_type(BOOL,relevantPeopleVector.size)
        personVector[personIdx] = True

        personKnowsHHopVector = HHopKnows(h,personVector)
        personKnowsHHopVector *= relevantPeopleVector     #only the relevant people are needed

        personHasInterestVector = personVector.vxm(hasInterest)
        numTagsVector = Vector.from_type(INT64,relevantPeopleVector.size)

        # For each tag of this person, add the people interested in that tag
        for tagIdx, _ in personHasInterestVector:
            tagVector = Vector.from_type(BOOL,personHasInterestVector.size)
            tagVector[tagIdx] = True
            numTagsVector += tagVector.vxm(hasInterestTransposed)

        numTagsVector[personIdx] = 0    #p1|p1 should be 0
        numTagsVector *= personKnowsHHopVector

        personID = vertices['person'][personIdx]
        for otherIdx, commonTags in numTagsVector:
            if commonTags == 0:
                continue
            otherID = vertices['person'][otherIdx]
            # Skip the pair if it was already recorded the other way round
            if (otherID, personID) in seenPairs:
                continue
            seenPairs.add((personID, otherID))
            resultList.append([personID, otherID, commonTags])

    # Same order as three stable sorts (by [1], by [0], by [2] descending)
    resultList.sort(key=lambda row: (-row[2], row[0], row[1]))

    for element in resultList[:k]:
        print('{}|{}|{}'.format(element[0],element[1],element[2]))

In [93]:
# Query 3 with k=3, h=2 and p=38 (38 is the id of India)
query3(3,2,38)