In [None]:
from pygraphblas import *
from pygraphblas.base import _check, lib
import pygraphblas.descriptor
import csv
import logging
from data_loader import DataLoader

import heapq

In [None]:
# Set up the notebook logger: timestamped, level-tagged records written
# through a StreamHandler (which defaults to stderr).
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))
log = logging.getLogger(__name__)
# Keep records away from the root logger so they are not emitted a second time
# by whatever handlers the root logger has.
log.propagate = False
# getLogger() hands back the same logger object every time this cell runs.
# Drop handlers left over from a previous run first; otherwise each re-run
# stacks one more handler and every message is printed once per run.
while log.handlers:
    log.removeHandler(log.handlers[0])
log.addHandler(handler)
log.setLevel(logging.INFO)

In [None]:
# Directory holding the converted CSV input. load_extra_columns builds file
# paths as `data_dir + filename`, so the trailing slash is required.
data_dir = 'sf1k-converted/'
loader = DataLoader(data_dir)

# load_all_csvs() returns three objects, unpacked here in this order.
# `vertices` and `matrices` are looked up by name later in the notebook
# (vertices['tag'], matrices['knows'], matrices['hasInterest']).
vertices, mappings, matrices = loader.load_all_csvs()

# Transposed once up front: getScoreForTag indexes this matrix by tag
# (hasInterestTranMx[tagCol]) to obtain a vector over persons.
hasInterestTranMx=matrices['hasInterest'].transpose()

In [None]:
def load_extra_columns(filename, columnNames):
    """Read selected columns of a '|'-delimited CSV file stored under data_dir.

    Header fields may carry type information after a ':' (for example
    'name:TYPE'). A requested column matches the header field whose part
    before the first ':' equals the requested name.

    Args:
        filename: File name, appended to the module-level data_dir.
        columnNames: Column names to read, without the ':' type suffix.

    Returns:
        When exactly one column is requested, a flat list holding that
        column's value for every row. Otherwise a list with one inner list
        per row, containing the requested values in the order of columnNames.

    Raises:
        ValueError: If a requested column is missing from the header or
            matches more than one header field.
    """
    with open(data_dir + filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='|', quotechar='"')
        # DictReader.fieldnames is None for a completely empty file; treat
        # that as "no columns" so the lookup below reports a clear error.
        headerFields = reader.fieldnames or []

        fullColumnNames = []
        for columnName in columnNames:
            # find full column name with type info after ':'
            matches = [fullName for fullName in headerFields
                       if fullName.split(':')[0] == columnName]
            if len(matches) != 1:
                # Name the offending column instead of failing with an
                # opaque tuple-unpacking error.
                raise ValueError(
                    'expected exactly one column named {!r} in {}, found {}: {}'.format(
                        columnName, filename, len(matches), matches))
            fullColumnNames.append(matches[0])

        if len(fullColumnNames) == 1:
            # Single column requested: callers get a flat list of values.
            fullColumnName = fullColumnNames[0]
            return [row[fullColumnName] for row in reader]

        return [[row[fullColumnName] for fullColumnName in fullColumnNames]
                for row in reader]
        
# Each call requests a single column, so each result is a flat list with one
# value per CSV row. query2 compares personBirthdays entries against the
# birthday limit and indexes tagNames by tag column.
# NOTE(review): this assumes CSV row order matches the vertex indices used by
# the loaded matrices - confirm against DataLoader.
personBirthdays = load_extra_columns('person.csv', ['birthday'])
tagNames = load_extra_columns('tag.csv', ['name'])

In [None]:
def LAGraph_cc_fastsv(mx, sanitize=True):
    """Call the C routine lib.LAGraph_cc_fastsv on a matrix and wrap its output.

    Args:
        mx: Matrix object whose underlying handle (mx.matrix[0]) is handed
            to the C routine.
        sanitize: Flag forwarded unchanged to lib.LAGraph_cc_fastsv.

    Returns:
        A Vector wrapping the GrB_Vector filled in by the C call, typed
        according to what lib.GxB_Vector_type reports for it.
    """
    # Output slot the C routine writes the resulting vector handle into.
    resultHandle = ffi.new('GrB_Vector*')
    status = lib.LAGraph_cc_fastsv(resultHandle, mx.matrix[0], sanitize)
    _check(status)

    # Ask the library which element type the returned vector has.
    typeHandle = ffi.new('GrB_Type*')
    status = lib.GxB_Vector_type(typeHandle, resultHandle[0])
    _check(status)

    resultType = types.gb_type_to_type(typeHandle[0])
    return Vector(resultHandle, resultType)

In [None]:
def getScoreForTag(tagCol, birthdayPersonMask):
    """Score a tag by the largest connected group of selected persons interested in it.

    The persons are the entries of hasInterestTranMx[tagCol] that remain after
    the element-wise product with birthdayPersonMask. The 'knows' matrix is
    restricted to those persons and its connected components are computed
    with LAGraph_cc_fastsv.

    Args:
        tagCol: Index of the tag in hasInterestTranMx.
        birthdayPersonMask: Vector over persons marking the selected ones.

    Returns:
        Size of the largest connected component of the restricted 'knows'
        graph, or 0 when no person is left after filtering.
    """
    personVec = hasInterestTranMx[tagCol]
    personVec *= birthdayPersonMask

    # Iterating the vector yields pairs; the first item is used as the person index.
    personColsInSubgraph = [colValPair[0] for colValPair in personVec]
    if not personColsInSubgraph:
        # Nobody qualifies: answer directly instead of extracting an empty
        # submatrix and running connected components on it.
        return 0

    subgraphMx = matrices['knows'][personColsInSubgraph, personColsInSubgraph]

    _, componentIds = LAGraph_cc_fastsv(subgraphMx, False).to_lists()

    # Component ids are used as indices into a list with one slot per
    # subgraph vertex; count how many vertices carry each id.
    componentSizes = [0] * len(personColsInSubgraph)
    for componentId in componentIds:
        componentSizes[componentId] += 1

    return max(componentSizes)

def query2(topK, birthdayLimit):
    """Return the topK (tag name, score) pairs for persons born on or after a limit.

    Args:
        topK: Number of pairs to return.
        birthdayLimit: Lower bound compared against each entry of
            personBirthdays with `>=`.

    Returns:
        A list of at most topK (tagName, score) tuples, ordered by
        descending score and then ascending tag name.
    """
    def rankingKey(nameAndScore):
        # Negated score first so that the "smallest" key is the best score;
        # the tag name breaks ties in ascending order.
        tagName, score = nameAndScore
        return (-score, tagName)

    selectionFlags = [birthday >= birthdayLimit for birthday in personBirthdays]
    personMask = Vector.from_list(selectionFlags)
    # Keep only the nonzero (True) entries, writing the result back in place.
    personMask.select(lib.GxB_NONZERO, out=personMask)

    scoredTags = set()
    for tagCol in range(len(vertices['tag'])):
        scoredTags.add((tagNames[tagCol], getScoreForTag(tagCol, personMask)))

    return heapq.nsmallest(topK, scoredTags, key=rankingKey)

result = query2(3, '1986-06-14')
# zip(*result) regroups the (name, score) pairs into one sequence of names and
# one of scores; each sequence is rendered as a space-separated string.
stringResult = '{0} % component sizes {1}'.format(
    *(' '.join(str(item) for item in column) for column in zip(*result))
)
print(stringResult)
# expected result:
# Chiang_Kai-shek Mohandas_Karamchand_Gandhi Joseph_Stalin % component sizes 6 6 5