In [None]:
import csv
import heapq
import logging

from data_loader import DataLoader
from pygraphblas import *
from pygraphblas.base import lib
from pygraphblas.lagraph import LAGraph_cc_fastsv

In [None]:
# Setup logger
# Stream handler that emits "<timestamp> <LEVEL> <message>" lines.
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))
log = logging.getLogger(__name__)
# Do not hand records on to ancestor loggers' handlers.
log.propagate = False
# getLogger() returns the same logger object every time this cell runs, so
# without this cleanup each re-run would stack one more handler and every
# message would be emitted once per accumulated handler.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
log.addHandler(handler)
log.setLevel(logging.INFO)

In [None]:
# Location and format of the input files handed to DataLoader.
data_dir = 'csvs/sf1k/'
data_format = 'csv'

loader = DataLoader(data_dir, data_format)

# Disabled alternative kept for reference (bulk load instead of the selective loads below):
#vertices, mappings, matrices = loader.load_all_csvs()

# Vertex sets are loaded first because load_edge() takes them as arguments.
person = loader.load_vertex('person')
tag = loader.load_vertex('tag')
# Edge matrices used by the query; presumably rows follow the first vertex set
# and columns the second (person x tag, person x person) — TODO confirm in DataLoader.
hasInterest = loader.load_edge('hasInterest', person, tag)
knows = loader.load_edge('knows', person, person)
# Transposed once up front; get_score_for_tag() indexes this matrix by tag index.
hasInterest_tran_mx=hasInterest.transpose()

In [None]:
# Per-person birthday values; query2() iterates over them in order to build the
# person mask, so they are presumably aligned with the person index — TODO confirm.
personBirthdays = loader.load_extra_columns('person', ['birthday'])
# Tag names; query2() looks them up by tag column index.
tagNames = loader.load_extra_columns('tag', ['name'])

In [None]:
def get_score_for_tag(tag_index, birthday_person_mask):
    """Return the size of the largest connected component among selected persons of a tag.

    The persons considered are those that have an entry in row ``tag_index`` of
    ``hasInterest_tran_mx`` and also an entry in ``birthday_person_mask``.
    Connectivity is taken from the ``knows`` matrix restricted to those persons.

    Args:
        tag_index: Row index into ``hasInterest_tran_mx`` identifying the tag.
        birthday_person_mask: Vector with entries for the persons that pass the
            birthday filter (built by ``query2``).

    Returns:
        The largest component size as an int, or 0 when no person is selected.
    """
    person_vec = hasInterest_tran_mx[tag_index]
    # Combine with the birthday mask; presumably an element-wise multiply that
    # keeps only positions present in both vectors — confirm against pygraphblas.
    person_vec *= birthday_person_mask

    person_cols_in_subgraph, _ = person_vec.to_lists()
    person_count_in_subgraph = len(person_cols_in_subgraph)

    # Nothing selected: skip the submatrix extraction and the connected
    # components run on an empty graph and report an empty score directly.
    if person_count_in_subgraph == 0:
        return 0

    # Sub-matrix of `knows` restricted to the selected persons (rows and columns).
    subgraph_mx = knows[person_cols_in_subgraph, person_cols_in_subgraph]

    _, component_ids = LAGraph_cc_fastsv(subgraph_mx, False).to_lists()

    # Tally component membership. The ids are used directly as list positions,
    # i.e. they are assumed to lie in 0..person_count_in_subgraph-1.
    component_sizes = [0] * person_count_in_subgraph
    for component_id in component_ids:
        component_sizes[component_id] += 1

    return max(component_sizes)

def query2(top_k, birthday_limit):
    """Return the ``top_k`` best-scoring tags for persons at or after ``birthday_limit``.

    Every tag column is scored with ``get_score_for_tag`` using a mask of the
    persons whose birthday compares ``>=`` to ``birthday_limit``. The
    (tag name, score) pairs are collected in a set and ranked by descending
    score, ties broken by ascending tag name.

    Args:
        top_k: Maximum number of (tag name, score) pairs to return.
        birthday_limit: Lower bound compared against each entry of
            ``personBirthdays`` (callers in this file pass date strings).

    Returns:
        A list of at most ``top_k`` (tag name, score) tuples, best first.
    """
    # One flag per person, then drop the entries that are zero/False so the
    # vector only carries the selected persons.
    selection_flags = [person_birthday >= birthday_limit
                       for person_birthday in personBirthdays]
    born_after_mask = Vector.from_list(selection_flags)
    born_after_mask.select(lib.GxB_NONZERO, out=born_after_mask)

    tag_count = len(tag.id2index)
    scored_tags = {
        (tagNames[tag_col], get_score_for_tag(tag_col, born_after_mask))
        for tag_col in range(tag_count)
    }

    def ranking_key(name_and_score):
        # Negated score first => highest score ranks smallest; name breaks ties.
        return (-name_and_score[1], name_and_score[0])

    return heapq.nsmallest(top_k, scored_tags, key=ranking_key)

def format_result(result_list):
    """Render query results as ``'<names> % component sizes <sizes>'``.

    Args:
        result_list: Sequence of (tag name, score) pairs, e.g. the list
            returned by ``query2``.

    Returns:
        A single line with the space-separated names, the literal separator
        `` % component sizes `` and the space-separated scores. An empty
        ``result_list`` yields ``' % component sizes '`` instead of raising.
    """
    # Build each column explicitly rather than via zip(*result_list): zip over
    # an empty list produces no columns, which made str.format raise IndexError.
    names = ' '.join(str(row[0]) for row in result_list)
    sizes = ' '.join(str(row[1]) for row in result_list)
    return '{0} % component sizes {1}'.format(names, sizes)

# Smoke run: execute one query and print its formatted result so it can be
# compared by eye with the expected line below.
result = query2(3, '1986-06-14')
string_result = format_result(result)
print(string_result)
# expected result:
# Chiang_Kai-shek Mohandas_Karamchand_Gandhi Joseph_Stalin % component sizes 6 6 5

# Tests

In [None]:
# Regression check: run query2 for a fixed set of parameters and compare the
# formatted output, line by line in order, with the expected text below.
results = [
    query2(3, '1980-02-01'),
    query2(4, '1981-03-10'),
    query2(3, '1982-03-29'),
    query2(3, '1983-05-09'),
    query2(5, '1984-07-02'),
    query2(3, '1985-05-31'),
    query2(3, '1986-06-14'),
    query2(7, '1987-06-24'),
    query2(3, '1988-11-10'),
    query2(4, '1990-01-25'),
]
# Raw string: the backslashes in the expected tag names are kept literally.
expected_results = r'''
Chiang_Kai-shek Augustine_of_Hippo Napoleon % component sizes 22 16 16
Chiang_Kai-shek Napoleon Mohandas_Karamchand_Gandhi Sukarno % component sizes 17 13 11 11
Chiang_Kai-shek Mohandas_Karamchand_Gandhi Napoleon % component sizes 13 11 10
Chiang_Kai-shek Mohandas_Karamchand_Gandhi Augustine_of_Hippo % component sizes 12 10 8
Chiang_Kai-shek Aristotle Mohandas_Karamchand_Gandhi Augustine_of_Hippo Fidel_Castro % component sizes 10 7 6 5 5
Chiang_Kai-shek Mohandas_Karamchand_Gandhi Joseph_Stalin % component sizes 6 6 5
Chiang_Kai-shek Mohandas_Karamchand_Gandhi Joseph_Stalin % component sizes 6 6 5
Chiang_Kai-shek Augustine_of_Hippo Genghis_Khan Haile_Selassie_I Karl_Marx Lyndon_B._Johnson Robert_John_\"Mutt\"_Lange % component sizes 4 3 3 3 3 3 3
Aristotle Ho_Chi_Minh Karl_Marx % component sizes 2 2 2
Arthur_Conan_Doyle Ashoka Barack_Obama Benito_Mussolini % component sizes 1 1 1 1
'''.strip()

results_string = '\n'.join(map(format_result, results))

results_ok = results_string == expected_results

# Report before asserting: on a mismatch the actual output is then visible
# instead of being hidden behind a bare AssertionError.
print(f'RESULTS MATCH: {results_ok}\n')
print(results_string)

assert results_ok, 'query2 output differs from expected_results (actual output printed above)'