In [1]:
from pygraphblas import *
import pygraphblas.descriptor
import csv
import sys
import logging



In [2]:
# Setup logger
# The logger is looked up by name, so the same object (with any handlers
# already attached) comes back every time this cell is executed.
log = logging.getLogger(__name__)
log.propagate = False
log.setLevel(logging.INFO)
# Re-running the cell must not stack a second StreamHandler, otherwise every
# message would be printed once per execution of this cell.
if log.handlers:
    handler = log.handlers[0]
else:
    handler = logging.StreamHandler()
    log.addHandler(handler)
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)-5s %(message)s'))

In [9]:
# Load graph nodes and edges from '|'-delimited CSV files
class DataLoader:
    """Loads graph nodes and edges from '|'-delimited CSV files with a header row.

    Node files yield a dense 0-based index for the ids found in their first
    column; edge files are turned into a ``Matrix`` whose row/column indices
    are those dense indices.
    """

    def __init__(self, path):
        """Remember the prefix that is prepended to every file name.

        :param path: string prefix (e.g. a directory ending in ``/``). It is
            concatenated with the file name as-is; no separator is inserted.
        """
        self.path = path

    def load_node(self, filename):
        """Read the node ids stored in the first column of a CSV file.

        :param filename: file name, appended to ``self.path``.
        :return: ``(original_ids, id_mapping)``. ``original_ids`` lists the ids
            (converted to ``int``) in file order; ``id_mapping`` maps each id
            to its position in that list. If an id occurs more than once, the
            mapping keeps the position of its last occurrence.
        """
        filename = self.path + filename
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|', quotechar='"')
            headers = reader.fieldnames
            log.info(f'Loading {filename} with headers: {headers}')
            # The id column is identified by position, not by name.
            node_key = headers[0]
            original_ids = [int(row[node_key]) for row in reader]

        id_mapping = {node_id: index for index, node_id in enumerate(original_ids)}
        return original_ids, id_mapping

    def load_edge(self, filename, start_mapping, end_mapping, typ=BOOL, drop_dangling_edges=False):
        """Build a matrix from a CSV file of (start id, end id) pairs.

        The first column holds the start id and the second column the end id
        of each edge. Every kept edge contributes the value ``1`` at
        ``(start_mapping[start_id], end_mapping[end_id])``.

        :param filename: file name, appended to ``self.path``.
        :param start_mapping: mapping from start-node id to row index.
        :param end_mapping: mapping from end-node id to column index.
        :param typ: element type passed through to ``Matrix.from_lists``.
        :param drop_dangling_edges: when true, edges with an endpoint missing
            from its mapping are skipped; when false they raise ``KeyError``.
        :return: matrix with ``len(start_mapping)`` rows and
            ``len(end_mapping)`` columns.
        :raises KeyError: if an endpoint is missing from its mapping and
            ``drop_dangling_edges`` is false.
        """
        filename = self.path + filename
        row_ids = []
        col_ids = []
        with open(filename, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|', quotechar='"')
            headers = reader.fieldnames
            log.info(f'Loading {filename} with headers: {headers}')
            # Endpoint columns are identified by position, not by name.
            start_key = headers[0]
            end_key = headers[1]
            for row in reader:
                start_id = int(row[start_key])
                end_id = int(row[end_key])
                if start_id not in start_mapping or end_id not in end_mapping:
                    if drop_dangling_edges:
                        continue
                    # Same exception type as a plain failed lookup, but the
                    # message says which edge and file are at fault.
                    raise KeyError(
                        f'Dangling edge {start_id} -> {end_id} in {filename}; '
                        'pass drop_dangling_edges=True to skip such edges'
                    )
                row_ids.append(start_mapping[start_id])
                col_ids.append(end_mapping[end_id])

        # The file is no longer needed once the coordinate lists are complete.
        values = [1] * len(row_ids)
        edge_matrix = Matrix.from_lists(
            row_ids,
            col_ids,
            values,
            nrows=len(start_mapping),
            ncols=len(end_mapping),
            typ=typ)
        return edge_matrix

In [10]:
# Prefix of the converted data set; DataLoader prepends it to each file name.
path = 'sf1k-converted/'
loader = DataLoader(path)

# Read the node files first: the id -> index mappings they produce are needed
# to place the edges in the matrix.
comment_ids, comment_index = loader.load_node('comment.csv')
person_ids, person_index = loader.load_node('person.csv')

# Per-label containers: original ids, id -> index mappings, edge matrices.
vertices = {'Comment': comment_ids, 'Person': person_ids}
mapping = {'Comment': comment_index, 'Person': person_index}
matrices = {
    # Comment -> Comment edges, so the same mapping serves rows and columns.
    'replyOf': loader.load_edge('comment_replyOf_comment.csv', comment_index, comment_index),
}



2020-03-19 14:36:08,434 INFO  Loading sf1k-converted/comment.csv with headers: ['id:ID(Comment)', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING', 'content:STRING']
2020-03-19 14:36:17,449 INFO  Loading sf1k-converted/person.csv with headers: ['id:ID(Person)', 'firstName:STRING', 'lastName:STRING', 'gender:STRING', 'birthday:DATE', 'creationDate:DATETIME', 'locationIP:STRING', 'browserUsed:STRING']
2020-03-19 14:36:17,471 INFO  Loading sf1k-converted/comment_replyOf_comment.csv with headers: [':START_ID(Comment)', ':END_ID(Comment)']


{'replyOf': <Matrix (632042x632042 : 407920:BOOL)>}