In [6]:
import requests as req
import numpy as np
import json

In [7]:
class InterestsParser:
    """Convert a comma-separated interest list into a fixed-length vector.

    The parser is built from a collection of interest trees whose nodes are
    mappings with a 'name' entry and an optional 'children' list.  Every node
    of every tree owns one slot of the output vector; slots are filled tree
    by tree, in the order `_parse_interests` visits the nodes.
    """

    def __init__(self, trees):
        """Keep the trees and pre-compute the total number of nodes.

        Args:
            trees: sized collection of root nodes (parse() calls len() on it).
        """
        self.trees = trees
        # Total node count over all trees == length of the vectors parse() returns.
        self.count = sum(_count_interests(tree) for tree in trees)

    def parse(self, line):
        """Return the interest vector for one comma-separated list of names.

        Args:
            line: interest names separated by ','.  Tokens are compared
                verbatim against node names (no whitespace or quote stripping).

        Returns:
            numpy array of length self.count.  With no trees configured the
            (empty) zero vector is returned instead of dividing by zero.
        """
        interests_vec = np.zeros(self.count)

        n_trees = len(self.trees)
        if n_trees == 0:
            # Nothing to walk; 1 / n_trees below would raise ZeroDivisionError.
            return interests_vec

        interests_set = frozenset(line.split(','))
        # One-element list used as a mutable write cursor shared across the
        # recursive calls and across trees.
        i_p = [0]
        # Every root is scaled by the same share of the whole forest.
        weight = 1 / n_trees

        for tree in self.trees:
            _parse_interests(interests_set, tree, interests_vec, i_p, weight)

        return interests_vec

def _count_interests(node):
    count = 1
    if 'children' in node:
        for c in node['children']:
            count = count + _count_interests(c)

    return count

def _parse_interests(i_set, i_node, i_vec, i_p, weight):
    val = 0

    if 'children' in i_node:
        children = i_node['children']
        n_children = len(children)
        for c in children:
            val += _parse_interests(i_set, c, i_vec, i_p, 1 / n_children)

    if i_node['name'] in i_set:
        val = 1

    i_vec[i_p[0]] = val * weight
    i_p[0] += 1

    return val

In [8]:
class Person:
    """A dataset record: a person id together with that person's interests."""

    def __init__(self, p_id, interests):
        """Store the id and the interests exactly as given."""
        self.p_id, self.interests = p_id, interests

    def __str__(self):
        """Two-line summary: the id on the first line, the interests on the second."""
        template = 'id: {0}\ninterests: {1}'
        return template.format(self.p_id, self.interests)

    def __repr__(self):
        # Same text as __str__, so containers of Person objects print readably.
        return self.__str__()
    
def parse_person(p, interest_parser):
    """Build a Person from one dataset record shaped like '<id> ...[<interests>]'.

    The id is the text before the first space, converted with int().  The
    interests are the text between the first '[' found after that space and
    the first ']' found after that '['; this text is passed unchanged to
    interest_parser.parse().

    Args:
        p: one line of the dataset body.
        interest_parser: object exposing parse(str), e.g. an InterestsParser.

    Returns:
        A Person, or None when the line is empty or whitespace-only.

    Raises:
        ValueError: if the line has no space after the id, the id is not an
            integer, or no '[' ... ']' pair follows the id.
    """
    # Blank lines carry no record.  This also covers a lone '\r' left over
    # when a CRLF file is split on '\n'.
    if not p.strip():
        return None

    try:
        id_end = p.index(' ')
        # Each search resumes where the previous one stopped, i.e. a single
        # left-to-right scan of the line.
        list_open = p.index('[', id_end)
        list_close = p.index(']', list_open)
    except ValueError:
        # Name the offending record instead of failing with a bare index error.
        raise ValueError('malformed person record: {0!r}'.format(p)) from None

    p_id = int(p[:id_end])
    interests = interest_parser.parse(p[list_open + 1:list_close])

    return Person(p_id, interests)

In [9]:
# Build the parser from the nested interest trees stored as JSON.
with open('interests-tree-nested.json') as tree:
    interest_parser = InterestsParser(json.load(tree))

# Read the whole dataset file into memory as text.
with open('synthtetic_dataset.lg') as dataset_file:
    raw_data = dataset_file.read()

lines = raw_data.split('\n')

# The first four lines are kept as headers and the records start at index 5,
# so lines[4] lands in neither slice; the final element is dropped as well.
# NOTE(review): presumably lines[4] is a separator and the last element is the
# empty string after a trailing newline - confirm against the file.
headers, body = lines[:4], lines[5:-1]

In [10]:
# One entry per body line; parse_person yields None for an empty line.
persons = [parse_person(record, interest_parser) for record in body]

In [11]:
# Show the interest vector of the first entry in `persons`.
display(persons[0].interests)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     