In [1]:
import csv
import json
import os

import numpy as np

In [2]:
# Registries filled in by load_nodes().
# trees: prefix -> parsed content of that prefix's topicTree.nodes.json.
trees = {}
# root_nodes: prefix + top-level node id -> running integer index
# (used as the row index of the view matrices).
root_nodes = {}
# root2id: prefix + top-level node id -> the node's own id.
root2id = {}
# root2prefix: prefix + top-level node id -> the prefix it was loaded from.
root2prefix = {}
# Each prefix names a 'profiles/profiles_<prefix>' directory read by the loaders.
prefixes = ['propn', 'adj', 'loc', 'misc', 'noun', 'org', 'per', 'verb']

def load_nodes():
    """Parse every prefix's topic tree and register its top-level nodes.

    For each entry of ``prefixes`` the file
    ``profiles/profiles_<prefix>/topicTree.nodes.json`` is loaded and stored
    in ``trees[prefix]``.  Every top-level node is keyed by
    ``prefix + node['id']`` and recorded in ``root_nodes`` (a running index
    shared across all prefixes), ``root2id`` and ``root2prefix``.
    """
    next_index = 0
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/topicTree.nodes.json'
        with open(path, 'r') as handle:
            tree = json.load(handle)
        for top_node in tree:
            key = prefix + top_node['id']
            root_nodes[key] = next_index
            root2id[key] = top_node['id']
            root2prefix[key] = prefix
            next_index += 1
        trees[prefix] = tree

# Populate trees, root_nodes, root2id and root2prefix from the profile directories.
load_nodes()

In [3]:
# prefix -> {topic -> {int document index -> value}}, filled by load_assignments().
assignments = {}

def load_assignments():
    """Load the per-topic document assignments of every prefix.

    Reads ``profiles/profiles_<prefix>/myAssignment.topics.json`` for each
    prefix.  Every entry contributes ``entry['topic']`` mapped to a dict that
    takes ``int(doc[0])`` to ``doc[1]`` for each item of ``entry['doc']``.
    The result is stored in ``assignments[prefix]``.
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myAssignment.topics.json'
        with open(path, 'r') as handle:
            entries = json.load(handle)
        assignments[prefix] = {
            entry['topic']: {int(doc[0]): doc[1] for doc in entry['doc']}
            for entry in entries
        }

# Fill the assignments registry for every prefix.
load_assignments()

In [4]:
def load_data_files(prefix='propn'):
    """Map positional indices to the entries of a profile's ``myData.files.txt``.

    Args:
        prefix: Profile whose ``profiles/profiles_<prefix>/myData.files.txt``
            is read.  Defaults to ``'propn'``, the profile that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        dict mapping ``0..N-1`` to each newline-separated piece of the file,
        in file order.
    """
    path = 'profiles/profiles_' + prefix + '/myData.files.txt'
    with open(path, 'r') as read_file:
        entries = read_file.read().split('\n')
    # NOTE(review): a file that ends with a newline produces a final ''
    # entry here.  It is kept because the entry count determines the column
    # count of the saved file views; confirm before trimming it.
    return dict(enumerate(entries))

# index -> entry of profiles/profiles_propn/myData.files.txt; its length sets
# the column count of the file views built by set_assignments().
data_files = load_data_files()

In [5]:
# prefix -> {first CSV column -> fourth CSV column} of myData.dict.csv,
# filled by load_words().
tf_idf = {}

def load_words():
    """Load the word table of every prefix into ``tf_idf``.

    For each prefix, ``profiles/profiles_<prefix>/myData.dict.csv`` is read
    with ``csv.reader`` and ``tf_idf[prefix]`` becomes a dict taking each
    row's first column to its fourth column.  Values are kept exactly as the
    CSV reader yields them (strings).
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myData.dict.csv'
        with open(path, 'r') as handle:
            tf_idf[prefix] = {row[0]: row[3] for row in csv.reader(handle)}

# Fill the tf_idf registry for every prefix.
load_words()

In [6]:
# word -> integer id (used as the column index of the word views),
# filled by get_words_dic().
words_dic = {}
# prefix -> {node id -> list of words from the node's 'text'},
# filled by get_words_dic().
nodes_words = {}
# deque is used by get_words_dic() to hold the nodes still to be visited.
from collections import deque

def get_words_dic():
    """Walk every topic tree, collecting node words and assigning word ids.

    Each node's ``'text'`` is split on single spaces; the resulting list is
    stored in ``nodes_words[prefix][node['id']]`` and every word not yet in
    ``words_dic`` receives the next consecutive integer id (starting at 0 for
    each call).  Nodes are taken from the right end of a single stack of
    ``(prefix, node)`` pairs, so the most recently added node is visited
    first; this order determines the ids.
    """
    pending = deque()
    for prefix in prefixes:
        nodes_words[prefix] = {}
        pending.extend((prefix, top_node) for top_node in trees[prefix])

    next_id = 0
    while pending:
        prefix, node = pending.pop()
        tokens = node['text'].split(' ')
        nodes_words[prefix][node['id']] = tokens
        for token in tokens:
            if token in words_dic:
                continue
            words_dic[token] = next_id
            next_id += 1
        pending.extend((prefix, child) for child in node['children'])

# Build words_dic and nodes_words from the loaded trees.
get_words_dic()

In [7]:
def save_np(matrix, view_id):
    """Rescale ``matrix`` linearly to [-1, 1] and save it as ``views/<view_id>.npy``.

    The minimum of the matrix maps to -1 and the maximum to 1.  The work is
    done on a float copy, so the caller's array is left untouched.

    Args:
        matrix: numeric array-like to normalise.
        view_id: file stem; the array is written to ``views/<view_id>.npy``.

    Notes:
        * A matrix whose values are all equal (for example an all-zero view)
          has no range to scale by.  It is saved as all -1, the value the
          minimum maps to, instead of dividing by zero and saving NaNs.
        * The ``views`` directory is created when it does not exist yet.
    """
    scaled = np.array(matrix, dtype=float)  # copy; never mutate the argument
    if scaled.size:
        lowest = scaled.min()
        span = scaled.max() - lowest
        if span > 0:
            scaled -= lowest
            scaled /= span
            scaled *= 2
            scaled -= 1
        else:
            # Constant matrix: every cell is the minimum, which maps to -1.
            scaled.fill(-1.0)
    os.makedirs("views", exist_ok=True)
    with open("views/" + view_id + ".npy", 'wb') as f:
        np.save(f, scaled)

In [8]:
def set_assignments(roots, view_id):
    """Build and save the root-by-file matrix for one view.

    Args:
        roots: dict taking a root key (``prefix + root id``) to the id of the
            topic whose assignments should fill that root's row.
        view_id: suffix of the saved view; the matrix is handed to
            ``save_np`` under the name ``'files_' + view_id``.

    The matrix has one row per entry of ``root_nodes`` and one column per
    entry of ``data_files``; cells without an assignment stay 0.
    """
    view_files = np.zeros((len(root_nodes), len(data_files)))
    for root, topic in roots.items():
        weights = assignments[root2prefix[root]][topic]
        for file_id, weight in weights.items():
            view_files[root_nodes[root]][file_id] = weight
    save_np(view_files, 'files_' + view_id)

In [9]:
def set_words(roots, view_id):
    """Build and save the root-by-word matrix for one view.

    Args:
        roots: dict taking a root key (``prefix + root id``) to the id of the
            node whose words should fill that root's row.
        view_id: suffix of the saved view; the matrix is handed to
            ``save_np`` under the name ``'words_' + view_id``.

    The matrix has one row per entry of ``root_nodes`` and one column per
    entry of ``words_dic``.  Each word of the selected node gets the value
    stored for it in ``tf_idf`` for the root's prefix; other cells stay 0.
    """
    view_words = np.zeros((len(root_nodes), len(words_dic)))
    for root, node_id in roots.items():
        prefix = root2prefix[root]
        for word in nodes_words[prefix][node_id]:
            view_words[root_nodes[root]][words_dic[word]] = tf_idf[prefix][word]
    save_np(view_words, 'words_' + view_id)

In [10]:
def get_views():
    """Generate the file and word views for every level of the topic trees.

    The top-level nodes themselves are written as view ``'0_0'``.  After
    that, each root keeps a list of its descendants at the current depth
    (starting with its direct children); ``view_by_level`` is called once per
    depth, and the lists are then replaced by the next generation of
    children.  The walk stops after the first depth at which no root has any
    further children.
    """
    set_assignments(root2id, '0_0')
    set_words(root2id, '0_0')

    # Root key -> nodes of that root's subtree at the depth being processed.
    queues = {
        prefix + top_node['id']: list(top_node['children'])
        for prefix in prefixes
        for top_node in trees[prefix]
    }

    level = 0
    while True:
        level += 1
        view_by_level(queues, level)

        found_deeper = False
        for key in queues:
            next_generation = [
                child
                for node in queues[key]
                for child in node['children']
            ]
            if next_generation:
                found_deeper = True
            queues[key] = next_generation
        if not found_deeper:
            break

def view_by_level(queues, level):
    """Write one pair of views per position within the current level.

    Args:
        queues: dict taking each root key to its list of nodes at this level.
        level: depth number used as the first half of the view id.

    The longest list length is printed.  For each position ``i`` below that
    length, every root that has an ``i``-th node contributes that node's id,
    and the file and word views are saved under the id ``'<level>_<i>'``.
    """
    widest = max((len(nodes) for nodes in queues.values()), default=0)
    print(widest)
    for position in range(widest):
        roots = {
            root: queues[root][position]['id']
            for root in root_nodes
            if position < len(queues[root])
        }
        view_id = str(level) + '_' + str(position)
        set_assignments(roots, view_id)
        set_words(roots, view_id)

# Generate all views; view_by_level prints the longest per-root list length
# for each level it processes.
get_views()

15


In [16]:
def decode_words_view(level, row):
    """Print, for each root, the words present in a saved word view.

    Loads ``views/words_<level>_<row>.npy`` and, per entry of ``root_nodes``,
    prints a header line with the root key and its row index followed by
    every word of ``words_dic`` whose cell in that row differs from -1
    (the value the matrix minimum is mapped to when the view is saved).
    """
    path = "views/words_" + str(level) + "_" + str(row) + ".npy"
    with open(path, 'rb') as handle:
        view = np.load(handle)
    for root, row_index in root_nodes.items():
        print('\n', root, row_index)
        for word, column in words_dic.items():
            if view[row_index][column] != -1:
                print(word, end=', ')

In [17]:
def decode_files_view(level, row):
    """Print, for each root, the file indices present in a saved file view.

    Loads ``views/files_<level>_<row>.npy`` and, per entry of ``root_nodes``,
    prints a header line with the root key and its row index followed by
    every key of ``data_files`` whose cell in that row differs from -1
    (the value the matrix minimum is mapped to when the view is saved).
    """
    path = "views/files_" + str(level) + "_" + str(row) + ".npy"
    with open(path, 'rb') as handle:
        view = np.load(handle)
    for root, row_index in root_nodes.items():
        print('\n', root, row_index)
        for file_index in data_files:
            if view[row_index][file_index] != -1:
                print(file_index, end=', ')

In [21]:
# List, per root, the file indices whose cell is not -1 in views/files_1_14.npy.
decode_files_view(1, 14)


 propnZ29 0

 propnZ23 1

 propnZ216 2
41, 95, 98, 127, 147, 195, 292, 297, 299, 330, 337, 356, 443, 498, 515, 651, 652, 721, 726, 728, 
 propnZ222 3

 propnZ22 4

 propnZ210 5

 propnZ219 6

 propnZ213 7

 propnZ25 8
6, 50, 53, 57, 66, 75, 87, 106, 121, 195, 237, 310, 317, 410, 413, 426, 463, 519, 580, 592, 600, 621, 626, 692, 713, 
 propnZ28 9

 propnZ212 10

 propnZ221 11

 propnZ215 12

 propnZ218 13

 propnZ21 14

 propnZ24 15

 propnZ27 16

 propnZ217 17

 propnZ211 18

 propnZ26 19
91, 113, 124, 353, 357, 424, 437, 480, 590, 601, 633, 
 propnZ220 20

 propnZ214 21

 adjZ23 22
2, 8, 91, 104, 124, 128, 189, 282, 285, 300, 310, 384, 427, 516, 517, 528, 537, 557, 605, 623, 689, 718, 
 adjZ22 23

 adjZ25 24
45, 47, 81, 237, 269, 299, 460, 471, 600, 623, 662, 
 adjZ21 25
270, 286, 289, 
 adjZ24 26
49, 110, 484, 671, 
 locZ23 27
81, 132, 225, 230, 335, 526, 634, 646, 
 locZ22 28
36, 39, 552, 647, 683, 
 locZ25 29
49, 106, 142, 179, 386, 393, 554, 650, 
 locZ21 30
15, 46, 186, 297, 474

In [22]:
# List, per root, the words whose cell is not -1 in views/words_1_14.npy.
decode_words_view(1, 14)


 propnZ29 0

 propnZ23 1

 propnZ216 2
marco, suarez, aurelio, santo-domingo, bavaria, paez, 
 propnZ222 3

 propnZ22 4

 propnZ210 5

 propnZ219 6

 propnZ213 7

 propnZ25 8
seguro, eps, agro, 
 propnZ28 9

 propnZ212 10

 propnZ221 11

 propnZ215 12

 propnZ218 13

 propnZ21 14

 propnZ24 15

 propnZ27 16

 propnZ217 17

 propnZ211 18

 propnZ26 19
betancourt, raul, narino, pasto, guerrero, 
 propnZ220 20

 propnZ214 21

 adjZ23 22
cortar, previo, patrimonial, judicial, terminar, injustificado, rapido, 
 adjZ22 23

 adjZ25 24
beneficiar, avalar, general-nuevo, segundar-politico, bueno-importante, presidencial-presidencial, politico-mayor, 
 adjZ21 25
fallecer, tradicional-politico, vigente, ferreo, afectar, ubicar, caucana, 
 adjZ24 26
liberal-apoyar, poderoso-nacido, nuevo-conservador, politico-colombiano, efectivo, incluir, primero-publico, 
 locZ23 27
tavera, patino, santander-gobernacion, gobernacion-santander, parque-nacional, udes, teologia, 
 locZ22 28
cambio-radical, conseje

In [23]:
# List, per root, the file indices whose cell is not -1 in views/files_0_0.npy
# (the view built from the top-level nodes).
decode_files_view(0, 0)


 propnZ29 0
0, 17, 128, 162, 177, 255, 295, 296, 299, 371, 449, 512, 561, 728, 
 propnZ23 1
29, 32, 60, 110, 129, 138, 193, 215, 233, 238, 267, 278, 285, 290, 302, 308, 316, 333, 336, 487, 514, 598, 649, 651, 664, 693, 695, 727, 
 propnZ216 2
18, 47, 84, 121, 133, 230, 252, 283, 293, 351, 365, 413, 450, 459, 497, 533, 534, 543, 586, 617, 626, 648, 658, 693, 706, 722, 
 propnZ222 3
73, 342, 396, 435, 513, 539, 608, 620, 662, 724, 
 propnZ22 4
7, 20, 25, 31, 36, 73, 78, 82, 86, 94, 124, 128, 150, 162, 170, 171, 175, 181, 191, 192, 198, 201, 205, 212, 215, 216, 244, 258, 263, 280, 285, 289, 299, 310, 333, 367, 373, 400, 418, 431, 457, 461, 462, 477, 494, 496, 532, 558, 570, 588, 589, 591, 594, 601, 606, 621, 640, 653, 668, 669, 681, 683, 686, 697, 702, 710, 715, 717, 721, 
 propnZ210 5
46, 80, 89, 90, 94, 330, 368, 395, 475, 515, 520, 545, 578, 588, 651, 
 propnZ219 6
8, 13, 20, 45, 53, 64, 95, 108, 114, 131, 156, 198, 217, 239, 243, 276, 286, 310, 315, 317, 352, 389, 414, 421, 422, 434,

In [24]:
# List, per root, the words whose cell is not -1 in views/words_0_0.npy
# (the view built from the top-level nodes).
decode_words_view(0, 0)


 propnZ29 0
omar-yepes, oscar-ivan, barco, manizales, caldas, renan, victor, 
 propnZ23 1
gerlein, rosa, verano, char, barranquilla, atlantico, elsa, 
 propnZ216 2
rodolfo, asamblea, opcion-ciudadana, pin, universidad-santo, tomas, nestor, 
 propnZ222 3
besaile, cabrales, lyons, cordoba, monteria, musa, nono, 
 propnZ22 4
ernesto-samper, carlos-gaviria, caguan, andres-pastrana, farc, eln, comunista, 
 propnZ210 5
general, militar, brigada, comandante, comando, ejercito, operaciones, 
 propnZ219 6
luis-perez, sergio-fajardo, antioquia, medellin, federico, anibal-gaviria, universidad-antioquia, 
 propnZ213 7
serpa, horacio-serpa, liberal, partido-liberal, constituyente, concejo-bogota, diego, 
 propnZ25 8
huila, arias, seguro, tribunal, ingreso, agro, registrador, 
 propnZ28 9
samper, comunicaciones, daniel-coronell, radio, television, tiempo, caracol, 
 propnZ212 10
antanas-mockus, enrique-penalosa, penalosa, partido-verde, alcaldia, fajardo, mockus, 
 propnZ221 11
alejandro-ordonez, o