In [None]:
import json
import numpy as np
import csv
from collections import deque

In [2]:
# Module-level lookup tables; all of them are filled in by load_nodes().
# prefix -> parsed content of that prefix's topicTree.nodes.json (its top-level nodes).
trees = {}
# prefix + id of a top-level node -> running index (numbered across all prefixes).
root_nodes = {}
# prefix + id of every node reached from the top-level nodes -> running index.
nodes = {}
# word -> running index, assigned the first time the word is seen in a node's text.
words_dic = {}
# prefix + node id -> the node's 'text' field split on single spaces.
nodes_words = {}
# Each entry selects one input directory, 'profiles/profiles_<prefix>/'.
prefixes = ['propn', 'adj', 'loc', 'misc', 'noun', 'org', 'per', 'verb']

def load_nodes():
    """Read every prefix's topic tree and fill the module-level lookup tables.

    For each prefix the file 'profiles/profiles_<prefix>/topicTree.nodes.json'
    is parsed. Its top-level entries are numbered in ``root_nodes``; then the
    whole tree is walked breadth-first, numbering each node in ``nodes``,
    storing its space-separated 'text' in ``nodes_words`` and giving every
    previously unseen word the next index in ``words_dic``. The parsed JSON
    is kept in ``trees[prefix]``.

    All three counters start at 0 on every call and run across all prefixes.
    """
    next_node_idx = 0
    next_root_idx = 0
    next_word_idx = 0

    for prefix in prefixes:
        tree_path = 'profiles/profiles_' + prefix + '/topicTree.nodes.json'
        with open(tree_path, 'r') as handle:
            forest = json.load(handle)

        # Top-level entries get their own numbering, separate from `nodes`.
        for top in forest:
            root_nodes[prefix + top['id']] = next_root_idx
            next_root_idx += 1

        # Breadth-first walk, seeded with the top-level entries.
        pending = deque(forest)
        while pending:
            current = pending.popleft()
            key = prefix + current['id']
            nodes[key] = next_node_idx
            next_node_idx += 1

            tokens = current['text'].split(' ')
            nodes_words[key] = tokens
            for token in tokens:
                if token in words_dic:
                    continue
                words_dic[token] = next_word_idx
                next_word_idx += 1

            pending.extend(current['children'])

        trees[prefix] = forest

load_nodes()

In [3]:
# prefix + topic id -> {document index (int): value taken from the 'doc' pair}
assignments = {}

def load_assignments():
    """Fill ``assignments`` from each prefix's 'myAssignment.topics.json'.

    Every entry of the JSON file contributes one inner dict, keyed by
    ``prefix + entry['topic']``. Each element of ``entry['doc']`` is read as a
    pair: element 0 is converted with ``int`` and used as the key, element 1
    is stored unchanged as the value.
    """
    for prefix in prefixes:
        topics_path = 'profiles/profiles_' + prefix + '/myAssignment.topics.json'
        with open(topics_path, 'r') as handle:
            topic_entries = json.load(handle)

        for entry in topic_entries:
            assignments[prefix + entry['topic']] = {
                int(pair[0]): pair[1] for pair in entry['doc']
            }

load_assignments()

In [4]:
# line number (0-based) -> line text of profiles_propn/myData.files.txt
data_files = {}

def load_data_files():
    """Index the lines of 'profiles/profiles_propn/myData.files.txt' in ``data_files``.

    The file content is split on '\\n' and each piece is stored under its
    0-based position.
    """
    with open('profiles/profiles_propn/myData.files.txt', 'r') as handle:
        entries = handle.read().split('\n')
    # NOTE(review): if the file ends with a newline, split('\n') yields a final
    # empty string, which is stored as one extra entry. Kept as is because
    # len(data_files) determines the width of the saved matrices.
    data_files.update(enumerate(entries))

load_data_files()

In [5]:
# prefix -> {column 0 of myData.dict.csv: column 3 of the same row (kept as text)}
tf_idf = {}

def load_words():
    """Fill ``tf_idf`` from each prefix's 'myData.dict.csv'.

    For every row, the first column becomes the key and the fourth column the
    value of ``tf_idf[prefix]``. Values are stored exactly as read (strings).
    Presumably column 0 is the word and column 3 its TF-IDF score; TODO confirm
    against the file layout.

    Raises:
        IndexError: if a non-blank row has fewer than four columns.
    """
    for prefix in prefixes:
        # The csv module expects files opened with newline='' so that it can
        # handle line endings (and newlines inside quoted fields) itself.
        with open('profiles/profiles_' + prefix + '/myData.dict.csv', 'r', newline='') as read_file:
            tmp = {}
            for row in csv.reader(read_file):
                if not row:
                    # csv.reader yields [] for a blank line; there is nothing to record.
                    continue
                tmp[row[0]] = row[3]
            tf_idf[prefix] = tmp

load_words()

In [6]:
def save_np(folder, matrix, view_id):
    """Rescale ``matrix`` linearly to [-1, 1] and save it as ``folder + view_id + '.npy'``.

    The smallest entry maps to exactly -1 and the largest to 1. A matrix whose
    entries are all equal (e.g. all zeros) is saved as all -1 instead of NaN,
    and an empty matrix is saved unchanged.

    The rescaling is done on a copy, so the caller's array is left untouched.
    Floating-point input keeps its dtype; any other input is converted to
    ``float`` first so that the division is valid.

    Args:
        folder: Path prefix, concatenated as is (include the trailing separator).
        matrix: Array-like of numbers to normalise.
        view_id: File name without the '.npy' extension.
    """
    scaled = np.array(matrix)  # np.array copies, so `matrix` itself is never modified
    if not np.issubdtype(scaled.dtype, np.floating):
        scaled = scaled.astype(float)

    if scaled.size > 0:
        m_min = scaled.min()
        span = scaled.max() - m_min
        scaled -= m_min
        if span != 0:
            # Skipped for a constant matrix: everything is already 0 and ends up at -1.
            scaled /= span
        scaled *= 2
        scaled -= 1

    with open(folder + view_id + ".npy", 'wb') as f:
        np.save(f, scaled)

In [27]:
def get_views_by_node():
    """Build and save one files view and one words view with a row per node.

    Row ``nodes[key]`` of the files matrix receives the values stored in
    ``assignments[key]`` at the document columns; the same row of the words
    matrix receives, for every word of the node, the value looked up in the
    ``tf_idf`` table of the node's prefix. Both matrices are written under
    'views/nodes/' through ``save_np`` (words first, then files).
    """
    node_count = len(nodes)
    view_files = np.zeros((node_count, len(data_files)))
    view_words = np.zeros((node_count, len(words_dic)))

    for key, row in nodes.items():
        for doc_idx, value in assignments[key].items():
            view_files[row, doc_idx] = value

        # The prefix is whatever precedes the first 'Z' of the key
        # (assumes node ids start with 'Z' - TODO confirm).
        prefix_scores = tf_idf[key[:key.index('Z')]]
        for token in nodes_words[key]:
            view_words[row, words_dic[token]] = prefix_scores[token]

    save_np('views/nodes/', view_words, 'words_' + 'nodes')
    save_np('views/nodes/', view_files, 'files_' + 'nodes')

get_views_by_node()

In [7]:
def set_assignments_by_level(roots, view_id):
    """Save a files view with one row per root node.

    Args:
        roots: Mapping from a root key (a key of ``root_nodes``) to the key of
            the node whose assignments should fill that root's row.
        view_id: Suffix of the output name; the matrix is written through
            ``save_np`` as 'views/all/files_<view_id>'.
    """
    view_files = np.zeros((len(root_nodes), len(data_files)))
    for root_key, node_key in roots.items():
        row = root_nodes[root_key]
        for doc_idx, value in assignments[node_key].items():
            view_files[row, doc_idx] = value
    save_np('views/all/', view_files, 'files_' + view_id)

In [8]:
def set_words_by_level(roots, view_id):
    """Save a words view with one row per root node.

    Args:
        roots: Mapping from a root key (a key of ``root_nodes``) to the key of
            the node whose words should fill that root's row.
        view_id: Suffix of the output name; the matrix is written through
            ``save_np`` as 'views/all/words_<view_id>'.
    """
    view_words = np.zeros((len(root_nodes), len(words_dic)))
    for root_key, node_key in roots.items():
        row = root_nodes[root_key]
        for token in nodes_words[node_key]:
            # Scores come from the table of the root's prefix: the part of the
            # root key before its first 'Z'.
            score = tf_idf[root_key[:root_key.index('Z')]][token]
            view_words[row, words_dic[token]] = score
    save_np('views/all/', view_words, 'words_' + view_id)

In [9]:
def get_views_by_level():
    """Walk all trees level by level and emit the views for each level.

    A frontier is kept per top-level node, initially holding just that node.
    For each level ``view_by_level`` is called with the current frontiers,
    then every frontier is replaced by the children of its members. The walk
    stops after the first level at which no frontier produced any child.
    """
    frontier = {
        prefix + top['id']: [top]
        for prefix in prefixes
        for top in trees[prefix]
    }

    level = 0
    while True:
        view_by_level(frontier, level)

        found_children = False
        for root_key in frontier:
            descendants = [
                child
                for member in frontier[root_key]
                for child in member['children']
            ]
            if descendants:
                found_children = True
            frontier[root_key] = descendants

        level += 1
        if not found_children:
            break

def view_by_level(queues, level):
    """Write the files/words views for one tree level.

    Prints the length of the longest queue, then for every position ``i``
    below that length builds a mapping from each root key to the key of the
    ``i``-th node in that root's queue (roots with a shorter queue are left
    out) and saves both views under the id '<level>_<i>'.

    Args:
        queues: Mapping from root key to the list of node dicts at this level.
        level: Level number, used only to build the view id.
    """
    widest = max((len(members) for members in queues.values()), default=0)
    print(widest)

    for slot in range(widest):
        roots = {
            root_key: root_key[:root_key.index('Z')] + queues[root_key][slot]['id']
            for root_key in root_nodes
            if slot < len(queues[root_key])
        }
        view_id = '_'.join((str(level), str(slot)))
        set_assignments_by_level(roots, view_id)
        set_words_by_level(roots, view_id)

get_views_by_level()

1
15


In [29]:
def decode_words_view(folder, nodes_id, view_id):
    """Print, for each node, the words whose entry in a saved words view is not -1.

    Args:
        folder: Path prefix of the view file (concatenated as is).
        nodes_id: Mapping from node key to its row index in the view.
        view_id: Name part of the file, loaded from
            ``folder + 'words_' + view_id + '.npy'``.
    """
    with open(folder + "words_" + view_id + ".npy", 'rb') as handle:
        view = np.load(handle)

    for name, row_idx in nodes_id.items():
        print('\n', name, row_idx)
        for token, col_idx in words_dic.items():
            if view[row_idx][col_idx] == -1:
                continue
            print(token, end=', ')

In [34]:
def decode_files_view(folder, nodes_id, view_id):
    """Print, for each node, the document indices whose entry in a saved files view is not -1.

    Args:
        folder: Path prefix of the view file (concatenated as is).
        nodes_id: Mapping from node key to its row index in the view.
        view_id: Name part of the file, loaded from
            ``folder + 'files_' + view_id + '.npy'``.
    """
    with open(folder + "files_" + view_id + ".npy", 'rb') as handle:
        view = np.load(handle)

    for name, row_idx in nodes_id.items():
        print('\n', name, row_idx)
        for doc_idx in data_files:
            if view[row_idx][doc_idx] == -1:
                continue
            print(doc_idx, end=', ')


In [37]:
#decode_files_view('views/nodes/', nodes, 'nodes')

In [38]:
#decode_words_view('views/nodes/', nodes, 'nodes')