In [1]:
import json
import numpy as np
import csv
from collections import deque

In [2]:
# Module-level indexes filled in by load_nodes(). Node ids are always stored
# as prefix + node['id'].
# prefix -> list of top-level node dicts parsed from that prefix's topicTree.nodes.json
trees = {}
# prefixed id of each top-level node -> running index over all top-level nodes
root_nodes = {}
# prefixed id of every node -> running index over all nodes
nodes = {}
# word -> running index over all distinct words seen in node texts
words_dic = {}
# prefixed node id -> the node's 'text' split on single spaces
node_words = {}
# Each prefix selects one 'profiles/profiles_<prefix>/' directory.
# NOTE(review): the names look like POS / entity categories -- confirm.
prefixes = ['propn', 'verb', 'org', 'loc', 'per', 'adj', 'noun']
# prefixed node id -> list of parent ids (['ROOT'] for top-level nodes)
node_parent = {}
# prefixed node id -> node['data']['level']
node_level = {}
# prefixed node id -> list of prefixed child ids
node_children = {}

def load_nodes():
    """Read every profile's topic tree and fill the module-level indexes.

    For each entry of `prefixes` the file
    'profiles/profiles_<prefix>/topicTree.nodes.json' is parsed and walked
    breadth-first. Along the way the function fills `root_nodes`,
    `node_parent`, `nodes`, `node_words`, `node_level`, `node_children`,
    `words_dic` and finally `trees`. All node ids are stored as
    prefix + node['id'].
    """
    node_index = 0
    root_index = 0
    word_index = 0
    for prefix in prefixes:
        tree_path = 'profiles/profiles_' + prefix + '/topicTree.nodes.json'
        with open(tree_path, 'r') as tree_file:
            top_level = json.load(tree_file)

        # Seed the traversal with the top-level nodes; they hang off 'ROOT'.
        pending = deque()
        for top in top_level:
            top_id = prefix + top['id']
            root_nodes[top_id] = root_index
            node_parent[top_id] = ['ROOT']
            root_index += 1
            pending.append(top)

        while pending:
            current = pending.popleft()
            current_id = prefix + current['id']
            nodes[current_id] = node_index
            node_index += 1
            words = current['text'].split(' ')
            node_words[current_id] = words
            node_level[current_id] = current['data']['level']
            child_ids = []
            node_children[current_id] = child_ids
            # Give every previously unseen word the next free column index.
            for word in words:
                if word not in words_dic:
                    words_dic[word] = word_index
                    word_index += 1
            for child in current['children']:
                child_id = prefix + child['id']
                node_parent[child_id] = [current_id]
                child_ids.append(child_id)
                pending.append(child)

        trees[prefix] = top_level

load_nodes()

In [3]:
# prefixed topic id -> {document index (int): value stored with that document}
assignments = {}

def load_assignments():
    """Fill `assignments` from each profile's myAssignment.topics.json.

    Every entry of the JSON list contributes one key, prefix + entry['topic'],
    whose value maps int(doc[0]) to doc[1] for each item of entry['doc'].
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myAssignment.topics.json'
        with open(path, 'r') as handle:
            topic_entries = json.load(handle)
        for entry in topic_entries:
            key = prefix + entry['topic']
            per_doc = assignments[key] = {}
            for doc in entry['doc']:
                per_doc[int(doc[0])] = doc[1]

load_assignments()

In [4]:
# line number in the file list (0-based) -> file name on that line
data_files = {}

def load_data_files(path='profiles/profiles_verb/myData.files.txt'):
    """Load the document list into `data_files` as {line index: file name}.

    Args:
        path: Text file with one file name per line. Defaults to the list
            shipped with the 'verb' profile.
            NOTE(review): only this one profile's list is read; presumably
            every profile shares the same document order -- confirm.

    The file is split with `str.splitlines()`, so a trailing newline (or an
    empty file) no longer produces a phantom empty file name that would make
    `len(data_files)` -- and every matrix sized from it -- one column too
    wide. Blank lines in the middle are kept so line indexes stay aligned.
    Entries from a previous call are discarded before loading.
    """
    with open(path, 'r') as read_file:
        names = read_file.read().splitlines()
    data_files.clear()
    data_files.update(enumerate(names))

# Load the document list now; later cells size their file views with len(data_files).
load_data_files()

In [5]:
# prefix -> {first CSV column: fourth CSV column}, values kept as strings
tf_idf = {}

def load_words():
    """Fill `tf_idf` from each profile's myData.dict.csv.

    For every prefix the CSV is read row by row and column 0 is mapped to
    column 3 (kept as the string read from the file).
    NOTE(review): column 3 is presumably the word's tf-idf weight -- confirm
    against the file format.

    The file is opened with newline='' as the csv module requires, and
    completely empty rows (e.g. a blank line at the end of the file) are
    skipped instead of raising IndexError.
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myData.dict.csv'
        with open(path, 'r', newline='') as read_file:
            tf_idf[prefix] = {
                row[0]: row[3] for row in csv.reader(read_file) if row
            }

load_words()

In [6]:
def save_np(folder, matrix, view_id):
    """Rescale `matrix` in place to [-1, 1] and save it as a .npy file.

    Args:
        folder: Directory prefix; it is concatenated as-is, so it must end
            with a path separator.
        matrix: Floating-point numpy array. It is modified in place: the
            minimum maps to -1 and the maximum to 1.
        view_id: File name without extension; output is folder + view_id + ".npy".

    Edge cases:
        * A constant matrix has no range to scale by. Every entry equals the
          minimum, so it is filled with -1 instead of dividing by zero and
          saving NaN.
        * An empty matrix is saved unchanged (min()/max() are undefined).
    """
    if matrix.size > 0:
        m_min = matrix.min()
        m_max = matrix.max()
        if m_max == m_min:
            matrix.fill(-1)
        else:
            matrix -= m_min
            matrix /= (m_max - m_min)
            matrix *= 2
            matrix -= 1
    with open(folder + view_id + ".npy", 'wb') as f:
        np.save(f, matrix)

In [7]:
def get_views_by_node():
    """Build and save one file view and one word view with a row per node.

    Row indexes come from `nodes`. The file view stores the values of
    `assignments[node]` at their document index; the word view stores, for
    each word of the node, the `tf_idf` entry of the node's profile (the part
    of the node id before the first 'Z') at the word's `words_dic` index.
    Both matrices are written through save_np under 'views/profiles/'.
    """
    view_files = np.zeros((len(nodes), len(data_files)))
    view_words = np.zeros((len(nodes), len(words_dic)))
    for node_id, row in nodes.items():
        for doc_index, value in assignments[node_id].items():
            view_files[row, doc_index] = value
        for word in node_words[node_id]:
            profile = node_id[:node_id.index('Z')]
            view_words[row, words_dic[word]] = tf_idf[profile][word]
    save_np('views/profiles/', view_words, 'words_' + 'nodes')
    save_np('views/profiles/', view_files, 'files_' + 'nodes')

In [8]:
def set_assignments_by_level(roots, view_id):
    """Save a file view with one row per top-level node.

    Args:
        roots: Mapping from a top-level node id to the id of the node whose
            assignments should fill that top-level node's row.
        view_id: Suffix of the output file 'views/all/files_<view_id>.npy'.
    """
    view_files = np.zeros((len(root_nodes), len(data_files)))
    for root_id, member_id in roots.items():
        member_docs = assignments[member_id]
        for doc_index, value in member_docs.items():
            view_files[root_nodes[root_id], doc_index] = value
    save_np('views/all/', view_files, 'files_' + view_id)

In [9]:
def set_words_by_level(roots, view_id):
    """Save a word view with one row per top-level node.

    Args:
        roots: Mapping from a top-level node id to the id of the node whose
            words should fill that top-level node's row.
        view_id: Suffix of the output file 'views/all/words_<view_id>.npy'.

    The stored value is the `tf_idf` entry of the top-level node's profile,
    i.e. the part of its id before the first 'Z'.
    """
    view_words = np.zeros((len(root_nodes), len(words_dic)))
    for root_id, member_id in roots.items():
        for word in node_words[member_id]:
            profile = root_id[:root_id.index('Z')]
            weight = tf_idf[profile][word]
            view_words[root_nodes[root_id], words_dic[word]] = weight
    save_np('views/all/', view_words, 'words_' + view_id)

In [10]:
def get_views_by_level():
    """Emit views level by level for every top-level tree.

    A frontier (list of node dicts) is kept per top-level node, starting with
    the node itself. At each step view_by_level is called on the current
    frontiers, then each frontier is replaced by the children of its members.
    The loop stops after the first step in which no frontier had children.
    """
    frontiers = {
        prefix + top['id']: [top]
        for prefix in prefixes
        for top in trees[prefix]
    }
    level = 0
    while True:
        view_by_level(frontiers, level)
        any_children = False
        for q_id in frontiers:
            next_frontier = [
                child
                for member in frontiers[q_id]
                for child in member['children']
            ]
            if next_frontier:
                any_children = True
            frontiers[q_id] = next_frontier
        level += 1
        if not any_children:
            break

def view_by_level(queues, level):
    """Save the file and word views for one tree level.

    Args:
        queues: Mapping from top-level node id to the list of node dicts that
            belong to that tree at the current level.
        level: Level number, used as the first part of the view id.

    The length of the longest queue is printed. For each position i below
    that length, the i-th node of every queue that is long enough is
    collected, and the views '<level>_<i>' are written.
    """
    widest = max((len(members) for members in queues.values()), default=0)
    print(widest)
    for position in range(widest):
        roots = {
            root: root[:root.index('Z')] + queues[root][position]['id']
            for root in root_nodes
            if position < len(queues[root])
        }
        view_id = str(level) + '_' + str(position)
        set_assignments_by_level(roots, view_id)
        set_words_by_level(roots, view_id)

# get_views_by_level()

In [11]:
def decode_words_view(folder, nodes_id, view_id):
    """Print, for every node, the words whose entry in a saved word view is not -1.

    Args:
        folder: Directory prefix of the view file (concatenated as-is).
        nodes_id: Mapping from node id to its row index in the view.
        view_id: Suffix of the file name 'words_<view_id>.npy'.
    """
    view_path = folder + "words_" + view_id + ".npy"
    with open(view_path, 'rb') as f:
        view = np.load(f)
    for node in nodes_id:
        print('\n', node, nodes_id[node])
        for word, column in words_dic.items():
            # -1 is the value save_np assigns to the matrix minimum.
            if view[nodes_id[node]][column] == -1:
                continue
            print(word, end=', ')

def decode_files_view(folder, nodes_id, view_id):
    """Print, for every node, the document indexes whose entry in a saved file view is not -1.

    Args:
        folder: Directory prefix of the view file (concatenated as-is).
        nodes_id: Mapping from node id to its row index in the view.
        view_id: Suffix of the file name 'files_<view_id>.npy'.
    """
    view_path = folder + "files_" + view_id + ".npy"
    with open(view_path, 'rb') as f:
        view = np.load(f)
    for node in nodes_id:
        print('\n', node, nodes_id[node])
        for doc_index in data_files:
            # -1 is the value save_np assigns to the matrix minimum.
            if view[nodes_id[node]][doc_index] == -1:
                continue
            print(doc_index, end=', ')

#decode_files_view('views/nodes/', nodes, 'nodes')
#decode_words_view('views/nodes/', nodes, 'nodes')

In [12]:
# running node index -> prefixed node id (the inverse of `nodes`)
id2node = {}

def get_ids2nodes():
    """Fill `id2node` with the inverse of the `nodes` mapping."""
    id2node.update((index, node_id) for node_id, index in nodes.items())
get_ids2nodes()

def read_results(name):
    """Return the numpy object stored in the file `name` (opened in binary mode)."""
    with open(name, 'rb') as handle:
        loaded = np.load(handle)
    return loaded

In [13]:
def process_clusters(clusters):
    """Merge same-level nodes that share a cluster under new synthetic 'U<n>' nodes.

    Args:
        clusters: Sequence of cluster labels; position i holds the label of
            the node `id2node[i]`.

    For every cluster the member nodes are grouped by `node_level`. A group
    is merged when it has at least two nodes and they are not already
    siblings under one common non-ROOT parent. Merging
      * creates a node 'U<n>' whose children are the group's nodes,
      * makes 'U<n>' the sole parent of each of those nodes,
      * records every former parent (possibly 'ROOT') as a parent of 'U<n>',
        and, for non-ROOT parents, replaces the group's nodes by 'U<n>' in
        the parent's child list,
      * gives 'U<n>' up to 7 distinct words, taken round-robin by position
        from the member nodes' word lists.

    The module-level `node_parent`, `node_children` and `node_words` are
    mutated in place.

    Differences from the previous implementation:
      * `node_words['U<n>']` is an ordered list (like every other entry of
        `node_words`) instead of a set, so rendered output no longer depends
        on string hash randomisation.
      * Former parents are recorded in first-seen order rather than set order.
      * A 'U<n>' name that already exists in `node_parent` (from an earlier
        call) is skipped instead of being overwritten.
    """
    max_label_words = 7

    # cluster label -> indexes of the nodes carrying that label
    members_by_cluster = {}
    for node_index, label in enumerate(clusters):
        members_by_cluster.setdefault(label, []).append(node_index)

    cnt = 0
    for member_indexes in members_by_cluster.values():
        # tree level -> node ids of this cluster on that level
        groups = {}
        for node_index in member_indexes:
            name = id2node[node_index]
            groups.setdefault(node_level[name], []).append(name)

        for group in groups.values():
            if len(group) <= 1:
                continue
            # Distinct current parents of the group, in first-seen order.
            parents = list(dict.fromkeys(node_parent[node][0] for node in group))
            if len(parents) == 1 and parents[0] != 'ROOT':
                # Already siblings under one real parent: nothing to merge.
                continue

            u_name = 'U' + str(cnt)
            while u_name in node_parent:
                # Do not clobber a synthetic node created by an earlier call.
                cnt += 1
                u_name = 'U' + str(cnt)
            cnt += 1

            node_parent[u_name] = []
            node_children[u_name] = []
            for node in group:
                node_parent[node] = [u_name]
                node_children[u_name].append(node)
            for parent in parents:
                node_parent[u_name].append(parent)
                if parent != 'ROOT':
                    for node in group:
                        if node in node_children[parent]:
                            node_children[parent].remove(node)
                    node_children[parent].append(u_name)

            # Round-robin over word positions: first word of every member,
            # then second word of every member, ... until enough distinct
            # words are collected or every member's list is exhausted.
            label_words = []
            position = 0
            while len(label_words) < max_label_words:
                exhausted = True
                for node in group:
                    if len(node_words[node]) > position:
                        exhausted = False
                        word = node_words[node][position]
                        if word not in label_words:
                            label_words.append(word)
                        if len(label_words) >= max_label_words:
                            break
                if exhausted:
                    break
                position += 1
            node_words[u_name] = label_words

In [14]:
# Accumulates the HTML lines produced by html_dfs until create_topic_graph writes them out.
html = []

def html_dfs(node):
    """Append the nested <li>/<ul> markup for `node` and its descendants to `html`.

    Each item is labelled '<node id>: <space-joined node words>'. Children are
    taken from `node_children` and rendered depth-first inside a <ul>.
    """
    label = node + ': ' + ' '.join(node_words[node])
    children = node_children[node]
    if children:
        html.append('<li>' + label + '<ul>')
        for child in children:
            html_dfs(child)
        html.append('</ul></li>')
    else:
        html.append('<li>' + label + '</li>')

def create_topic_graph(name):
    """Render every tree to 'graphs/<name>.html' and print how many were written.

    A node counts as a tree root when its only parent in `node_parent` is
    'ROOT'. The lines collected in the module-level `html` list are joined
    with newlines, written out, and the list is then reset.
    """
    global html
    root_ids = [
        node
        for node, parents in node_parent.items()
        if len(parents) == 1 and parents[0] == 'ROOT'
    ]
    for root_id in root_ids:
        html_dfs(root_id)
    with open('graphs/' + name + '.html', 'w') as f:
        f.write('\n'.join(html))
    html = []
    print(len(root_ids), "trees created")

In [15]:
# get_views_by_node()

In [16]:
# Regroup the loaded trees according to the saved cluster labels, then write
# the resulting forest to graphs/fusion_70.html.
process_clusters(read_results('clusters/profiles_70_0.1_1000.0_0.01.npy'))
create_topic_graph('fusion_70')

32 trees created
