In [1]:
import csv
import json
from collections import deque

import numpy as np

In [2]:
# Profile name suffixes: each one selects a `profiles/profiles_<prefix>/` input
# directory and is prepended to the ids of the nodes loaded from it.
prefixes = ['propn', 'verb', 'org', 'loc', 'per', 'adj', 'noun']

In [3]:
def load_nodes():
    """Load every profile's topic tree into the module-level node tables.

    For each entry of ``prefixes`` the file
    ``profiles/profiles_<prefix>/topicTree.nodes.json`` is read and walked
    breadth-first. Node ids are stored as ``prefix + node['id']``.

    Fills ``nodes`` (id -> running index), ``node_words`` (id -> list of words
    from ``node['text']``), ``node_level``, ``node_children``, ``node_parent``
    (top-level nodes get ``['ROOT']``) and ``words_dic`` (word -> index in
    first-seen order, counted from 0 within this call).
    """
    next_word_index = 0
    for prefix in prefixes:
        tree_path = 'profiles/profiles_' + prefix + '/topicTree.nodes.json'
        with open(tree_path, 'r') as read_file:
            top_level = json.load(read_file)

        # Top-level entries hang directly below the synthetic ROOT.
        for root in top_level:
            node_parent[prefix + root['id']] = ['ROOT']
        pending = deque(top_level)

        while pending:
            current = pending.popleft()
            current_id = prefix + current['id']
            nodes[current_id] = len(nodes)
            tokens = current['text'].split(' ')
            node_words[current_id] = tokens
            node_level[current_id] = current['data']['level']
            node_children[current_id] = []

            for token in tokens:
                if token not in words_dic:
                    words_dic[token] = next_word_index
                    next_word_index += 1

            for child in current['children']:
                child_id = prefix + child['id']
                node_parent[child_id] = [current_id]
                node_children[current_id].append(child_id)
                pending.append(child)

In [4]:
def load_assignments():
    """Load the per-topic document assignments of every profile.

    Reads ``profiles/profiles_<prefix>/myAssignment.topics.json`` for each
    entry of ``prefixes`` and stores, under ``prefix + topic``, a dict mapping
    ``int(doc[0])`` to ``doc[1]`` for every item of the topic's ``'doc'`` list
    in the module-level ``assignments`` table.
    """
    for prefix in prefixes:
        topics_path = 'profiles/profiles_' + prefix + '/myAssignment.topics.json'
        with open(topics_path, 'r') as read_file:
            topic_entries = json.load(read_file)
        for entry in topic_entries:
            key = prefix + entry['topic']
            assignments[key] = {int(doc[0]): doc[1] for doc in entry['doc']}

In [5]:
def load_data_files(path='profiles/profiles_verb/myData.files.txt'):
    """Fill ``data_files`` with one entry per line of the file list.

    Args:
        path: Text file holding one file name per line. Defaults to the
            list stored with the ``verb`` profile, which is what this
            function always read before the parameter existed.

    Each line is stored in the module-level ``data_files`` dict under its
    zero-based line number, with the trailing newline removed. Iterating the
    file line by line (instead of ``read().split('\\n')``) means a final
    newline no longer registers a phantom empty file name, which used to
    inflate ``len(data_files)`` by one. Blank lines in the middle of the file
    still occupy an index.
    """
    with open(path, 'r') as read_file:
        for index, line in enumerate(read_file):
            data_files[index] = line.rstrip('\n')

In [6]:
def load_words():
    """Load the word dictionary CSV of every profile into ``tf_idf``.

    For each entry of ``prefixes`` the file
    ``profiles/profiles_<prefix>/myData.dict.csv`` is parsed and
    ``tf_idf[prefix]`` becomes a dict mapping the first column of each row to
    its fourth column. Values are kept as the strings read from the CSV.
    (Presumably the fourth column is the tf-idf weight, going by the variable
    names - confirm against the file format.)
    """
    for prefix in prefixes:
        dict_path = 'profiles/profiles_' + prefix + '/myData.dict.csv'
        with open(dict_path, 'r') as read_file:
            tf_idf[prefix] = {row[0]: row[3] for row in csv.reader(read_file)}

In [7]:
def save_np(folder, matrix, view_id):
    """Min-max scale ``matrix`` to [-1, 1] and save it as ``<folder><view_id>.npy``.

    Args:
        folder: Destination prefix; it is concatenated as-is with ``view_id``,
            so it must already end with a path separator.
        matrix: NumPy array to normalise. Floating-point arrays are rescaled
            in place (as before); other dtypes are converted to a float copy
            first because the in-place division cannot be applied to them.
        view_id: File name without the ``.npy`` extension.

    Raises:
        ValueError: from ``matrix.min()`` when the array is empty.
    """
    if not np.issubdtype(matrix.dtype, np.inexact):
        # Integer/bool input: `/=` would raise, so work on a float copy.
        matrix = matrix.astype(float)

    m_min = matrix.min()
    m_max = matrix.max()
    value_range = m_max - m_min
    if value_range == 0:
        # Constant matrix: dividing by zero would write NaN to disk. Treat the
        # range as 1 so every entry lands on the lower bound (-1).
        value_range = 1

    matrix -= m_min
    matrix /= value_range
    matrix *= 2
    matrix -= 1
    with open(folder + view_id + ".npy", 'wb') as f:
        np.save(f, matrix)

In [8]:
def get_views_by_node():
    """Build the node-by-file and node-by-word matrices and save both.

    Row ``nodes[node]`` of the file view receives the values stored in
    ``assignments[node]`` (column = file index); the same row of the word view
    receives ``tf_idf[<profile>][word]`` for each word of the node, where
    ``<profile>`` is the part of the node id before its first ``'Z'``
    (load_nodes builds ids as prefix + raw id, so this presumably recovers the
    prefix - assumes raw ids start with 'Z').

    Both matrices are handed to ``save_np`` under ``views/profiles/`` as
    ``words_nodes`` and ``files_nodes``.
    """
    node_count = len(nodes)
    view_files = np.zeros((node_count, len(data_files)))
    view_words = np.zeros((node_count, len(words_dic)))

    for node, row in nodes.items():
        file_row = view_files[row]
        for file, weight in assignments[node].items():
            file_row[file] = weight

        word_row = view_words[row]
        for word in node_words[node]:
            profile = node[:node.index('Z')]
            word_row[words_dic[word]] = tf_idf[profile][word]

    save_np('views/profiles/', view_words, 'words_' + 'nodes')
    save_np('views/profiles/', view_files, 'files_' + 'nodes')

In [9]:
def get_ids2nodes():
    """Populate ``id2node`` with the inverse of ``nodes`` (index -> node id)."""
    for node_name, node_index in nodes.items():
        id2node[node_index] = node_name

def read_results(name):
    """Return the object stored in the NumPy file at path ``name``."""
    with open(name, 'rb') as handle:
        loaded = np.load(handle)
    return loaded

In [10]:
def _merge_words_round_robin(word_lists, limit=7):
    """Interleave ``word_lists`` position by position into a de-duplicated list.

    Takes the first word of every list, then the second of every list, and so
    on, skipping words already collected, until ``limit`` distinct words were
    gathered or every list is exhausted.
    """
    merged = []
    position = 0
    while True:
        exhausted = True
        for words in word_lists:
            if len(words) > position:
                exhausted = False
                word = words[position]
                if word not in merged:
                    merged.append(word)
                    if len(merged) >= limit:
                        return merged
        if exhausted:
            return merged
        position += 1


def process_clusters(clusters):
    """Apply a clustering to the topic trees, inserting union nodes.

    Args:
        clusters: Sequence whose k-th element is the cluster id of the node
            with index k (resolved through ``id2node``).

    Every node gets its cluster id recorded in ``node_group``. Within each
    cluster, nodes are bucketed by ``node_level``; a bucket with more than one
    node whose members do not all share a single non-ROOT parent is gathered
    under a new union node named ``'U<n>'``:

    * the union node is registered in ``nodes`` and ``node_group``;
    * the members become its children and get it as their only parent;
    * the members' former parents become the union node's parents, and each
      non-ROOT former parent swaps those members for the union node in its
      child list;
    * the union node's words are up to 7 distinct words taken round-robin
      from its members' word lists.

    Parents and words are kept in first-seen order (lists rather than sets)
    so that labels and link order in the exported graphs are identical from
    run to run, and so ``node_words`` holds a list for every node.
    """
    # Cluster id -> indices of the nodes assigned to it, in input order.
    members_by_cluster = {}
    for node_index, cluster_id in enumerate(clusters):
        members_by_cluster.setdefault(cluster_id, []).append(node_index)

    union_count = 0
    for cluster_id, member_indices in members_by_cluster.items():
        # Bucket the cluster's nodes by tree level.
        by_level = {}
        for node_index in member_indices:
            name = id2node[node_index]
            level = node_level[name]
            node_group[name] = cluster_id
            by_level.setdefault(level, []).append(name)

        for level_nodes in by_level.values():
            if len(level_nodes) <= 1:
                continue

            # Distinct current parents of the bucket, in first-seen order.
            parents = list(dict.fromkeys(node_parent[node][0] for node in level_nodes))
            if len(parents) == 1 and parents[0] != 'ROOT':
                # Already siblings under one real parent: nothing to merge.
                continue

            u_name = 'U' + str(union_count)
            union_count += 1
            node_parent[u_name] = []
            node_children[u_name] = []
            nodes[u_name] = len(nodes)
            node_group[u_name] = cluster_id

            for node in level_nodes:
                node_parent[node] = [u_name]
                node_children[u_name].append(node)

            for parent in parents:
                node_parent[u_name].append(parent)
                if parent != 'ROOT':
                    # Replace the regrouped members with the union node.
                    for node in level_nodes:
                        if node in node_children[parent]:
                            node_children[parent].remove(node)
                    node_children[parent].append(u_name)

            node_words[u_name] = _merge_words_round_robin(
                [node_words[node] for node in level_nodes]
            )

In [11]:
# Accumulates the lines of the HTML list currently being built.
html = []


def html_dfs(node):
    """Append the nested ``<li>``/``<ul>`` markup for ``node`` and its subtree.

    The item text is ``"<node id>: <space-joined words>"``. Leaves produce a
    single ``<li>...</li>`` line; inner nodes open ``<li>...<ul>``, recurse
    into each child in order, then close with ``</ul></li>``. Lines are
    appended to the module-level ``html`` list.
    """
    label = node + ': ' + ' '.join(node_words[node])
    children = node_children[node]

    if not children:
        html.append('<li>' + label + '</li>')
        return

    html.append('<li>' + label + '<ul>')
    for child in children:
        html_dfs(child)
    html.append('</ul></li>')

def create_topics_html_lists(name):
    """Write every topic tree as nested HTML list items to ``graphs/html/<name>.html``.

    A tree is rendered (via ``html_dfs``) for each node whose parent list is
    exactly one entry equal to ``'ROOT'``. After writing, the module-level
    ``html`` buffer is reset and the number of trees is printed.
    """
    global html
    tree_count = 0
    for node, parents in node_parent.items():
        is_top_level = len(parents) == 1 and parents[0] == 'ROOT'
        if is_top_level:
            tree_count += 1
            html_dfs(node)

    out_path = 'graphs/html/' + name + '.html'
    with open(out_path, 'w') as f:
        f.write('\n'.join(html))

    html = []
    print(tree_count, "trees created")

In [12]:
def create_topics_d3(name, expanded):
    """Export the topic graph as JSON to ``graphs/<name>``.

    Args:
        name: Output path relative to the ``graphs/`` directory, including
            the file extension.
        expanded: Stored as each node's ``"expanded"`` flag. When truthy,
            every node is also flagged ``"isRoot"`` and one link per
            parent -> child edge is emitted.

    The JSON object has ``"nodes"`` (node id -> record with ``id``, ``name``,
    ``group``, ``isRoot``, ``children``, ``expanded``) and ``"links"`` (list of
    ``{"id": "<parent>-<child>", "source", "target"}``).
    """
    graph = {"nodes": {}, "links": []}
    for node in nodes:
        parents = node_parent[node]
        is_root = len(parents) == 1 and parents[0] == 'ROOT'
        children = node_children[node]
        graph["nodes"][node] = {
            "id": node,
            "name": ' '.join(node_words[node]),
            "group": int(node_group[node]),
            "isRoot": expanded or is_root,
            "children": children,
            "expanded": expanded,
        }
        if expanded:
            # Link only to this node's own children; iterating the whole
            # node_children dict would connect every node to every node.
            for child in children:
                graph["links"].append(
                    {"id": node + '-' + child, "source": node, "target": child}
                )
    with open("graphs/" + name, "w") as fp:
        json.dump(graph, fp, indent=2)

In [18]:
def create_topics_gephi(name):
    """Write the topic graph as a directed GML file at ``graphs/gephi/<name>.gml``.

    Each distinct value of ``node_group`` gets a hex fill colour. Colours come
    in rounds of seven channel combinations built from the current (r, g, b)
    intensities; after each round every intensity is advanced by its starting
    value modulo 256. Each entry of ``nodes`` becomes a GML node (id = node
    index, label = space-joined words) and each parent/child pair in
    ``node_children`` becomes an edge.
    """
    palette = {}
    red, green, blue = 80, 70, 60
    red_step, green_step, blue_step = 80, 70, 60
    slot = 0  # position inside the current round of seven combinations
    for group in node_group.values():
        if group in palette:
            continue
        red_hex = '{:02x}'.format(red) if slot < 4 else '00'
        green_hex = '{:02x}'.format(green) if slot % 2 == 0 else '00'
        blue_hex = '{:02x}'.format(blue) if 1 < slot < 6 else '00'
        palette[group] = red_hex + green_hex + blue_hex
        slot += 1
        if slot == 7:
            red = (red + red_step) % 256
            green = (green + green_step) % 256
            blue = (blue + blue_step) % 256
            slot = 0

    lines = ['graph [ directed 1']
    for node, node_index in nodes.items():
        label = ' '.join(node_words[node])
        lines.extend([
            'node [',
            'id ' + str(node_index),
            'label "' + label + '"',
            'graphics [fill "#' + palette[node_group[node]] + '"]]',
        ])
    for parent, children in node_children.items():
        for child in children:
            lines.extend([
                'edge [',
                'source ' + str(nodes[parent]),
                'target ' + str(nodes[child]) + ' ]',
            ])
    lines.append(']')

    with open('graphs/gephi/' + name + '.gml', 'w') as f:
        f.write('\n'.join(lines))

In [20]:
# Module-level tables shared by every function above. They are (re)created
# here, so this cell must run before any loader or exporter is called.
nodes, words_dic, node_words = {}, {}, {}
node_parent, node_level, node_children = {}, {}, {}
id2node, node_group, data_files, assignments, tf_idf = {}, {}, {}, {}, {}
# Populate the tables from the files under profiles/.
load_nodes()
load_assignments()
load_data_files()
load_words()
# Build the index -> node id lookup that process_clusters relies on.
get_ids2nodes()
# Disabled step: writes the node/file and node/word matrices under views/profiles/.
# get_views_by_node()
# Regroup the trees using a saved cluster assignment, then export for Gephi.
process_clusters(read_results('clusters/profiles_70_0.1_1000.0_0.01.npy'))
create_topics_gephi('silla_vacia')
# Disabled alternative exports.
# create_topics_html_lists('fusion_70')
# NOTE(review): create_topics_d3 already prepends "graphs/" to its name
# argument, so the call below would target graphs/graphs/d3/... - confirm the
# intended path before re-enabling it.
# create_topics_d3("graphs/d3/graph_expanded.json", False)