In [1]:
import csv
import json
from collections import deque

import numpy as np

In [2]:
# Category prefixes. Each one is used both as a sub-directory name when the
# loaders build file paths (name + prefix + '/...') and as a string prepended
# to node ids so that ids coming from different categories do not collide.
prefixes = ['per', 'verb', 'org', 'loc', 'adj']

In [3]:
def load_nodes(name):
    """Read every category's topic tree and register its nodes.

    For each prefix, ``name + prefix + '/topicTree.nodes.json'`` is parsed and
    walked breadth-first. Every visited node is recorded (under the key
    ``prefix + id``) in the module-level dicts ``nodes``, ``node_words``,
    ``node_level``, ``node_children`` and ``node_parent``. Top-level entries
    get the parent list ``['ROOT']``.

    Prints the number of trees and the total number of topics.
    """
    tree_count = 0
    for prefix in prefixes:
        with open(name + prefix + '/topicTree.nodes.json', 'r') as read_file:
            top_level = json.load(read_file)

        # Seed the breadth-first queue with the roots of this category.
        pending = deque()
        for root in top_level:
            node_parent[prefix + root['id']] = ['ROOT']
            pending.append(root)
            tree_count += 1

        while pending:
            current = pending.popleft()
            key = prefix + current['id']
            nodes[key] = len(nodes)  # next free row index
            node_words[key] = current['text'].split(' ')
            node_level[key] = current['data']['level']
            child_keys = []
            node_children[key] = child_keys
            for child in current['children']:
                child_key = prefix + child['id']
                node_parent[child_key] = [key]
                child_keys.append(child_key)
                pending.append(child)
    print(tree_count, ' trees found')
    print(len(nodes), ' topics found')

In [4]:
def load_assignments(name):
    """Load the per-topic document assignments of every category.

    Reads ``name + prefix + '/myAssignment.topics.json'`` for each prefix and
    stores, in the module-level ``assignments`` dict, one mapping per topic:
    ``assignments[prefix + topic][int(doc_id)] = value`` where ``doc_id`` and
    ``value`` are the first two items of each entry in the topic's ``'doc'``
    list.
    """
    for prefix in prefixes:
        with open(name + prefix + '/myAssignment.topics.json', 'r') as read_file:
            topic_entries = json.load(read_file)
        for entry in topic_entries:
            key = prefix + entry['topic']
            per_document = {}
            assignments[key] = per_document  # replaces any earlier mapping
            for pair in entry['doc']:
                per_document[int(pair[0])] = pair[1]

In [5]:
def load_data_files(name):
    """Fill ``data_files`` with the non-empty lines of ``myData.files.txt``.

    The file is read from ``name + 'myData.files.txt'``. Non-empty lines are
    numbered consecutively from 0 and stored as ``data_files[index] = line``.
    """
    with open(name + 'myData.files.txt', 'r') as read_file:
        lines = read_file.read().split('\n')
    kept = (line for line in lines if len(line) != 0)
    for index, line in enumerate(kept):
        data_files[index] = line

In [6]:
def load_words(name):
    """Build the vocabulary index and the per-word tf-idf table.

    Reads ``name + prefix + '/myData.dict.csv'`` for each prefix. Rows whose
    fourth column is the literal ``'tfidf'`` (the header) are skipped. The
    first column is the word and the fourth its tf-idf value.

    A word seen for the first time gets the next index in ``words_dic``.
    When a word shows up again its values are summed and, once all files are
    read, divided by the number of occurrences, so ``tf_idf`` holds the mean.

    Prints how many words were seen more than once.
    """
    occurrences = {}
    for prefix in prefixes:
        with open(name + prefix + '/myData.dict.csv', 'r') as read_file:
            for row in csv.reader(read_file):
                if row[3] == 'tfidf':
                    continue
                word = row[0]
                if word in words_dic:
                    # Second sighting counts as 2, later ones add 1 each.
                    occurrences[word] = occurrences.get(word, 1) + 1
                    tf_idf[word] += float(row[3])
                else:
                    words_dic[word] = len(words_dic)
                    tf_idf[word] = float(row[3])
    for word, count in occurrences.items():
        tf_idf[word] /= count
    print(len(occurrences), 'repeated words')

In [7]:
def load_sparse(name):
    """Load the document-to-words listing into ``sparse``.

    Every key of ``data_files`` first gets an empty list in ``sparse``. Then
    each non-empty line of ``name + '/myData.sparse.txt'`` is split on
    ``", "``; the first field (converted to ``int``) selects the document
    and the second field is appended to that document's list.
    """
    with open(name + '/myData.sparse.txt', 'r') as read_file:
        rows = read_file.read().split('\n')
    for doc_id in data_files:
        sparse[doc_id] = []
    for row in rows:
        if not row:
            continue
        fields = row.split(", ")
        sparse[int(fields[0])].append(fields[1])

In [8]:
def read_java_bayes(name):
    """Extract conditional probabilities from each category's ``myModel.bif``.

    ``words_prop`` is reset to an empty dict for every known node. Then, in
    ``name + prefix + '/myModel.bif'``, every line that starts with
    ``'probability ( '`` and contains ``'|'`` is parsed by fixed offsets into
    a left-hand name and a right-hand name; the value is the second
    space-separated token of the following line.

    * Left-hand names starting with ``'Z'`` go to
      ``node_prob[prefix + right][prefix + left]``.
    * Other left-hand names go to ``words_prop[prefix + right][left]``,
      but only when they are present in ``words_dic``.
    """
    for node in nodes:
        words_prop[node] = {}
    for prefix in prefixes:
        with open(name + prefix + '/myModel.bif', 'r') as bif:
            for line in bif:
                if '|' not in line or not line.startswith('probability ( '):
                    continue
                # Offsets skip the quote characters and spaces around '|' and ')'.
                left = line[line.index('"') + 1: line.index('|') - 2]
                right = prefix + line[line.index('|') + 3: line.index(')') - 2]
                # The number lives on the line right after the header.
                prob = float(bif.readline().split(' ')[1])
                if left.startswith('Z'):
                    node_prob.setdefault(right, {})[prefix + left] = prob
                elif left in words_dic:
                    words_prop[right][left] = prob

In [9]:
def prob_dfs(node):
    """Propagate word probabilities from the subtree of ``node`` up to it.

    Children are processed depth-first. For every word a child ends up with,
    ``words_prop[node][word]`` is set to
    ``node_prob[node][child] * words_prop[child][word]`` (assigned, not
    accumulated, so a later child overwrites an earlier one for the same word).

    Returns the ``words_prop[node]`` dict.
    """
    for child in node_children[node]:
        for word, child_value in prob_dfs(child).items():
            words_prop[node][word] = node_prob[node][child] * child_value
    return words_prop[node]

def get_words_prop(name):
    """Load the model probabilities, then propagate them through each tree.

    Calls ``read_java_bayes(name)`` and afterwards runs ``prob_dfs`` on every
    node whose parent list is exactly one entry equal to ``'ROOT'``.
    """
    read_java_bayes(name)
    for node in nodes:
        parents = node_parent[node]
        if len(parents) != 1 or parents[0] != 'ROOT':
            continue
        prob_dfs(node)

In [10]:
def save_np(folder, matrix, view_id):
    """Min-max scale ``matrix`` to [-1, 1] in place and save it as ``.npy``.

    Parameters
    ----------
    folder : str
        Path prefix; the file is written to ``folder + view_id + '.npy'``.
    matrix : numpy.ndarray
        Array to scale. It is modified in place (no copy is made), so the
        caller's array holds the scaled values afterwards.
    view_id : str
        Name of the view, used as the file stem.

    Notes
    -----
    When every entry has the same value the range is zero. Dividing by it
    would fill the saved file with NaN, so the range is treated as 1 in that
    case and the whole matrix maps to -1 (the lower bound).
    """
    low = matrix.min()
    span = matrix.max() - low
    if span == 0:
        # Constant matrix: avoid 0/0 -> NaN.
        span = 1
    matrix -= low
    matrix /= span
    matrix *= 2
    matrix -= 1
    with open(folder + view_id + ".npy", 'wb') as f:
        np.save(f, matrix)

In [11]:
def get_views(name):
    """Build three topic-by-feature matrices and hand them to ``save_np``.

    Each matrix has one row per entry of ``nodes`` (row index = the value
    stored in ``nodes``):

    * ``files`` - columns are document ids; cell = ``assignments[node][doc]``.
    * ``bayes`` - columns are word indices; cell = ``words_prop[node][word]``.
    * ``tfidf`` - columns are word indices; for every assigned document and
      every word listed for it in ``sparse`` that has a tf-idf value, the cell
      is set to ``tf_idf[word] * assignments[node][doc]`` (assigned, so the
      last document containing the word wins).

    The matrices are saved under the path prefix ``'views/' + name``.
    """
    topic_count = len(nodes)
    vocab_size = len(words_dic)
    view_files = np.zeros((topic_count, len(data_files)))
    view_bayes = np.zeros((topic_count, vocab_size))
    view_tfidf = np.zeros((topic_count, vocab_size))

    for node, row in nodes.items():
        doc_weights = assignments[node]
        for doc, weight in doc_weights.items():
            view_files[row, doc] = weight
        for word, prob in words_prop[node].items():
            view_bayes[row, words_dic[word]] = prob
        for doc, weight in doc_weights.items():
            for word in sparse[doc]:
                if word in tf_idf:
                    view_tfidf[row, words_dic[word]] = tf_idf[word] * weight

    target = 'views/' + name
    for matrix, view_id in ((view_files, 'files'),
                            (view_bayes, 'bayes'),
                            (view_tfidf, 'tfidf')):
        save_np(target, matrix, view_id)


In [12]:
def get_ids2nodes():
    """Fill ``id2node`` with the inverse of ``nodes`` (index -> node name)."""
    id2node.update((index, name) for name, index in nodes.items())

def read_results(name):
    """Return the array stored in the NumPy file at path ``name``."""
    with open(name, 'rb') as handle:
        loaded = np.load(handle)
    return loaded

In [13]:
def process_clusters(clusters):
    """Fuse same-level topics that share a cluster under new union nodes.

    ``clusters`` is iterated once; the position of each label is used as a
    node index and resolved to a node name through ``id2node``.
    # NOTE(review): this presumably expects one label per entry of ``nodes``,
    # in row order - confirm against the code that produced the labels.

    For every cluster, its nodes are bucketed by ``node_level``. A bucket
    with at least two nodes gets a new node named ``'U<n>'`` unless all of
    its nodes already share one non-ROOT first parent. The new node is
    registered in ``nodes``, ``node_group``, ``node_parent``,
    ``node_children`` and ``node_words``; the bucket's nodes are re-parented
    to it and detached from their former non-ROOT parents.

    Side effects: mutates the module-level dicts listed above and prints the
    number of trees and nodes after fusion. Returns ``None``.
    """
    # Step 1: label -> list of positions carrying that label.
    d = {}
    cnt = 0
    for i in clusters:
        if i in d:
            d[i].append(cnt)
        else:
            d[i] = [cnt]
        cnt += 1

    # From here on ``cnt`` numbers the union nodes ('U0', 'U1', ...).
    cnt = 0
    for c_id in d:
        # Step 2: tag each member with its cluster and bucket members by level.
        groups = {}
        for i in d[c_id]:
            name = id2node[i]
            level = node_level[name]
            node_group[name] = c_id
            if level in groups:
                groups[level].append(name)
            else:
                groups[level] = []
                groups[level].append(name)

        for level in groups:
            # A single node has nothing to be fused with.
            if len(groups[level]) <= 1:
                continue
            # Distinct first parents of the bucket's nodes.
            s = set()
            for node in groups[level]:
                s.add(node_parent[node][0])
            # All nodes are already siblings under one real parent: leave as is.
            # (Several roots sharing 'ROOT' are still fused.)
            if len(s) == 1 and list(s)[0] != 'ROOT':
                continue
            # Step 3: create the union node and give it the next row index.
            # NOTE(review): no ``node_level`` / ``id2node`` entry is created
            # for the union node here.
            u_name = 'U' + str(cnt)
            node_parent[u_name] = []
            node_children[u_name] = []
            cnt += 1
            nodes[u_name] = len(nodes)
            node_group[u_name] = c_id
            # Members now hang under the union node only.
            for node in groups[level]:
                node_parent[node] = [u_name]
                node_children[u_name].append(node)
            # The union node inherits every former parent (order follows set
            # iteration); real parents swap the members for the union node.
            for parent in s:
                node_parent[u_name].append(parent)
                if parent != 'ROOT':
                    for node in groups[level]:
                        if node in node_children[parent]:
                            node_children[parent].remove(node)
                    node_children[parent].append(u_name)

            # Step 4: label the union node with up to 7 words, taking the
            # i-th word of each member in turn (round-robin by position).
            # ``s`` is reused as the set of words already seen, including the
            # '-'-separated parts of every word examined so far.
            s = set()
            node_words[u_name] = []
            i = 0
            while True:
                no_more = True
                for node in groups[level]:
                    if len(node_words[node]) > i:
                        no_more = False
                        if node_words[node][i] not in s:
                            s.add(node_words[node][i])
                            node_words[u_name].append(node_words[node][i])
                        for aux in node_words[node][i].split('-'):
                            s.add(aux)
                        if len(node_words[u_name]) >= 7:
                            break
                # Stop at 7 words or when no member has an i-th word left.
                if len(node_words[u_name]) >= 7 or no_more:
                    break
                i += 1
    # Step 5: a tree root is a node whose parent list is exactly ['ROOT'].
    roots = 0
    for node in nodes:
        if len(node_parent[node]) == 1 and node_parent[node][0] == 'ROOT':
            roots += 1
    print(roots, ' trees after fusion')
    print(len(nodes), ' nodes after fusion')

In [14]:
def export_nodes_json(name):
    """Write a flat JSON list describing every node to the file ``name``.

    Each element has the keys ``'id'`` (node name), ``'text'`` (the node's
    words joined by single spaces) and ``'children'`` (always an empty list).
    """
    graph = [
        {'id': node, 'text': ' '.join(node_words[node]), 'children': []}
        for node in nodes
    ]
    with open(name, 'w') as out_file:
        json.dump(graph, out_file, indent=2)

In [15]:
def create_topics_d3(name, expanded):
    """Export the topic hierarchy as a nodes/links JSON file for D3.

    Parameters
    ----------
    name : str
        File stem; output goes to ``'graphs/d3/' + name + '.json'`` (the
        directory must already exist).
    expanded : bool
        When true, every node is flagged ``isRoot``/``expanded`` and one link
        per parent-child pair is emitted. When false, no links are written
        and only real roots (parent list exactly ``['ROOT']``) are flagged.

    Each node entry carries ``id``, ``name`` (a label made of the category
    prefix - or the full name for ``'U...'`` union nodes - followed by the
    node's words with ``'zzz'`` replaced by ``'_'``), ``group``, ``isRoot``,
    ``children`` and ``expanded``.
    """
    graph = {"nodes": {}, "links": []}
    for node in nodes:
        # Union nodes keep their whole name; others keep the text before 'Z'.
        label = node if node.startswith('U') else node[:node.index('Z')]
        label = label + ': ' + ' '.join(node_words[node]).replace('zzz', '_')
        is_root = len(node_parent[node]) == 1 and node_parent[node][0] == 'ROOT'
        graph["nodes"][node] = {
            "id": node,
            "name": label,
            "group": int(node_group[node]),
            "isRoot": expanded or is_root,
            "children": node_children[node],
            "expanded": expanded,
        }
        if expanded:
            # Link only to this node's own children. Iterating the whole
            # ``node_children`` dict would connect every node to every node.
            for child in node_children[node]:
                graph["links"].append(
                    {'id': node + '-' + child, "source": node, "target": child})
    with open("graphs/d3/" + name + '.json', "w") as fp:
        json.dump(graph, fp, indent=2)

In [16]:
def create_topics_gephi(name):
    """Write the topic hierarchy as a directed GML graph for Gephi.

    Output goes to ``'graphs/gephi/' + name + '.gml'``. Every node gets a
    numeric id (its value in ``nodes``), a label (category prefix - or the
    full name for ``'U...'`` union nodes - plus its words, with ``'zzz'``
    replaced by ``'_'``) and a fill colour chosen per group. One edge is
    written for each parent-child pair in ``node_children``.

    If ``node_group`` is empty, every node is assigned group 0 (this mutates
    ``node_group``) and drawn in black. Otherwise colours are generated in
    order of first appearance of each group: a seven-step pattern switches
    the red/green/blue channels on and off, and after each full pattern the
    channel intensities are advanced (modulo 256).
    """
    palette = {}
    red, green, blue = 80, 70, 60
    red_step, green_step, blue_step = 80, 70, 60
    slot = 0
    if len(node_group) == 0:
        for key in nodes:
            node_group[key] = 0
        palette[0] = '000000'
    else:
        for group in node_group.values():
            if group in palette:
                continue
            # Which channels are lit depends on the position in the pattern.
            red_hex = '{:02x}'.format(red) if slot < 4 else '00'
            green_hex = '{:02x}'.format(green) if slot % 2 == 0 else '00'
            blue_hex = '{:02x}'.format(blue) if 1 < slot < 6 else '00'
            slot += 1
            if slot == 7:
                # Pattern exhausted: shift intensities and start over.
                red = (red + red_step) % 256
                green = (green + green_step) % 256
                blue = (blue + blue_step) % 256
                slot = 0
            palette[group] = red_hex + green_hex + blue_hex

    lines = ['graph [ directed 1']
    for node in nodes:
        label = node if node.startswith('U') else node[:node.index('Z')]
        label = label + ': ' + ' '.join(node_words[node]).replace('zzz', '_')
        lines.extend([
            'node [',
            'id ' + str(nodes[node]),
            'label "' + label + '"',
            'graphics [fill "#' + palette[node_group[node]] + '"]]',
        ])
    for parent, children in node_children.items():
        for child in children:
            lines.extend([
                'edge [',
                'source ' + str(nodes[parent]),
                'target ' + str(nodes[child]) + ' ]',
            ])
    lines.append(']')
    with open('graphs/gephi/' + name + '.gml', 'w') as out_file:
        out_file.write('\n'.join(lines))

In [22]:
# Shared state: the functions defined above read and fill these module-level dicts.
nodes = {}
words_dic = {}
node_words = {}
node_parent = {}
node_level = {}
node_children = {}
id2node = {}
node_group = {}
data_files = {}
assignments = {}
tf_idf = {}
node_prob = {}
words_prop = {}
sparse = {}

# Load the topic model artefacts of the 'profiles' data set.
load_nodes('profiles/')
load_assignments('profiles/')
load_data_files('profiles/adj/')
load_words('profiles/')
get_words_prop('profiles/')
load_sparse('profiles/')
# Disabled step: export the view matrices.
# get_views('profiles/')

# Fuse topics according to the stored clustering and export the result.
get_ids2nodes()
process_clusters(read_results('clusters/profiles/150_means.npy'))
export_nodes_json('profiles/evaluation/150_means.nodes.json')
create_topics_gephi('profiles_150_means')
# Disabled step: D3 export.
# create_topics_d3("profiles", False)

76  trees found
1236  topics found
241 repeated words
42  trees after fusion
1374  nodes after fusion
