In [1]:
import json
import numpy as np
import csv

In [2]:
# Module-level lookup tables. The four dicts below are filled by load_nodes().
# trees: prefix -> list of top-level nodes parsed from that prefix's topicTree.nodes.json.
trees = {}
# root_nodes: (prefix + node id) -> running 0-based counter over all top-level nodes;
# set_assignments()/set_words() use it as the row index of the saved matrices.
root_nodes = {}
# root2id: (prefix + node id) -> the node's own id.
root2id = {}
# root2prefix: (prefix + node id) -> the prefix the node was loaded under.
root2prefix = {}
# Suffixes of the 'profiles/profiles_<prefix>' directories that the loaders read.
prefixes = ['propn', 'adj', 'loc', 'misc', 'noun', 'org', 'per', 'verb']

def load_nodes():
    """Read every prefix's topic tree and index its top-level nodes.

    For each entry of ``prefixes`` the file
    ``profiles/profiles_<prefix>/topicTree.nodes.json`` is parsed. The parsed
    list is stored in ``trees[prefix]``; each top-level node is registered
    under the key ``prefix + node['id']`` in ``root_nodes`` (a running counter
    starting at 0), ``root2id`` (the node id) and ``root2prefix`` (the prefix).
    """
    next_row = 0
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/topicTree.nodes.json'
        with open(path, 'r') as handle:
            top_level = json.load(handle)
        for entry in top_level:
            # Node ids are only unique within a prefix, so the key combines both.
            key = prefix + entry['id']
            root_nodes[key] = next_row
            root2id[key] = entry['id']
            root2prefix[key] = prefix
            next_row += 1
        trees[prefix] = top_level

load_nodes()

In [3]:
# assignments: prefix -> {topic -> {int(doc[0]) -> doc[1]}}; filled by load_assignments().
assignments = {}

def load_assignments():
    """Load the per-topic document assignments of every prefix.

    Parses ``profiles/profiles_<prefix>/myAssignment.topics.json`` for each
    entry of ``prefixes``. Every record contributes one mapping
    ``record['topic'] -> {int(pair[0]): pair[1]}`` built from the pairs in
    ``record['doc']``; the result is stored in ``assignments[prefix]``.
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myAssignment.topics.json'
        with open(path, 'r') as handle:
            records = json.load(handle)
        by_topic = {}
        for record in records:
            topic = record['topic']
            # The first element of each pair is coerced to int so it can be
            # used directly as a column index later on.
            by_topic[topic] = {int(pair[0]): pair[1] for pair in record['doc']}
        assignments[prefix] = by_topic

load_assignments()

In [4]:
def load_data_files(path='profiles/profiles_propn/myData.files.txt'):
    """Return a ``{line index: line text}`` dict for the file listing at ``path``.

    Args:
        path: Text file with one entry per line. Defaults to the listing under
            the ``propn`` profile, which is the file this notebook always read.

    Returns:
        dict mapping the 0-based line number to the text of that line.

    Lines are split on ``'\\n'`` only. Blank lines in the middle of the file
    are kept so that the indices of the following entries do not shift. The
    single empty string that ``split`` produces after a terminating newline is
    dropped; otherwise a phantom entry with an empty name would be counted and
    would add an extra, always-zero column to every file view.
    """
    with open(path, 'r') as read_file:
        names = read_file.read().split('\n')
    # split() always returns at least one element; a trailing '' only means
    # the file ended with a newline (or was empty), not that an entry exists.
    if names[-1] == '':
        names.pop()
    return dict(enumerate(names))

data_files = load_data_files()

In [5]:
# tf_idf: prefix -> {CSV column 0 -> CSV column 3, kept as the raw string}; filled by load_words().
tf_idf = {}

def load_words():
    """Load the per-prefix word table from ``myData.dict.csv``.

    For each entry of ``prefixes`` the file
    ``profiles/profiles_<prefix>/myData.dict.csv`` is read and
    ``tf_idf[prefix]`` is set to a dict mapping the first column of every row
    to its fourth column. Values are kept as the strings found in the file.

    Empty rows (blank lines) are skipped. Rows that are non-empty but have
    fewer than four columns still raise ``IndexError``, so a malformed file is
    not silently accepted.
    """
    for prefix in prefixes:
        path = 'profiles/profiles_' + prefix + '/myData.dict.csv'
        # newline='' lets the csv module do its own line-ending handling, as
        # its documentation requires; without it, newlines embedded in quoted
        # fields are not interpreted correctly.
        with open(path, 'r', newline='') as read_file:
            weights = {}
            for row in csv.reader(read_file):
                if not row:
                    # csv.reader yields [] for a blank line.
                    continue
                weights[row[0]] = row[3]
            tf_idf[prefix] = weights

load_words()

In [6]:
# words_dic: word -> 0-based index, assigned in the order get_words_dic() first sees the word.
words_dic = {}
# nodes_words: prefix -> {node id -> node['text'] split on single spaces}; filled by get_words_dic().
nodes_words = {}
from collections import deque

def get_words_dic():
    """Walk every topic tree and index the words found in node texts.

    All top-level nodes of all prefixes are pushed onto one stack, which is
    then processed last-in-first-out. For each visited node its ``'text'`` is
    split on single spaces; the word list is stored in
    ``nodes_words[prefix][node['id']]`` and every word not yet present in
    ``words_dic`` receives the next free index (starting at 0 on each call).
    """
    pending = []  # stack of (prefix, node) pairs
    for prefix in prefixes:
        nodes_words[prefix] = {}
        pending.extend((prefix, node) for node in trees[prefix])

    next_index = 0
    while pending:
        # Popping from the end fixes the visiting order, and therefore the
        # index each word gets.
        prefix, node = pending.pop()
        tokens = node['text'].split(' ')
        nodes_words[prefix][node['id']] = tokens
        for token in tokens:
            if token not in words_dic:
                words_dic[token] = next_index
                next_index += 1
        pending.extend((prefix, child) for child in node['children'])

get_words_dic()

In [7]:
def set_assignments(roots, view_id):
    """Build and save the root-by-file matrix for one view.

    Args:
        roots: dict mapping a root key (a key of ``root_nodes``) to the topic
            whose assignments should fill that root's row.
        view_id: string inserted into the output name
            ``views/files_<view_id>.npy``.

    The matrix has one row per entry of ``root_nodes`` and one column per
    entry of ``data_files``. Rows of roots not present in ``roots`` stay zero.
    """
    matrix = np.zeros((len(root_nodes), len(data_files)))
    for root, topic in roots.items():
        docs = assignments[root2prefix[root]][topic]
        for doc_index, weight in docs.items():
            matrix[root_nodes[root], doc_index] = weight
    with open("views/files_" + view_id + ".npy", 'wb') as out:
        np.save(out, matrix)

In [8]:
def set_words(roots, view_id):
    """Build and save the root-by-word matrix for one view.

    Args:
        roots: dict mapping a root key (a key of ``root_nodes``) to the id of
            the node whose words should fill that root's row.
        view_id: string inserted into the output name
            ``views/words_<view_id>.npy``.

    The matrix has one row per entry of ``root_nodes`` and one column per
    entry of ``words_dic``. For every word of the selected node, the cell is
    set to the value stored for that word in ``tf_idf`` under the root's
    prefix.
    """
    matrix = np.zeros((len(root_nodes), len(words_dic)))
    for root, node_id in roots.items():
        prefix = root2prefix[root]
        for word in nodes_words[prefix][node_id]:
            matrix[root_nodes[root], words_dic[word]] = tf_idf[prefix][word]
    with open("views/words_" + view_id + ".npy", 'wb') as out:
        np.save(out, matrix)


In [9]:
def get_views():
    """Write the views for the top-level nodes and for every deeper tree level.

    View ``'0_0'`` is written first from ``root2id``. After that, a frontier
    of child nodes is kept per root key (``prefix + node['id']``).
    ``view_by_level`` is called with the frontier and a level number starting
    at 1, and the frontier is then advanced to the children of its current
    nodes. The loop stops once advancing leaves every frontier empty, so level
    1 is always processed even when no top-level node has children.
    """
    set_assignments(root2id, '0_0')
    set_words(root2id, '0_0')

    frontier = {}
    for prefix in prefixes:
        for node in trees[prefix]:
            frontier[prefix + node['id']] = list(node['children'])

    depth = 0
    while True:
        depth += 1
        view_by_level(frontier, depth)
        # Only values are replaced (no keys added or removed), so updating the
        # dict while iterating over it is safe.
        for key in frontier:
            frontier[key] = [
                child
                for node in frontier[key]
                for child in node['children']
            ]
        if not any(frontier.values()):
            break

def view_by_level(queues, level):
    """Write one view per position within the current level's node lists.

    Args:
        queues: dict mapping each root key to the list of nodes it has at
            this level.
        level: level number, used as the first part of the view id.

    The length of the longest list is printed. For each position ``i`` below
    that length, the roots that have an ``i``-th node are collected as
    ``root -> node id`` and passed to ``set_assignments`` and ``set_words``
    with the view id ``'<level>_<i>'``.
    """
    widest = max((len(nodes) for nodes in queues.values()), default=0)
    print(widest)
    for position in range(widest):
        roots = {
            root: queues[root][position]['id']
            for root in root_nodes
            if position < len(queues[root])
        }
        view_id = str(level) + '_' + str(position)
        set_assignments(roots, view_id)
        set_words(roots, view_id)

get_views()

15


In [10]:
def decode_words_view(level, row):
    """Print the non-zero cells of a saved word view, grouped by root.

    Args:
        level: first part of the view id.
        row: second part of the view id.

    Loads ``views/words_<level>_<row>.npy``. For every key of ``root_nodes``
    it prints the key on its own line, followed by ``word value, `` for each
    word of ``words_dic`` whose cell in that root's row is not zero.
    """
    path = "views/words_" + str(level) + "_" + str(row) + ".npy"
    with open(path, 'rb') as source:
        view = np.load(source)
    for root, row_index in root_nodes.items():
        print('\n', root)
        for word, column in words_dic.items():
            value = view[row_index][column]
            if value != 0:
                print(word, value, end=', ')

In [11]:
def decode_files_view(level, row):
    """Print the non-zero cells of a saved file view, grouped by root.

    Args:
        level: first part of the view id.
        row: second part of the view id.

    Loads ``views/files_<level>_<row>.npy``. For every key of ``root_nodes``
    it prints the key on its own line, followed by ``index value, `` for each
    key of ``data_files`` whose cell in that root's row is not zero.
    """
    path = "views/files_" + str(level) + "_" + str(row) + ".npy"
    with open(path, 'rb') as source:
        view = np.load(source)
    for root, row_index in root_nodes.items():
        print('\n', root)
        for file_index in data_files:
            value = view[row_index][file_index]
            if value != 0:
                print(file_index, value, end=', ')