In [1]:
import json
import pandas
import numpy as np
from gensim import models

In [None]:
# Location of the serialized Doc2Vec model, relative to the notebook's working directory.
MODEL_PATH = '../../d2w_model/model.bin'
# NOTE(review): gensim persistence is pickle-based, so only load model files you trust.
model = models.Doc2Vec.load(MODEL_PATH)

In [None]:
def compare_sentences(sentence_a, sentence_b, iters=50):
    """Estimate how similar two sentences are using the global Doc2Vec ``model``.

    Each sentence is split on whitespace and fed to ``model.infer_vector``; the
    cosine similarity of the two inferred vectors is computed ``iters`` times
    and the mean is returned. The repetition presumably smooths out run-to-run
    variation of ``infer_vector`` (TODO confirm for the installed gensim).

    Args:
        sentence_a: First sentence, a whitespace-separated string.
        sentence_b: Second sentence, a whitespace-separated string.
        iters: Number of inference rounds to average over. Must be at least 1
            when the two sentences differ.

    Returns:
        ``1.0`` when both strings are equal (the model is not consulted),
        otherwise the mean cosine similarity over ``iters`` rounds.

    Raises:
        ValueError: If the sentences differ and ``iters`` is smaller than 1.
    """
    # Identical strings short-circuit so the diagonal of any similarity matrix is exactly 1.0.
    if sentence_a == sentence_b:
        return 1.0
    if iters < 1:
        raise ValueError('iters must be a positive integer, got %r' % (iters,))

    # Tokenise once; the token lists do not change between rounds.
    tokens_a = sentence_a.split()
    tokens_b = sentence_b.split()

    total = 0
    for _ in range(iters):
        # NOTE(review): gensim 4.x names this keyword `epochs` instead of `steps`;
        # confirm against the installed version before upgrading.
        a = model.infer_vector(tokens_a, steps=10)
        b = model.infer_vector(tokens_b, steps=10)
        total += np.dot(a, b)/np.linalg.norm(a)/np.linalg.norm(b)
    return total/iters

In [None]:
def take_second(elem):
    """Sort-key helper: return the item stored at index 1 of ``elem``."""
    second = elem[1]
    return second

In [None]:
# Candidate entities as mutable [phrase, score] pairs. What the score measures
# is not shown in this notebook; the cells below only rank by it.
entities = [
    [phrase, score]
    for phrase, score in (
        ('the united states of america', 98.8),
        ('washington', 96.6),
        ('the united states', 98.6),
        ('five', 82.5),
        ('north america', 100.0),
        ('canada', 96.0),
        ('mexico', 89.5),
        ('alaska', 100.0),
        ('hawaii', 100.0),
        ('the mid - pacific', 93.2),
        ('the pacific ocean', 97.5),
        ('the caribbean sea', 100.0),
    )
]

In [None]:
# Order the entities from highest to lowest score, in place; entries with equal
# scores keep their original relative order.
entities.sort(key=lambda pair: pair[1], reverse=True)
print(entities)

In [None]:
# Number of top-ranked entities to keep; it is also the side length of the
# similarity matrix. Clamped to the number of available entities so that the
# later range(matrix_shape) loops cannot index past the end of entities_popular.
matrix_shape = min(10, len(entities))
entities_popular = entities[:matrix_shape]
print(entities_popular)

In [None]:
# Pairwise similarity of the selected entities. Only the diagonal and the upper
# triangle (col >= row) are filled; every cell below the diagonal stays 0.
popular_names = [entry[0] for entry in entities_popular]
relationship_matrix = np.zeros((matrix_shape, matrix_shape), dtype=float)

for row in range(matrix_shape):
    for col in range(row, matrix_shape):
        relationship_matrix[row, col] = compare_sentences(popular_names[row], popular_names[col])

In [None]:
# Captions for the similarity table: the phrase of each selected entity, in rank order.
labels = [entry[0] for entry in entities_popular]

In [None]:
# Wrap the similarity matrix in a DataFrame labelled by entity phrase on both axes.
df = pandas.DataFrame(relationship_matrix, index=labels, columns=labels)

In [None]:
# Show the labelled similarity table as this cell's output.
df

In [None]:
# Spot checks: one label-based lookup, one position-based lookup, and two axis captions.
print(df.at['north america', 'alaska'])
print(df.iat[3, 4])
print(df.index[3])
print(df.columns[9])

In [None]:
def check_max_repeat(ent):
    """Return every entry whose score equals the score of the first entry.

    The whole sequence is scanned, so matching entries are returned in their
    original order even when they are not adjacent. The result is "all entries
    tied for the maximum" only if ``ent`` is already sorted by descending score;
    the function itself does not sort or verify that.

    Args:
        ent: Sequence of ``[label, score]`` pairs (anything indexable at ``[1]``).

    Returns:
        A new list holding the entries of ``ent`` whose ``[1]`` item equals
        ``ent[0][1]``; an empty list when ``ent`` is empty.
    """
    # An empty input has no reference score; return early instead of raising IndexError.
    if len(ent) == 0:
        return []
    top_score = ent[0][1]
    return [entry for entry in ent if entry[1] == top_score]

In [None]:
def check_max_score(ent_max, matrix, matrix_shape):
    """Pick, among the first ``len(ent_max)`` matrix positions, the best connected one.

    For each candidate position the first ``matrix_shape`` cells of its row and
    of its column are summed, 1 is subtracted from each sum (this cancels the
    diagonal cell, assuming it holds 1), and the two totals are added.

    Args:
        ent_max: Candidate entries; only its length is used.
        matrix: 2-D array indexable as ``matrix[r, c]``.
        matrix_shape: Number of rows/columns to include in each sum.

    Returns:
        The position (0-based) of the candidate with the highest combined
        total; the first one wins on ties.
    """
    scores = []
    for candidate in range(len(ent_max)):
        row_total = 0
        col_total = 0
        for k in range(matrix_shape):
            row_total += matrix[candidate, k]
            col_total += matrix[k, candidate]
        # Remove the candidate's own diagonal cell from both totals.
        scores.append((row_total - 1) + (col_total - 1))
    return scores.index(max(scores))

In [None]:
# Entries that share the score of the first selected entity; the root of the
# tree is chosen among these in the next cell.
entities_popular_max = check_max_repeat(entities_popular)
print(entities_popular_max)

In [None]:
# Choose the root: the tied top entry with the largest summed similarity.
parent_node_num = check_max_score(entities_popular_max, relationship_matrix, matrix_shape)
print('Index of parent node is %s' % (parent_node_num,))
# NOTE(review): the index is a position within entities_popular_max but is used
# to index labels; that lines up only while the tied entries are the leading
# rows of entities_popular - confirm.
tree = dict(text=labels[parent_node_num], id=0, nodes=[])

In [None]:
def find_n_nodes(parent_node_num, df, matrix_shape, n):
    """Return the labels of the ``n`` entries most strongly linked to a parent.

    Both the parent's row and the parent's column of ``df`` are scanned over
    the first ``matrix_shape`` positions. Cells equal to exactly 0 or exactly 1
    are ignored; every other cell contributes a ``[label, value]`` candidate.

    Args:
        parent_node_num: Positional index of the parent in ``df``.
        df: Square similarity table with the same labels on both axes.
        matrix_shape: Number of positions to scan along each axis.
        n: Maximum number of labels to return.

    Returns:
        Up to ``n`` labels ordered by descending cell value.
    """
    candidates = []
    for pos in range(matrix_shape):
        along_row = df.iloc[parent_node_num, pos]
        if not (along_row == 0 or along_row == 1):
            candidates.append([df.index[pos], along_row])
        along_col = df.iloc[pos, parent_node_num]
        if not (along_col == 0 or along_col == 1):
            candidates.append([df.columns[pos], along_col])
    ranked = sorted(candidates, key=take_second, reverse=True)
    return [label for label, _ in ranked[:n]]

In [None]:
# The 5 entities most similar to the root become its direct children.
second_level_nodes = find_n_nodes(parent_node_num, df, matrix_shape, 5)
print(second_level_nodes)
# Rebuild the child list from scratch rather than appending, so re-running this
# cell does not attach the same children twice. Ids start at 1 (the root is 0).
tree['nodes'] = [
    {'text': name, 'id': node_id, 'nodes': []}
    for node_id, name in enumerate(second_level_nodes, start=1)
]
print(tree)

In [None]:
def find_rest_nodes(second_level_nodes, df, matrix_shape, labels, tree, parent_node):
    """Attach every remaining label to its most similar second-level node.

    A label is "remaining" when it is neither ``parent_node`` nor listed in
    ``second_level_nodes``. For each such label its row and column in ``df``
    are scanned over the first ``matrix_shape`` positions; cells equal to
    exactly 0 or exactly 1 and cells belonging to labels outside
    ``second_level_nodes`` are ignored. The second-level label with the
    largest remaining cell value wins (the first one encountered on ties).

    Args:
        second_level_nodes: Labels already placed directly under the root.
        df: Square similarity table with the same labels on both axes.
        matrix_shape: Number of positions to scan along each axis.
        labels: All labels, in the same order as the axes of ``df``.
        tree: Unused; kept so existing callers keep working.
        parent_node: Label of the root node.

    Returns:
        A list of ``[second_level_label, label]`` pairs in ``labels`` order.
        Labels with no usable link to any second-level node are left out
        instead of raising IndexError.
    """
    result = []
    # enumerate gives each label its own position, which stays correct even if
    # a label occurs more than once (labels.index would always find the first).
    for node_num, node in enumerate(labels):
        if node in second_level_nodes or node == parent_node:
            continue
        best_label = None
        best_score = None
        for i in range(matrix_shape):
            # Look at the row cell first, then the column cell.
            for label, score in ((df.index[i], df.iloc[node_num, i]),
                                 (df.columns[i], df.iloc[i, node_num])):
                if score == 0 or score == 1 or label not in second_level_nodes:
                    continue
                # Strict comparison keeps the earliest candidate on ties.
                if best_score is None or score > best_score:
                    best_label, best_score = label, score
        if best_score is not None:
            result.append([best_label, node])
    return result

In [None]:
# Pair every entity that is neither the root nor a second-level node with a
# second-level label, as [second_level_label, entity].
third_level_nodes = find_rest_nodes(second_level_nodes, df, matrix_shape, labels, tree, labels[parent_node_num])
print(third_level_nodes)

In [None]:
# Start from empty child lists so that re-running this cell does not attach the
# same third-level nodes a second time.
for second_level in tree['nodes']:
    second_level['nodes'] = []

# Ids continue after the root (0) and the second-level nodes (1..len).
idx = len(second_level_nodes) + 1
for parent_text, child_text in third_level_nodes:
    for second_level in tree['nodes']:
        if second_level['text'] == parent_text:
            second_level['nodes'].append({'text': child_text, 'id': idx, 'nodes': []})
            idx += 1

In [None]:
# Pretty-print the finished tree as indented JSON, one line per output row.
output = json.dumps(tree, indent=2)
line_list = output.split("\n")
print(*line_list, sep="\n")