# Imports and settings

In [23]:
%load_ext autoreload
%autoreload 2

import os
import logging
import numpy as np
import pandas as pd

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations

# `display` and `HTML` are part of the public `IPython.display` API; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Widen the classic-notebook page container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))

logging.basicConfig(level=logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from pathlib import Path

In [None]:
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook --
# confirm this (slow) model load is still needed.
# NOTE(review): the 'en' shortcut name presumably relies on spaCy v2 shortcut links;
# spaCy v3 expects a full package name such as 'en_core_web_sm' -- verify against
# the installed version.
import spacy 
nlp = spacy.load('en')

In [None]:
# NOTE(review): this is the plain console progress bar; the widget-based variant
# for notebooks lives in `tqdm.auto` / `tqdm.notebook`.
from tqdm import tqdm
# Registers tqdm's pandas integration (`progress_apply` and friends).
tqdm.pandas()

# Load data and models

In [None]:
from aspects.pipelines.aspect_analysis import AspectAnalysis
from aspects.utilities import settings
from aspects.rst.edu_tree_rules_extractor import EDURelation

# Input dataset for this experiment, taken from the project settings module.
DATASET_PATH = settings.AMAZON_REVIEWS_CELL_PHONES_AND_ACCESSORIES_DATASET_JSON

# TODO: better parametrization for experiments
# Pipeline object for the 'gerani' experiment. In this notebook it is only used
# through `.paths` to locate the discourse-tree pickle and the embeddings file.
# NOTE(review): `jobs`, `batch_size` and `max_docs` are passed through unchanged;
# their exact semantics are defined by AspectAnalysis -- confirm there.
aspect_analysis_gerani = AspectAnalysis(
    input_path=DATASET_PATH.as_posix(),
    # output directory is derived from the dataset file name (without extension)
    output_path=settings.DEFAULT_OUTPUT_PATH / DATASET_PATH.stem,
    experiment_name='gerani',
    jobs=2,
    batch_size=100,
    max_docs=50000
)

In [None]:
# Load the pickled discourse-tree frame produced by the pipeline; the graph-building
# loop further down reads its `rules` and `aspects` columns.
# NOTE: unpickling can execute arbitrary code -- only load pickles created by this project.
discourse_tree_df = pd.read_pickle(aspect_analysis_gerani.paths.discourse_trees_df)

In [None]:
from typing import List, NamedTuple

class AspectsRelation(NamedTuple):
    """Typed record describing a labelled, weighted relation between two aspects.

    NOTE(review): nothing in the visible notebook instantiates this class; the
    graph-building loop stores plain ``(aspect, aspect)`` tuples instead.
    Presumably kept for a richer edge representation -- confirm or remove.
    """
    # first aspect of the pair
    aspect_1: str
    # second aspect of the pair
    aspect_2: str
    # label of the relation linking the two aspects
    relation_type: str
    # numeric weight attached to the relation
    weight: float

In [None]:
from itertools import product

# Aspect pairs collected from every rule of every discourse tree.
# Pairs of two different aspects go to `relations`; pairs of an aspect with
# itself are kept apart in `cycles`.
relations = []
cycles = []

tree_rows = tqdm(
    discourse_tree_df.iterrows(),
    total=len(discourse_tree_df),
    desc='Generating aspect-aspect graph based on rules',
)
for _, tree_row in tree_rows:
    # A rule unpacks to (left EDU key, right EDU key, relation, weight);
    # only the two EDU keys are used to look up aspects here.
    for left_edu, right_edu, _relation, _weight in tree_row.rules:
        for aspect_pair in product(tree_row.aspects[left_edu], tree_row.aspects[right_edu]):
            left_aspect, right_aspect = aspect_pair
            bucket = relations if left_aspect != right_aspect else cycles
            bucket.append(aspect_pair)

In [None]:
len(relations)

In [None]:
relations[:10]

In [None]:
# from aspects.enrichments import conceptnets

# conceptnet = conceptnets.load_conceptnet_io()
# #  'car' -> 'HasPrerequisite' -> 'tire'

# hierarchical_relations_parent_child = ['HasA', 'MadeOf', 'HasPrerequisite']
# hierarchical_relations_child_parent = ['PartOf', 'IsA']

# def get_neighbours_child_and_parents(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
#     neighbours_childs = set(
#         (concept, concept_info['relation'], "---->", concept_info['end'])
#         for concept_info
#         in conceptnet[concept]
#         if concept_info['relation'] in relation_types_get_child
#     )
#     neighbours_parents = set(
#         (concept_info['start'], concept_info['relation'],  "<----", concept)
        
#         for concept_info
#         in conceptnet[concept]
#         if concept_info['relation'] in relation_types_get_parent
#     )

#     return list(neighbours_childs.union(neighbours_parents))

# get_neighbours_child_and_parents(conceptnet, 'tire', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

In [None]:
# def get_concepts_based_on_hierarchy_relation(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
#     neighbours_childs = set(
#         (concept_info['end'].replace('_', ' '), concept)
#         for concept_info
#         in conceptnet[concept]
#         if concept_info['relation'] in relation_types_get_child
#     )
#     neighbours_parents = set(
#         (concept_info['start'].replace('_', ' '), concept)
#         for concept_info
#         in conceptnet[concept]
#         if concept_info['relation'] in relation_types_get_parent
#     )

#     return list(neighbours_childs.union(neighbours_parents))

In [None]:
# get_concepts_based_on_hierarchy_relation(conceptnet, 'battery', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

## Add concepts from conceptnet 

In [None]:
# from more_itertools import flatten

# unique_aspects = set(flatten(relations))

# conceptnet_relations = list(flatten([
#     get_concepts_based_on_hierarchy_relation(conceptnet, aspect, hierarchical_relations_child_parent, hierarchical_relations_parent_child)
#     for aspect
#     in unique_aspects 
# ]))

# len(conceptnet_relations)

# conceptnet_relations[:10]

# all_relations = relations + conceptnet_relations

# len(all_relations)

## Train or load pre-trained embedding

In [None]:
all_relations = relations

In [66]:
# Poincare embedding model over the aspect pairs: 50-dimensional vectors and
# no burn-in epochs (matches the "size 50 ... 0 burn-in epochs" line in the training log).
model = PoincareModel(train_data=all_relations, size=50, burn_in=0)

INFO:gensim.models.poincare:loading relations from train data..
INFO:gensim.models.poincare:loaded 22436 relations from train data, 7210 nodes


In [67]:
# Train for 100 epochs. With print_every=500 the captured log reports the loss
# once per 5000 examples.
model.train(epochs=100, print_every=500)

INFO:gensim.models.poincare:training model of size 50 with 1 workers on 22436 relations for 100 epochs and 0 burn-in epochs, using lr=0.10000 burn-in lr=0.01000 negative=10
INFO:gensim.models.poincare:starting training (100 epochs)----------------------------------------
INFO:gensim.models.poincare:training on epoch 1, examples #4990-#5000, loss: 24.05
INFO:gensim.models.poincare:time taken for 5000 examples: 1.64 s, 3045.27 examples / s
INFO:gensim.models.poincare:training on epoch 1, examples #9990-#10000, loss: 23.98
INFO:gensim.models.poincare:time taken for 5000 examples: 1.50 s, 3331.17 examples / s
INFO:gensim.models.poincare:training on epoch 1, examples #14990-#15000, loss: 23.90
INFO:gensim.models.poincare:time taken for 5000 examples: 1.50 s, 3322.34 examples / s
INFO:gensim.models.poincare:training on epoch 1, examples #19990-#20000, loss: 23.83
INFO:gensim.models.poincare:time taken for 5000 examples: 1.56 s, 3204.97 examples / s
INFO:gensim.models.poincare:training on epo

INFO:gensim.models.poincare:time taken for 5000 examples: 1.76 s, 2833.01 examples / s
INFO:gensim.models.poincare:training on epoch 12, examples #19990-#20000, loss: 17.64
INFO:gensim.models.poincare:time taken for 5000 examples: 1.63 s, 3070.17 examples / s
INFO:gensim.models.poincare:training on epoch 13, examples #4990-#5000, loss: 17.28
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3261.48 examples / s
INFO:gensim.models.poincare:training on epoch 13, examples #9990-#10000, loss: 17.35
INFO:gensim.models.poincare:time taken for 5000 examples: 1.68 s, 2968.06 examples / s
INFO:gensim.models.poincare:training on epoch 13, examples #14990-#15000, loss: 17.32
INFO:gensim.models.poincare:time taken for 5000 examples: 1.63 s, 3062.03 examples / s
INFO:gensim.models.poincare:training on epoch 13, examples #19990-#20000, loss: 17.38
INFO:gensim.models.poincare:time taken for 5000 examples: 1.59 s, 3138.46 examples / s
INFO:gensim.models.poincare:training on epoch 14, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.61 s, 3107.04 examples / s
INFO:gensim.models.poincare:training on epoch 24, examples #19990-#20000, loss: 15.39
INFO:gensim.models.poincare:time taken for 5000 examples: 1.58 s, 3165.94 examples / s
INFO:gensim.models.poincare:training on epoch 25, examples #4990-#5000, loss: 15.29
INFO:gensim.models.poincare:time taken for 5000 examples: 1.54 s, 3248.93 examples / s
INFO:gensim.models.poincare:training on epoch 25, examples #9990-#10000, loss: 15.44
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3269.18 examples / s
INFO:gensim.models.poincare:training on epoch 25, examples #14990-#15000, loss: 15.27
INFO:gensim.models.poincare:time taken for 5000 examples: 1.70 s, 2947.16 examples / s
INFO:gensim.models.poincare:training on epoch 25, examples #19990-#20000, loss: 15.27
INFO:gensim.models.poincare:time taken for 5000 examples: 1.56 s, 3203.22 examples / s
INFO:gensim.models.poincare:training on epoch 26, e

INFO:gensim.models.poincare:time taken for 5000 examples: 2.01 s, 2486.19 examples / s
INFO:gensim.models.poincare:training on epoch 36, examples #19990-#20000, loss: 14.73
INFO:gensim.models.poincare:time taken for 5000 examples: 1.72 s, 2913.26 examples / s
INFO:gensim.models.poincare:training on epoch 37, examples #4990-#5000, loss: 14.51
INFO:gensim.models.poincare:time taken for 5000 examples: 1.76 s, 2834.40 examples / s
INFO:gensim.models.poincare:training on epoch 37, examples #9990-#10000, loss: 14.73
INFO:gensim.models.poincare:time taken for 5000 examples: 1.73 s, 2892.83 examples / s
INFO:gensim.models.poincare:training on epoch 37, examples #14990-#15000, loss: 14.61
INFO:gensim.models.poincare:time taken for 5000 examples: 1.57 s, 3175.44 examples / s
INFO:gensim.models.poincare:training on epoch 37, examples #19990-#20000, loss: 14.66
INFO:gensim.models.poincare:time taken for 5000 examples: 1.60 s, 3129.36 examples / s
INFO:gensim.models.poincare:training on epoch 38, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.72 s, 2910.31 examples / s
INFO:gensim.models.poincare:training on epoch 48, examples #19990-#20000, loss: 14.34
INFO:gensim.models.poincare:time taken for 5000 examples: 1.63 s, 3062.19 examples / s
INFO:gensim.models.poincare:training on epoch 49, examples #4990-#5000, loss: 13.98
INFO:gensim.models.poincare:time taken for 5000 examples: 1.66 s, 3018.85 examples / s
INFO:gensim.models.poincare:training on epoch 49, examples #9990-#10000, loss: 14.24
INFO:gensim.models.poincare:time taken for 5000 examples: 1.65 s, 3023.15 examples / s
INFO:gensim.models.poincare:training on epoch 49, examples #14990-#15000, loss: 14.38
INFO:gensim.models.poincare:time taken for 5000 examples: 1.75 s, 2859.34 examples / s
INFO:gensim.models.poincare:training on epoch 49, examples #19990-#20000, loss: 14.26
INFO:gensim.models.poincare:time taken for 5000 examples: 1.71 s, 2922.33 examples / s
INFO:gensim.models.poincare:training on epoch 50, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.70 s, 2947.41 examples / s
INFO:gensim.models.poincare:training on epoch 60, examples #19990-#20000, loss: 14.00
INFO:gensim.models.poincare:time taken for 5000 examples: 1.62 s, 3092.01 examples / s
INFO:gensim.models.poincare:training on epoch 61, examples #4990-#5000, loss: 13.68
INFO:gensim.models.poincare:time taken for 5000 examples: 1.67 s, 2992.91 examples / s
INFO:gensim.models.poincare:training on epoch 61, examples #9990-#10000, loss: 13.93
INFO:gensim.models.poincare:time taken for 5000 examples: 1.86 s, 2686.43 examples / s
INFO:gensim.models.poincare:training on epoch 61, examples #14990-#15000, loss: 13.84
INFO:gensim.models.poincare:time taken for 5000 examples: 1.76 s, 2847.63 examples / s
INFO:gensim.models.poincare:training on epoch 61, examples #19990-#20000, loss: 14.20
INFO:gensim.models.poincare:time taken for 5000 examples: 1.59 s, 3141.96 examples / s
INFO:gensim.models.poincare:training on epoch 62, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.52 s, 3279.26 examples / s
INFO:gensim.models.poincare:training on epoch 72, examples #19990-#20000, loss: 13.97
INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3218.67 examples / s
INFO:gensim.models.poincare:training on epoch 73, examples #4990-#5000, loss: 13.77
INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3225.23 examples / s
INFO:gensim.models.poincare:training on epoch 73, examples #9990-#10000, loss: 13.81
INFO:gensim.models.poincare:time taken for 5000 examples: 1.54 s, 3248.96 examples / s
INFO:gensim.models.poincare:training on epoch 73, examples #14990-#15000, loss: 13.81
INFO:gensim.models.poincare:time taken for 5000 examples: 1.58 s, 3165.68 examples / s
INFO:gensim.models.poincare:training on epoch 73, examples #19990-#20000, loss: 13.80
INFO:gensim.models.poincare:time taken for 5000 examples: 1.50 s, 3342.79 examples / s
INFO:gensim.models.poincare:training on epoch 74, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3225.00 examples / s
INFO:gensim.models.poincare:training on epoch 84, examples #19990-#20000, loss: 13.77
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3260.14 examples / s
INFO:gensim.models.poincare:training on epoch 85, examples #4990-#5000, loss: 13.71
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3263.97 examples / s
INFO:gensim.models.poincare:training on epoch 85, examples #9990-#10000, loss: 13.64
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3267.85 examples / s
INFO:gensim.models.poincare:training on epoch 85, examples #14990-#15000, loss: 13.74
INFO:gensim.models.poincare:time taken for 5000 examples: 1.59 s, 3144.51 examples / s
INFO:gensim.models.poincare:training on epoch 85, examples #19990-#20000, loss: 13.64
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3270.36 examples / s
INFO:gensim.models.poincare:training on epoch 86, e

INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3226.46 examples / s
INFO:gensim.models.poincare:training on epoch 96, examples #19990-#20000, loss: 13.65
INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3235.31 examples / s
INFO:gensim.models.poincare:training on epoch 97, examples #4990-#5000, loss: 13.41
INFO:gensim.models.poincare:time taken for 5000 examples: 1.53 s, 3276.46 examples / s
INFO:gensim.models.poincare:training on epoch 97, examples #9990-#10000, loss: 13.40
INFO:gensim.models.poincare:time taken for 5000 examples: 1.56 s, 3212.76 examples / s
INFO:gensim.models.poincare:training on epoch 97, examples #14990-#15000, loss: 13.77
INFO:gensim.models.poincare:time taken for 5000 examples: 1.55 s, 3222.84 examples / s
INFO:gensim.models.poincare:training on epoch 97, examples #19990-#20000, loss: 13.87
INFO:gensim.models.poincare:time taken for 5000 examples: 1.56 s, 3198.90 examples / s
INFO:gensim.models.poincare:training on epoch 98, e

In [68]:
# File the trained model is saved to and loaded from: the project's embeddings
# path plus a suffix naming this run's configuration.
# NOTE(review): `with_suffix` replaces any suffix already present on the base path --
# confirm `aspects_poincare_embeddings` has none.
model_path = aspect_analysis_gerani.paths.aspects_poincare_embeddings.with_suffix('.without_cycles_size_50').as_posix()

In [None]:
# Saves the entire PoincareModel instance, the loaded model can be trained further
model.save(model_path)

In [29]:
model = PoincareModel.load(model_path)

INFO:gensim.utils:loading PoincareModel object from /home/laugustyniak/github/phd/sentiment-backend/aspects/results/reviews_Cell_Phones_and_Accessories-50000-docs/gerani/aspects_poincare_embeddings.without_cycles
INFO:gensim.utils:loading kv recursively from /home/laugustyniak/github/phd/sentiment-backend/aspects/results/reviews_Cell_Phones_and_Accessories-50000-docs/gerani/aspects_poincare_embeddings.without_cycles.kv.* with mmap=None
INFO:gensim.utils:setting ignored attribute _node_counts_cumsum to None
INFO:gensim.utils:setting ignored attribute _node_probabilities to None
INFO:gensim.utils:loaded /home/laugustyniak/github/phd/sentiment-backend/aspects/results/reviews_Cell_Phones_and_Accessories-50000-docs/gerani/aspects_poincare_embeddings.without_cycles


# Experiment with embeddings distances and other attributes

In [69]:
model.kv.distance('phone', 'battery')

1.489919649694419

In [70]:
model.kv.distance('battery', 'phone')

1.4899196496944187

In [71]:
model.kv.distance('battery', 'sound')

1.9501985741689873

In [72]:
model.kv.most_similar('battery')

[('samsung galaxay s2', 0.20379016345395914),
 ('ky', 0.20779270918596948),
 ('batterys', 0.20908684712468184),
 ('nokia 6085', 0.23167321185122372),
 ('mouse / keyboard', 0.23376832314210613),
 ('bluetooth wireless headphone', 0.2361699841695625),
 ('the nokia battery', 0.2398347598502745),
 ('v60s motorola', 0.24003675853699497),
 ('sure performs', 0.24409806685430396),
 ('thin battery', 0.24630286722724992)]

In [73]:
model.kv.closest_child('battery')

'samsung galaxay s2'

In [74]:
model.kv.closest_parent('battery')

'rocksteady xl'

In [75]:
model.kv.ancestors('phone')

['br50', 'built - in games', 'nokia 6010', '700w']

In [76]:
model.kv.descendants('phone')

['nokia 8260', '100', 'phone - sprint', 'negative 110', 't1/t3']

In [77]:
# Saves only the vectors from the PoincareModel instance, in the commonly used word2vec format
# model.kv.save_word2vec_format('aspect_rules_vectors')
# PoincareKeyedVectors.load_word2vec_format('aspect_rules_vectors')

In [78]:
# Rank of distance of node 2 from node 1 in relation to distances of all nodes from node 1
model.kv.rank('phone', 'battery')

655

In [79]:
# Closest child node
model.kv.closest_child('sound')

'cassette slot'

In [80]:
# Closest parent node
model.kv.closest_parent('bluetooth')

'the qwerty key650'

In [81]:
# # Position in hierarchy - lower values represent that the node is higher in the hierarchy
# print(model.kv.norm('virginia_deer.n.01'))
# print(model.kv.norm('sheep.n.01'))
# print(model.kv.norm('dog.n.01'))
# print(model.kv.norm('placental.n.01'))
# print(model.kv.norm('mammal.n.01'))

In [82]:
print(model.kv.difference_in_hierarchy('phone', 'battery'))

0.12124990535767222


In [83]:
print(model.kv.difference_in_hierarchy('battery', 'battery life'))

-0.13077939037375474


In [84]:
model.kv.difference_in_hierarchy('bluetooth', 'headset')

0.07872856870200895

In [85]:
model.kv.difference_in_hierarchy('charger', 'phone')

-0.28987449609222293

In [86]:
model.kv.difference_in_hierarchy('signal', 'phone')

-0.31579033459654904

In [87]:
# One possible descendant chain
model.kv.descendants('sound')

['cassette slot',
 'motorola mini wall charger',
 'central florida',
 'buzzing',
 'earbuds hook']

In [88]:
# One possible ancestor chain
model.kv.ancestors('sound')

['motorola hs820 bluetooth',
 'control',
 'jabra 250v',
 'boring startac',
 'motrola',
 'palm applications screen',
 '700w']

In [89]:
def check_hierarchy(model, parent: str, child: str) -> bool:
    """Tell whether the embedding places ``parent`` above ``child``.

    Args:
        model: object exposing ``kv.difference_in_hierarchy(node_1, node_2)``
            (here a trained ``PoincareModel``).
        parent: aspect expected to sit higher in the hierarchy.
        child: aspect expected to sit lower in the hierarchy.

    Returns:
        True when ``difference_in_hierarchy(parent, child)`` is strictly
        positive, False otherwise (including a difference of exactly zero).
    """
    hierarchy_gap = model.kv.difference_in_hierarchy(parent, child)
    return hierarchy_gap > 0

In [90]:
check_hierarchy(model, 'battery', 'battery life')

False

# Check manually created hierarchy

In [91]:
from aspects.analysis.unsupervised_aspect_hierarchies import amazon_cellphone_aspect_hierarchy_100000_reviews

In [92]:
# For every manually defined (nucleus, satellite) rule whose two aspects are both
# known to the embedding, record whether the embedding ranks the nucleus above
# the satellite.
known_aspects = model.kv.vocab
hierarchy_rows = []
for rule in amazon_cellphone_aspect_hierarchy_100000_reviews:
    nucleus, satellite = rule.nucleus, rule.satellite
    # rules mentioning an aspect without a vector cannot be checked
    if nucleus not in known_aspects or satellite not in known_aspects:
        continue
    hierarchy_rows.append((nucleus, satellite, check_hierarchy(model, nucleus, satellite)))

hierarchy_check_df = pd.DataFrame(
    hierarchy_rows,
    columns=['nucleus', 'satellite', 'is_correct_hierarchy'],
)

In [93]:
# with cycles
# NOTE(review): the label above conflicts with the rest of the notebook -- self-pairs
# are excluded from `relations` and the model file is named "without_cycles"; confirm
# which run this number belongs to.
# Share of manually defined pairs that the embedding orders correctly. The mean of
# the boolean column is that fraction and, unlike len()/len(), it does not raise
# ZeroDivisionError when no rule could be checked (the result is NaN instead).
hierarchy_check_df.is_correct_hierarchy.mean()

0.68

In [94]:
hierarchy_check_df[~hierarchy_check_df.is_correct_hierarchy]

Unnamed: 0,nucleus,satellite,is_correct_hierarchy
4,phone,price,False
10,case,price,False
13,phone,battery life,False
14,headset,sound,False
16,headset,bluetooth,False
20,headset,price,False
22,headset,quality,False
23,phone,use,False
26,case,design,False
27,case,color,False


# Visualization

In [56]:
from gensim.viz.poincare import poincare_2d_visualization, poincare_distance_heatmap

In [57]:
all_relations = list(set(relations))

In [58]:
# Aspects to label in the plot; only relations whose both ends are in this list are drawn.
show_node_labels = ['phone', 'battery', 'sound', 'bluetooth', 'headset', 'price', 'quality', 'case', 'use', 'volume', 'battery life']
filtered_set = {
    (source, target)
    for source, target in all_relations
    if source in show_node_labels and target in show_node_labels
}

In [59]:
filtered_set

{('battery', 'battery life'),
 ('battery', 'bluetooth'),
 ('battery', 'case'),
 ('battery', 'headset'),
 ('battery', 'phone'),
 ('battery', 'price'),
 ('battery', 'quality'),
 ('battery', 'sound'),
 ('battery', 'use'),
 ('battery', 'volume'),
 ('battery life', 'battery'),
 ('battery life', 'bluetooth'),
 ('battery life', 'case'),
 ('battery life', 'headset'),
 ('battery life', 'phone'),
 ('battery life', 'price'),
 ('battery life', 'quality'),
 ('battery life', 'sound'),
 ('battery life', 'use'),
 ('battery life', 'volume'),
 ('bluetooth', 'battery'),
 ('bluetooth', 'battery life'),
 ('bluetooth', 'phone'),
 ('bluetooth', 'price'),
 ('bluetooth', 'quality'),
 ('bluetooth', 'sound'),
 ('bluetooth', 'use'),
 ('bluetooth', 'volume'),
 ('case', 'battery'),
 ('case', 'battery life'),
 ('case', 'phone'),
 ('case', 'price'),
 ('case', 'quality'),
 ('case', 'sound'),
 ('case', 'use'),
 ('case', 'volume'),
 ('headset', 'battery'),
 ('headset', 'battery life'),
 ('headset', 'case'),
 ('headset',

In [60]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [61]:
init_notebook_mode(connected=True)

In [62]:
fig = poincare_2d_visualization(model, filtered_set, "Poincare Hierarchy", show_node_labels=show_node_labels)

In [63]:
iplot(fig)

In [None]:
# Render the figure and offer it as a PNG download. plotly's offline `iplot` only
# exports an image when `image` is given, and `filename` is expected without an
# extension (the format is appended from `image`).
iplot(fig, image='png', filename='poincare_viz')