# Imports and settings

In [1]:
%load_ext autoreload
%autoreload 2

import os
import logging
import numpy as np
import pandas as pd

from gensim.models.poincare import PoincareModel, PoincareKeyedVectors, PoincareRelations

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

logging.basicConfig(level=logging.INFO)

In [2]:
from pathlib import Path

In [3]:
import spacy 
nlp = spacy.load('en')

In [4]:
from tqdm import tqdm  # for notebooks
tqdm.pandas()

# Load data and models

In [5]:
data_directory = Path('../results/reviews_Cell_Phones_and_Accessories/')
aspect_rules_file = os.path.join(data_directory, 'aspect-rules.csv')

In [6]:
aspect_df_min_10_times = pd.read_csv(data_directory / 'aspects_per_edu_filtered_min_10_freq.csv')
aspect_df_min_10_times.columns = ['aspect', 'count']

The model can be initialized using an iterable of relations, where a relation is simply a pair of nodes

In [7]:
aspect_df_min_10_times.head()

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636


In [8]:
aspect_rules_df = pd.read_csv(aspect_rules_file)
aspect_rules_df = aspect_rules_df[['id1', 'id2']]
aspect_rules_df.sample(2)

Unnamed: 0,id1,id2
2067,interface,cingular
216,design,case


## Add aspects from ConceptNet 

In [9]:
from aspects.enrichments import conceptnets

In [10]:
conceptnet = conceptnets.load_conceptnet_io()

INFO:aspects.enrichments.conceptnets:ConceptNet.io temp files will be load from: /home/laugustyniak/github/phd/sentiment-backend/aspects/data/conceptnet/conceptnet_io.pkl


 'car' -> 'HasPrerequisite' -> 'tire'

In [11]:
conceptnet['tire']

[{'end': 'active',
  'end-lang': 'en',
  'relation': 'Antonym',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 0.15},
 {'end': 'bus_depot',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'car',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'car_show',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'garage',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'trunk',
  'end-lang': 'en',
  'relation': 'AtLocation',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'get_leak',
  'end-lang': 'en',
  'relation': 'CapableOf',
  'start': 'tire',
  'start-lang': 'en',
  'weight': 1.0},
 {'end': 'need_attention_soon',
  'end-lang': 'en',
  'relation': 'CapableOf',
  'start': 'tire',
  'start-lang': 'en'

In [12]:
hierarchical_relations_parent_child = ['HasA', 'MadeOf', 'HasPrerequisite']
hierarchical_relations_child_parent = ['PartOf', 'IsA']

In [13]:
def get_neighbours_child_and_parents(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
    neighbours_childs = set(
        (concept, concept_info['relation'], "---->", concept_info['end'])
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_child
    )
    neighbours_parents = set(
        (concept_info['start'], concept_info['relation'],  "<----", concept)
        
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_parent
    )

    return list(neighbours_childs.union(neighbours_parents))

In [14]:
get_neighbours_child_and_parents(conceptnet, 'tire', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

[('tire', 'IsA', '---->', 'tire'),
 ('tire', 'PartOf', '---->', 'automobile'),
 ('tire', 'IsA', '---->', 'wheel'),
 ('tire', 'IsA', '---->', 'part_for_wheeled_vehicles'),
 ('tire', 'MadeOf', '<----', 'tire'),
 ('car', 'HasPrerequisite', '<----', 'tire'),
 ('tire', 'IsA', '---->', 'hoop'),
 ('tire', 'PartOf', '---->', "car's_wheel"),
 ('tire', 'IsA', '---->', 'indispose'),
 ('tire', 'PartOf', '---->', 'car'),
 ('tire', 'IsA', '---->', 'devolve'),
 ('tire', 'IsA', '---->', 'non_powered_device')]

In [15]:
def get_concepts_based_on_hierarchy_relation(conceptnet, concept, relation_types_get_child, relation_types_get_parent):
    neighbours_childs = set(
        (concept_info['end'].replace('_', ' '), concept)
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_child
    )
    neighbours_parents = set(
        (concept_info['start'].replace('_', ' '), concept)
        for concept_info
        in conceptnet[concept]
        if concept_info['relation'] in relation_types_get_parent
    )

    return list(neighbours_childs.union(neighbours_parents))

In [16]:
get_concepts_based_on_hierarchy_relation(conceptnet, 'battery', hierarchical_relations_child_parent, hierarchical_relations_parent_child)

[('using calculator', 'battery'),
 ('assault', 'battery'),
 ('source of current', 'battery'),
 ('solar energy system', 'battery'),
 ('electrical component', 'battery'),
 ('stamp mill', 'battery'),
 ('device stores energy chemically', 'battery'),
 ('start car', 'battery'),
 ('battery', 'battery'),
 ('voltage source', 'battery'),
 ('electrical device', 'battery'),
 ('collection', 'battery'),
 ('team', 'battery'),
 ('directed route', 'battery'),
 ('electrochemical cell', 'battery'),
 ('artillery', 'battery'),
 ('baseball team', 'battery'),
 ('power source', 'battery'),
 ('man made thing', 'battery')]

## Filter rules and aspects

In [17]:
aspect_rules_df = aspect_rules_df[aspect_rules_df.id1.isin(aspect_df_min_10_times.aspect.values) & aspect_rules_df.id2.isin(aspect_df_min_10_times.aspect.values)]

In [18]:
len(aspect_df_min_10_times.aspect.values)

946

In [19]:
aspect_rules_df.drop_duplicates(inplace=True)

In [20]:
relations = list(zip(aspect_rules_df.id2, aspect_rules_df.id1))

In [21]:
len(relations)

1551

In [22]:
# relations

## Add concepts from conceptnet 

In [23]:
from more_itertools import flatten

In [24]:
conceptnet_relations = list(flatten([
    get_concepts_based_on_hierarchy_relation(conceptnet, aspect, hierarchical_relations_child_parent, hierarchical_relations_parent_child)
    for aspect
    in aspect_df_min_10_times.aspect.values
]))

In [25]:
len(conceptnet_relations)

3099

In [26]:
all_relations = relations + conceptnet_relations

In [27]:
len(all_relations)

4650

## Filter aspects that are Named Entities [wrongly extracted during aspects extraction]

In [28]:
def is_named_entity(text):
    return True if nlp(text).ents else False

In [29]:
# aspect_rules_df = aspect_rules_df[~aspect_rules_df.progress_apply(lambda row: any([is_named_entity(row.id1), is_named_entity(row.id2)]), axis=1)]

In [30]:
# aspect_rules_df

## Train or load pre-trained embedding

In [64]:
model = PoincareModel(train_data=all_relations, size=2, burn_in=0)

INFO:gensim.models.poincare:loading relations from train data..
INFO:gensim.models.poincare:loaded 1551 relations from train data, 481 nodes


In [65]:
model.train(epochs=100, print_every=500)

INFO:gensim.models.poincare:training model of size 2 with 1 workers on 1551 relations for 100 epochs and 0 burn-in epochs, using lr=0.10000 burn-in lr=0.01000 negative=10
INFO:gensim.models.poincare:starting training (100 epochs)----------------------------------------
INFO:gensim.models.poincare:training finished


In [66]:
# Saves the entire PoincareModel instance, the loaded model can be trained further
model.save('aspect_rules_model')

INFO:gensim.utils:saving PoincareModel object under aspect_rules_model, separately None
INFO:gensim.utils:not storing attribute _node_counts_cumsum
INFO:gensim.utils:not storing attribute _node_probabilities
INFO:gensim.utils:saved aspect_rules_model


In [67]:
model = PoincareModel.load('aspect_rules_model')

INFO:gensim.utils:loading PoincareModel object from aspect_rules_model
INFO:gensim.utils:loading kv recursively from aspect_rules_model.kv.* with mmap=None
INFO:gensim.utils:setting ignored attribute _node_counts_cumsum to None
INFO:gensim.utils:setting ignored attribute _node_probabilities to None
INFO:gensim.utils:loaded aspect_rules_model


# Experiment with embeddings distances and other attributes

In [16]:
model.kv.distance('phone', 'battery')

0.8805650603076436

In [17]:
model.kv.distance('battery', 'phone')

0.8805650603076436

In [18]:
model.kv.distance('battery', 'sound')

0.837521578481867

In [19]:
model.kv.most_similar('battery')

[('europe', 0.08093198680902874),
 ('at & t', 0.08642383684136759),
 ('u.s.', 0.10222450055766011),
 ('accessory one', 0.11554119861045198),
 ('alltel', 0.14837914312316575),
 ('battery works', 0.15853504761528106),
 ('durability', 0.16727964344802884),
 ('extended life battery', 0.17513745444093765),
 ('never', 0.17926570359244448),
 ('bulky', 0.182199664237944)]

In [52]:
model.kv.closest_child('battery')

'oem'

In [53]:
model.kv.closest_parent('battery')

'europe'

In [54]:
model.kv.ancestors('phone')

['voice command',
 'operating system',
 'ipod',
 'quality sound',
 'voice quality',
 'size',
 'power',
 'shipping charge',
 'built',
 'ear hook']

In [55]:
model.kv.descendants('phone')

['outlook', 'nokia', 'earpiece', 'ear speaker', 'sync']

In [56]:
# Saves only the vectors from the PoincareModel instance, in the commonly used word2vec format
# model.kv.save_word2vec_format('aspect_rules_vectors')
# PoincareKeyedVectors.load_word2vec_format('aspect_rules_vectors')

In [57]:
# Rank of distance of node 2 from node 1 in relation to distances of all nodes from node 1
model.kv.rank('phone', 'battery')

127

In [58]:
# Closest child node
model.kv.closest_child('sound')

'battery life'

In [59]:
# Closest child node
model.kv.closest_parent('bluetooth')

'carry'

In [60]:
# # Position in hierarchy - lower values represent that the node is higher in the hierarchy
# print(model.kv.norm('virginia_deer.n.01'))
# print(model.kv.norm('sheep.n.01'))
# print(model.kv.norm('dog.n.01'))
# print(model.kv.norm('placental.n.01'))
# print(model.kv.norm('mammal.n.01'))

In [61]:
# Difference in hierarchy between the first node and the second node
# Positive values indicate the first node is higher in the hierarchy
print(model.kv.difference_in_hierarchy('phone', 'battery'))

0.14496697992576207


In [78]:
print(model.kv.difference_in_hierarchy('battery', 'battery life'))

-0.28312252769231994


In [76]:
model.kv.difference_in_hierarchy('bluetooth', 'headset')

0.35453929420589597

In [81]:
model.kv.difference_in_hierarchy('charger', 'phone')

-0.007124859370829295

In [82]:
model.kv.difference_in_hierarchy('signal', 'phone')

0.04070938856988271

In [65]:
# One possible descendant chain
model.kv.descendants('sound')

['battery life', 'volume controls', 'ear pieces', 'aa', 'bad']

In [66]:
# One possible ancestor chain
model.kv.ancestors('sound')

['controls',
 'music',
 'amazon',
 'priced',
 'quality',
 'volume buttons',
 'use',
 'ear hook']

# Visualization

In [67]:
from gensim.viz.poincare import poincare_2d_visualization, poincare_distance_heatmap

In [68]:
all_relations = list(set(relations))

In [69]:
show_node_labels = ['phone', 'battery', 'sound', 'bluetooth', 'headset', 'price']
filtered_set = set()
for relation in all_relations:
    if relation[0] in show_node_labels and relation[1] in show_node_labels:
        filtered_set.add(relation)

In [70]:
filtered_set

{('battery', 'phone'),
 ('battery', 'price'),
 ('bluetooth', 'price'),
 ('bluetooth', 'sound'),
 ('headset', 'bluetooth'),
 ('phone', 'battery'),
 ('phone', 'bluetooth'),
 ('phone', 'price'),
 ('price', 'battery'),
 ('price', 'bluetooth'),
 ('price', 'headset'),
 ('price', 'phone'),
 ('price', 'sound'),
 ('sound', 'price')}

In [71]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [84]:
init_notebook_mode(connected=True)

In [73]:
fig = poincare_2d_visualization(model, filtered_set, "Poincare Hierarchy", show_node_labels=show_node_labels)

In [2]:
iplot(fig)

NameError: name 'iplot' is not defined

In [50]:
iplot(fig, filename='poincare_viz.png')