In [1]:
import operator
import re
import sys
from collections import Counter
from glob import glob
from os import system, makedirs
from os.path import basename, join, exists
from pathlib import Path

import networkx as nx
import pandas as pd
from more_itertools import flatten
from tqdm import tqdm_notebook

sys.path.append("../edu_dependency_parser")
from trees.parse_tree import ParseTree

sys.path.append("../aspects")
from analysis.aspects_hierarchy import get_aspects_hierarchy
from analysis import rst_data_extractors

from utilities.transformations import load_serialized

# Load dataset specific files

In [2]:
# Root directory holding the processing results of one dataset. The artefact
# paths used in the cells below are all built by joining onto this value, so
# point it at another results directory to analyse a different dataset.
data_path = '/datasets/sentiment/aspects/results/reviews_Cell_Phones_and_Accessories/'

In [3]:
# basename() of a path ending with a separator is the empty string, so strip
# trailing slashes first to recover the dataset directory name.
dataset_name = basename(data_path.rstrip('/'))

# Locations of the individual artefacts stored under the results directory.
data_path_trees = join(data_path, 'edu_trees_dir')
data_path_link_tree = join(data_path, 'link_trees_dir')
data_path_aspects_graph = join(data_path, 'aspects_graph')
data_path_aspects_page_ranks = join(data_path, 'aspects_page_ranks')
data_path_extracted_documents = join(data_path, 'extracted_documents')

# Extracted documents loading 

In [4]:
# Load the extracted documents; the preview in the next cell shows the result
# is keyed by integer document id with the raw review text as value.
extracted_documents_data = rst_data_extractors.get_extracted_documents(data_path_extracted_documents)

100001it [00:05, 16884.00it/s]


In [5]:
# Preview the first two (document id, text) pairs. list() is required because
# dict.items() returns a non-sliceable view on Python 3; on Python 2 it is a
# harmless copy.
list(extracted_documents_data.items())[:2]

[(0,
  'this phone case is awesome  and  a great deal if, I were to buy a  a phone from the  mall  they would of charge me twenty-five dollars. but this was a great deal to buy this phone case online.'),
 (1,
  'It is big and heavy.  I switched back to my regular earpiece with the wire!')]

In [6]:
# Read the serialized file named '0' directly from the extracted-documents
# directory; its content matches the entry for document id 0 shown above.
load_serialized(join(data_path_extracted_documents, '0'))

'this phone case is awesome  and  a great deal if, I were to buy a  a phone from the  mall  they would of charge me twenty-five dollars. but this was a great deal to buy this phone case online.'

# Docs info analysis

In [7]:
# Per-document metadata; the preview below shows each entry carries the keys
# 'EDUs', 'accepted_edus', 'aspect_concepts', 'aspect_keywords', 'aspects'
# and 'sentiment'.
docs_info = load_serialized(join(data_path, 'documents_info'))

In [8]:
# Preview the first two documents' metadata. list() keeps the slice valid on
# Python 3, where dict.items() is a view that cannot be sliced.
list(docs_info.items())[:2]

[(0,
  {'EDUs': [0, 1, 2, 3, 4, 5],
   'accepted_edus': [0, 1, 2, 3, 4, 5],
   'aspect_concepts': {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}},
   'aspect_keywords': {0: {'rake': [(u'phone case', 4.0), (u'awesome', 1.0)],
     'text_rank': []},
    1: {'rake': [(u'great deal', 4.0)], 'text_rank': []},
    2: {'rake': [(u'phone', 1.0), (u'mall', 1.0), (u'buy', 1.0)],
     'text_rank': []},
    3: {'rake': [(u'charge', 1.0), (u'dollars', 1.0), (u'twenty-', 1.0)],
     'text_rank': []},
    4: {'rake': [(u'great deal', 4.0)], 'text_rank': []},
    5: {'rake': [(u'phone case online', 9.0), (u'buy', 1.0)],
     'text_rank': []}},
   'aspects': {0: [u'phone case'],
    1: [u'deal'],
    2: [u'phone', u'mall'],
    3: [u'dollar'],
    4: [u'deal'],
    5: [u'phone case']},
   'sentiment': {0: 1, 1: 1, 2: -1, 3: -1, 4: 1, 5: -1}}),
 (1,
  {'EDUs': [6, 7],
   'accepted_edus': [6, 7],
   'aspect_concepts': {6: {}, 7: {}},
   'aspect_keywords': {6: {'rake': [(u'heavy', 1.0), (u'big', 1.0)],
     't

# Aspects per EDU analysis

In [9]:
# Mapping from EDU id to the list of aspects found in that EDU (see the
# preview in the next cell); the list is empty when no aspect was extracted.
aspects_per_edu = load_serialized(join(data_path, 'aspects_per_edu'))

In [10]:
# Preview the first ten (EDU id, aspects) pairs. list() keeps the slice valid
# on Python 3, where dict.items() is a view that cannot be sliced.
list(aspects_per_edu.items())[:10]

[(0, [u'phone case']),
 (1, [u'deal']),
 (2, [u'phone', u'mall']),
 (3, [u'dollar']),
 (4, [u'deal']),
 (5, [u'phone case']),
 (6, []),
 (7, [u'earpiece', u'wire']),
 (8, [u'antenna']),
 (9, [u'mount'])]

## Get docs with more than one edu with aspect

In [106]:
# Keep only the documents selected by the extractor for the given threshold.
# NOTE(review): the variable name and the section heading say "2"/"more than
# one", but the threshold passed here is 10 -- confirm which value is intended
# and align the name with it.
document_info_at_least_2_aspects_accepted = rst_data_extractors.get_docs_info_with_at_least_n_aspects_accepted(docs_info, 10)

In [107]:
# Preview the first two filtered documents. list() keeps the slice valid on
# Python 3, where dict.items() is a view that cannot be sliced.
list(document_info_at_least_2_aspects_accepted.items())[:2]

[(6,
  {'EDUs': [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
   'accepted_edus': [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
   'aspect_concepts': {27: {},
    28: {},
    29: {},
    30: {},
    31: {},
    32: {},
    33: {},
    34: {},
    35: {},
    36: {},
    37: {},
    38: {},
    39: {},
    40: {}},
   'aspect_keywords': {27: {'rake': [(u'fact', 1.0)], 'text_rank': []},
    28: {'rake': [(u'clips', 1.0), (u'belt', 1.0)], 'text_rank': []},
    29: {'rake': [(u'cloth', 1.0), (u'months', 1.0)], 'text_rank': []},
    30: {'rake': [(u'metal clip', 4.0), (u'covering', 1.0)], 'text_rank': []},
    31: {'rake': [(u'loose', 1.0)], 'text_rank': []},
    32: {'rake': [(u'small inconvenience', 4.0)], 'text_rank': []},
    33: {'rake': [(u'flip phone close', 9.0)], 'text_rank': []},
    34: {'rake': [], 'text_rank': []},
    35: {'rake': [], 'text_rank': []},
    36: {'rake': [(u'phone fits', 4.0), (u'snugly', 1.0)], 'text_rank': []},
    37: {'rake': [(u'acci

### Number of aspects extracted

In [100]:
# Flatten document -> EDU -> aspects into one list of aspect occurrences
# (duplicates are kept so they can be counted later).
all_extracted_aspects = [
    aspect
    for doc_info in tqdm_notebook(document_info_at_least_2_aspects_accepted.values())
    for edu_aspects in doc_info['aspects'].values()
    for aspect in edu_aspects
]




In [108]:
# Number of documents kept by get_docs_info_with_at_least_n_aspects_accepted.
len(document_info_at_least_2_aspects_accepted)

42086

In [101]:
# Total number of aspect occurrences (repeated aspects counted every time).
len(all_extracted_aspects)

1474714

In [102]:
# Number of distinct aspects among all occurrences.
len(set(all_extracted_aspects))

102025

In [103]:
# Frequency table: aspect -> number of occurrences in all_extracted_aspects.
all_extracted_aspects_counter = Counter(all_extracted_aspects)

In [104]:
# Show only the ten most frequent aspects; calling most_common() without a
# limit would render every distinct aspect in the cell output.
all_extracted_aspects_counter.most_common(10)

[(u'phone', 92159),
 (u'headset', 26847),
 (u'case', 23542),
 (u'product', 22290),
 (u'battery', 19719),
 (u'problem', 15596),
 (u'what', 14072),
 (u'price', 13269),
 (u'one', 13254),
 (u'ear', 13164)]

In [62]:
# First two documents that contain at least one EDU whose sentiment is 0.
[
    doc_info
    for doc_info in tqdm_notebook(document_info_at_least_2_aspects_accepted.values())
    if 0 in doc_info['sentiment'].values()
][:2]

[{'EDUs': [6, 7],
  'accepted_edus': [6, 7],
  'aspect_concepts': {6: {}, 7: {}},
  'aspect_keywords': {6: {'rake': [(u'heavy', 1.0), (u'big', 1.0)],
    'text_rank': []},
   7: {'rake': [(u'regular earpiece', 4.0),
     (u'switched back', 4.0),
     (u'wire', 1.0)],
    'text_rank': []}},
  'aspects': {6: [], 7: [u'earpiece', u'wire']},
  'sentiment': {6: 0, 7: -1}},
 {'EDUs': [8, 9, 10, 11, 12],
  'accepted_edus': [8, 9, 10, 11, 12],
  'aspect_concepts': {8: {}, 9: {}, 10: {}, 11: {}, 12: {}},
  'aspect_keywords': {8: {'rake': [(u'decent antenna', 4.0)], 'text_rank': []},
   9: {'rake': [(u'mount', 1.0)], 'text_rank': []},
   10: {'rake': [(u'thin metal horizontal', 9.0),
     (u'mount kit seperately', 9.0),
     (u'vertical bar', 4.0),
     (u'screws', 1.0),
     (u'buy', 1.0)],
    'text_rank': []},
   11: {'rake': [(u'6db gain', 4.0), (u'truck', 1.0), (u'mirror', 1.0)],
    'text_rank': []},
   12: {'rake': [(u'-- minimal gain', 9.0), (u'$ 200 amplifier', 1.0)],
    'text_rank': [

In [13]:
# NOTE(review): the output recorded for this cell (execution count 13) differs
# from the one recorded for the identical expression at execution count 108,
# so the two were produced from different states of the filtered dict --
# re-run the notebook top to bottom to get consistent numbers.
len(document_info_at_least_2_aspects_accepted)

98528

In [14]:
# Raw text of the document with id 0.
extracted_documents_data[0]

'this phone case is awesome  and  a great deal if, I were to buy a  a phone from the  mall  they would of charge me twenty-five dollars. but this was a great deal to buy this phone case online.'

## Get aspects distribution

In [15]:
from collections import Counter

In [16]:
# Count how many times each aspect occurs across all EDUs.
aspects_counter = Counter(
    aspect
    for edu_aspects in aspects_per_edu.values()
    for aspect in edu_aspects
)

In [17]:
# Ten most frequent aspects over all EDUs, with their occurrence counts.
aspects_counter.most_common(10)

[(u'phone', 92275),
 (u'headset', 26873),
 (u'case', 23596),
 (u'product', 22422),
 (u'battery', 19786),
 (u'problem', 15620),
 (u'what', 14095),
 (u'price', 13375),
 (u'one', 13281),
 (u'ear', 13179)]

In [18]:
# Occurrence count of a single aspect (a Counter returns 0 for unseen keys).
aspects_counter['phone']

92275

## Get document with provided aspects

In [91]:
# Documents returned by the extractor for the aspect 'car charger'; show a
# small window of them. list() keeps the slice valid on Python 3, where
# dict.items() is a view that cannot be sliced.
documents_with_aspect = rst_data_extractors.get_document_based_on_aspect(
    docs_info, extracted_documents_data, 'car charger'
)
list(documents_with_aspect.items())[10:14]

100%|██████████| 99681/99681 [00:00<00:00, 150076.77it/s]


[(24620,
  "Not much to say other than since this is a genuine name-brand high quality charger, you aren't apt to fry your device or slowly destroy the innards with a low-grade poorly regulated charger.  This one charges about as quickly as I've seen a car charger do its job, has a very nice decently long cord, seems well constructed, and doesn't leave me worrying about how much shorter the life expectancy of my phone is going to be if it doesn't get fried in the meantime.  I do expect that this is going to outlast the junk I've purchased in the past for $2-3.  Surprisingly, this isn't that much more expensive than those, so there is no reason to skimp on that $1-2."),
 (24622,
  "I am a salesman who travels 4 days a week and I rely heavily on my car chargers. This one is durable and well built and performs well. I've gone through quite a few car chargers and this one is by far the best one I've found so far."),
 (81968,
  'Knowing that I have a car charger, is huge....I always forget 

# Aspect-aspect graph analysis

## Aspect's page ranks

In [20]:
# Serialized mapping aspect -> PageRank score (it is iterated with .items()
# in the next cell to build the DataFrame).
aspects_page_ranks = load_serialized(data_path_aspects_page_ranks)

In [21]:
# One row per aspect with its PageRank score, in the mapping's iteration order.
aspects_page_ranks_df = pd.DataFrame(
    list(aspects_page_ranks.items()),
    columns=['aspect', 'pagerank']
)

In [22]:
# First ten rows of the aspect / PageRank table.
aspects_page_ranks_df.head(10)

Unnamed: 0,aspect,pagerank
0,phone,0.050792
1,product,0.034677
2,headset,0.021423
3,case,0.020639
4,price,0.011911
5,battery,0.011545
6,item,0.01077
7,charger,0.010195
8,motorola,0.005958
9,what,0.005383


In [23]:
# Attach each aspect's occurrence count (0 for aspects absent from the counter).
aspects_page_ranks_df['count'] = aspects_page_ranks_df['aspect'].apply(
    lambda aspect_name: aspects_counter[aspect_name]
)

In [24]:
# First ten rows again, now including the 'count' column.
aspects_page_ranks_df.head(10)

Unnamed: 0,aspect,pagerank,count
0,phone,0.050792,92275
1,product,0.034677,22422
2,headset,0.021423,26873
3,case,0.020639,23596
4,price,0.011911,13375
5,battery,0.011545,19786
6,item,0.01077,7876
7,charger,0.010195,12405
8,motorola,0.005958,8031
9,what,0.005383,14095


In [25]:
# Serialized aspect-aspect graph; the cells below query it through the
# networkx API (edges(), in_degree(), out_degree(), to_directed()).
aspects_graph = load_serialized(data_path_aspects_graph)

In [26]:
# Only displays the type of the directed view/copy (a MultiDiGraph per the
# recorded output). The result is not assigned, so aspects_graph is unchanged.
aspects_graph.to_directed()

<networkx.classes.multidigraph.MultiDiGraph at 0x7f5da31550d0>

### Edges analysis

The direction of an edge denotes the relation from satellite to nucleus: S -> N

In [27]:
# Count how often each (source, target) pair appears among the graph's edges;
# repeated pairs accumulate, as the counts in the next cell's output show.
edges_counter = Counter(aspects_graph.edges())

In [28]:
# Ten most frequent (source, target) aspect pairs.
edges_counter.most_common(10)

[((u'phone', u'case'), 481),
 ((u'phone', u'product'), 351),
 ((u'battery', u'phone'), 234),
 ((u'ear', u'headset'), 232),
 ((u'phone', u'headset'), 221),
 ((u'phone', u'battery'), 214),
 ((u'problem', u'phone'), 203),
 ((u'phone', u'charger'), 186),
 ((u'phone', u'price'), 158),
 ((u'camera', u'phone'), 157)]

In [89]:
aspect = u'volume'
sorted({
    (aspects, cooccurence)
    for aspects, cooccurence
    in edges_counter.items()
    if aspect in aspects
}, key=operator.itemgetter(1), reverse=True)[:10]    

[((u'volume', u'headset'), 64),
 ((u'volume', u'phone'), 41),
 ((u'volume', u'product'), 31),
 ((u'volume', u'headphone'), 15),
 ((u'volume', u'case'), 14),
 ((u'volume', u'price'), 10),
 ((u'headset', u'volume'), 10),
 ((u'volume', u'motorola'), 9),
 ((u'ear', u'volume'), 8),
 ((u'volume', u'bluetooth'), 8)]

In [63]:
# Resolve the hierarchy for each of the 250 most frequent edges; collecting
# into a set drops duplicate results.
aspect_hierarchy_elements = {
    get_aspects_hierarchy(edges_counter, aspects_page_ranks, first_aspect, second_aspect)
    for (first_aspect, second_aspect), _edge_count in edges_counter.most_common(250)
}

In [64]:
# Hierarchy for one hand-picked pair; the recorded result names the nucleus,
# the satellite and the total number of relations between the two aspects.
get_aspects_hierarchy(edges_counter, aspects_page_ranks, 'phone', 'camera')

AspectHierarchyRST(nucleus='phone', satellite='camera', all_relations=176)

In [65]:
import operator

In [66]:
# Order the hierarchy entries by their third field (all_relations in the
# recorded output), largest first.
sorted(
    aspect_hierarchy_elements,
    key=lambda hierarchy: hierarchy[2],
    reverse=True
)

[AspectHierarchyRST(nucleus=u'phone', satellite=u'case', all_relations=633),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'battery', all_relations=448),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'product', all_relations=446),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'headset', all_relations=322),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'problem', all_relations=280),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'charger', all_relations=270),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'price', all_relations=266),
 AspectHierarchyRST(nucleus=u'headset', satellite=u'ear', all_relations=251),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'feature', all_relations=213),
 AspectHierarchyRST(nucleus=u'product', satellite=u'headset', all_relations=206),
 AspectHierarchyRST(nucleus=u'product', satellite=u'price', all_relations=204),
 AspectHierarchyRST(nucleus=u'phone', satellite=u'thing', all_relations=192),
 AspectHierarchyRST(nucleus=u'battery', satelli

In [31]:
# Out-degree of the 'phone' node, queried for that single node.
aspects_graph.out_degree('phone')

9555

In [32]:
# In-degree of the 'phone' node, queried for that single node.
aspects_graph.in_degree('phone')

13198

In [33]:
# Out-degree of the 'product' node, queried for that single node.
aspects_graph.out_degree('product')

2679

In [34]:
# In-degree of the 'product' node, queried for that single node.
aspects_graph.in_degree('product')

7696

In [35]:
# Out-degree of the 'battery' node, queried for that single node.
aspects_graph.out_degree('battery')

2154

In [36]:
# In-degree of the 'battery' node, queried for that single node.
aspects_graph.in_degree('battery')

2499

In [37]:
# Compute the per-node degree tables once so the column-building cell below
# can look every aspect up without re-querying the graph.
aspects_in_degree = aspects_graph.in_degree()
aspects_out_degree = aspects_graph.out_degree()

In [38]:
# Attach each aspect's in- and out-degree in the aspect graph.
aspects_page_ranks_df['in_degree'] = aspects_page_ranks_df['aspect'].apply(
    lambda aspect_name: aspects_in_degree[aspect_name]
)
aspects_page_ranks_df['out_degree'] = aspects_page_ranks_df['aspect'].apply(
    lambda aspect_name: aspects_out_degree[aspect_name]
)

In [39]:
# First ten rows, now including the 'in_degree' and 'out_degree' columns.
aspects_page_ranks_df.head(10)

Unnamed: 0,aspect,pagerank,count,in_degree,out_degree
0,phone,0.050792,92275,13198,9555
1,product,0.034677,22422,7696,2679
2,headset,0.021423,26873,5904,2351
3,case,0.020639,23596,5216,2375
4,price,0.011911,13375,2750,2178
5,battery,0.011545,19786,2499,2154
6,item,0.01077,7876,2275,947
7,charger,0.010195,12405,2363,1474
8,motorola,0.005958,8031,1577,788
9,what,0.005383,14095,1136,1705


In [40]:
# Number of strongly connected components, counted straight from the
# generator without building an intermediate list.
sum(
    1
    for component in nx.strongly_connected_components(aspects_graph.to_directed())
)

14304

In [41]:
# G = max(nx.weakly_connected_component_subgraphs(aspects_graph), key=len)