In [1]:
from os.path import basename, join, exists
from pathlib import Path
from os import system, makedirs
from glob import glob
from tqdm import tqdm_notebook
from more_itertools import flatten

import sys
import re

import pandas as pd
import networkx as nx

sys.path.append("../edu_dependency_parser")
from trees.parse_tree import ParseTree

sys.path.append("../aspects")
from analysis.aspects_hierarchy import get_aspects_hierarchy
from analysis import rst_data_extractors

from utilities.transformations import load_serialized

# Load dataset specific files

In [2]:
# Root directory with one dataset's results. Every other path in this notebook
# is built from it with join(), so change this value to analyse another dataset.
# NOTE(review): this is an absolute, machine-specific path that ends with '/' —
# confirm it exists on the machine running the notebook.
data_path = '/datasets/sentiment/aspects/results/reviews_Cell_Phones_and_Accessories/'

In [3]:
# Dataset name = last component of the results directory.
# Path(...).name ignores a trailing separator, whereas os.path.basename() of a
# path ending in '/' returns '' (data_path does end with '/').
dataset_name = Path(data_path).name

# Locations of the individual artefacts inside the results directory.
data_path_trees = join(data_path, 'edu_trees_dir')
data_path_link_tree = join(data_path, 'link_trees_dir')
data_path_aspects_graph = join(data_path, 'aspects_graph')
data_path_aspects_page_ranks = join(data_path, 'aspects_page_ranks')
data_path_extracted_documents = join(data_path, 'extracted_documents')

# Extracted documents loading 

In [4]:
# Load the extracted documents from the `extracted_documents` directory.
# Judging by the preview output further down, the result maps a document id to
# its raw text — TODO confirm against rst_data_extractors.
extracted_documents_data = rst_data_extractors.get_extracted_documents(data_path_extracted_documents)

100001it [00:05, 16798.68it/s]


In [5]:
# Preview the first two (document id, text) pairs.
# items() returns a non-sliceable view on Python 3, so materialise it first;
# on Python 2 this is just a copy of the list items() already returns.
list(extracted_documents_data.items())[:2]

[(0,
  'this phone case is awesome  and  a great deal if, I were to buy a  a phone from the  mall  they would of charge me twenty-five dollars. but this was a great deal to buy this phone case online.'),
 (1,
  'It is big and heavy.  I switched back to my regular earpiece with the wire!')]

In [7]:
# Sanity check: read the single serialized file named '0' directly. Its output
# is the same text the preview shows under key 0.
load_serialized(join(data_path_extracted_documents, '0'))

'this phone case is awesome  and  a great deal if, I were to buy a  a phone from the  mall  they would of charge me twenty-five dollars. but this was a great deal to buy this phone case online.'

# Docs info analysis

In [8]:
# Load the serialized `documents_info` object stored in the results directory.
docs_info = load_serialized(join(data_path, 'documents_info'))

In [None]:
# Preview the first two entries of docs_info.
# items() returns a non-sliceable view on Python 3, so materialise it first;
# on Python 2 this is just a copy of the list items() already returns.
list(docs_info.items())[:2]

# Aspects per EDU analysis

In [None]:
# Load the serialized `aspects_per_edu` object stored in the results directory.
# presumably a mapping of EDU id -> collection of aspects — verify against the producer
aspects_per_edu = load_serialized(join(data_path, 'aspects_per_edu'))

In [None]:
# Preview the first ten entries of aspects_per_edu.
# items() returns a non-sliceable view on Python 3, so materialise it first;
# on Python 2 this is just a copy of the list items() already returns.
list(aspects_per_edu.items())[:10]

## Get docs with more than one edu with aspect

In [None]:
# Filter docs_info with a threshold of 2.
# Per the helper's name this keeps documents with at least 2 accepted aspects;
# its implementation is not shown here — TODO confirm.
document_info_at_least_2_aspects_accepted = rst_data_extractors.get_docs_info_with_at_least_n_aspects_accepted(docs_info, 2)

In [None]:
# Preview the first two documents that passed the filter.
# items() returns a non-sliceable view on Python 3, so materialise it first;
# on Python 2 this is just a copy of the list items() already returns.
list(document_info_at_least_2_aspects_accepted.items())[:2]

In [None]:
# Number of entries left after the filter.
len(document_info_at_least_2_aspects_accepted)

In [None]:
# Show the entry stored under key 0 of the extracted documents.
extracted_documents_data[0]

## Get aspects distribution

In [None]:
from collections import Counter

In [None]:
# Frequency of every aspect: walk each value of `aspects_per_edu` and count its
# elements one by one (one level of flattening).
aspects_counter = Counter(
    aspect
    for edu_aspects in aspects_per_edu.values()
    for aspect in edu_aspects
)

In [None]:
# The ten most frequent aspects together with their counts.
aspects_counter.most_common(10)

In [None]:
# How often the aspect 'phone' was counted (a Counter yields 0 for unseen keys).
aspects_counter['phone']

## Get document with provided aspects

In [None]:
# Preview two documents selected by the helper for the aspect 'car charger'.
# items() returns a non-sliceable view on Python 3, so materialise it first;
# on Python 2 this is just a copy of the list items() already returns.
list(
    rst_data_extractors.get_document_based_on_aspect(
        docs_info, extracted_documents_data, 'car charger'
    ).items()
)[:2]

# Aspect-aspect graph analysis

## Aspects' page ranks

In [None]:
# Load the serialized page-rank object from the `aspects_page_ranks` path.
# It is consumed below via .items() as (aspect, page rank) pairs.
aspects_page_ranks = load_serialized(data_path_aspects_page_ranks)

In [None]:
# One row per (aspect, page rank) pair, as a two-column DataFrame.
aspects_page_ranks_df = pd.DataFrame(
    list(aspects_page_ranks.items()),
    columns=['aspect', 'pagerank'],
)

In [None]:
# Show the first ten rows of the page-rank table.
aspects_page_ranks_df.head(10)

In [None]:
# Add a `count` column: the value of `aspects_counter` for each row's aspect.
aspects_page_ranks_df['count'] = aspects_page_ranks_df['aspect'].map(
    lambda aspect: aspects_counter[aspect]
)

In [None]:
# Show the first ten rows again, now including the `count` column.
aspects_page_ranks_df.head(10)

In [None]:
# Load the serialized aspect-aspect graph.
# presumably a networkx graph, given the nx-style calls made on it below — TODO confirm
aspects_graph = load_serialized(data_path_aspects_graph)

In [None]:
# Display a directed version of the graph.
# NOTE(review): the result is not assigned, so `aspects_graph` itself stays
# unchanged; the out_degree()/in_degree() calls below use the original object.
aspects_graph.to_directed()

### Edges analysis

The direction of an edge denotes the relation from satellite to nucleus: S -> N.

In [None]:
# Tally the items returned by aspects_graph.edges(); an edge that is returned
# several times gets a count above 1.
edges_counter = Counter(aspects_graph.edges())

In [None]:
# The ten most frequent edges with their counts.
edges_counter.most_common(10)

In [None]:
# Compute a hierarchy element for each of the 50 most frequent edges.
# A set comprehension de-duplicates the results directly, replacing the
# list -> set round trip; the edge count itself is not needed, hence `_`.
aspect_hierarchy_elements = {
    get_aspects_hierarchy(edges_counter, aspects_page_ranks, aspect_1, aspect_2)
    for (aspect_1, aspect_2), _ in edges_counter.most_common(50)
}

In [None]:
# Display the de-duplicated hierarchy elements.
aspect_hierarchy_elements

In [None]:
# Out-degree of the node 'phone' (edges leaving it; the notebook's convention is
# that edges point satellite -> nucleus).
aspects_graph.out_degree()['phone']

In [None]:
# In-degree of the node 'phone' (edges arriving at it; the notebook's convention
# is that edges point satellite -> nucleus).
aspects_graph.in_degree()['phone']

In [None]:
# Out-degree of the node 'product' (edges leaving it).
aspects_graph.out_degree()['product']

In [None]:
# In-degree of the node 'product' (edges arriving at it).
aspects_graph.in_degree()['product']

In [None]:
# Out-degree of the node 'battery' (edges leaving it).
aspects_graph.out_degree()['battery']

In [None]:
# In-degree of the node 'battery' (edges arriving at it).
aspects_graph.in_degree()['battery']

In [None]:
# Compute the degree lookups once for the whole graph so the per-row lookups
# below do not call in_degree()/out_degree() again for every aspect.
aspects_in_degree = aspects_graph.in_degree()
aspects_out_degree = aspects_graph.out_degree()

In [None]:
# Add each aspect's in-degree and out-degree in the aspect graph as new columns.
aspects_page_ranks_df['in_degree'] = aspects_page_ranks_df['aspect'].map(
    lambda aspect: aspects_in_degree[aspect]
)
aspects_page_ranks_df['out_degree'] = aspects_page_ranks_df['aspect'].map(
    lambda aspect: aspects_out_degree[aspect]
)

In [None]:
# Show the first ten rows, now including the in_degree and out_degree columns.
aspects_page_ranks_df.head(10)

In [None]:
# Number of strongly connected components in the directed view of the graph.
# The networkx helper returns the count directly, so no throwaway list of
# components is built just to take its length.
nx.number_strongly_connected_components(aspects_graph.to_directed())

In [None]:
# G = max(nx.weakly_connected_component_subgraphs(aspects_graph), key=len)