# Imports 

In [1]:
import simplejson
import numpy as np
from glob import glob
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import basename, join
from pprint import pprint
import re
import pickle 
import cPickle
import pandas as pd

import networkx as nx
from operator import itemgetter
import matplotlib.pyplot as plt

import sys
sys.path.append("/root/sa/edu_dependency_parser/src")
from trees.parse_tree import ParseTree

%matplotlib inline

In [66]:
from spacy.en import English
nlp = English()

# Load parsed data 

In [135]:
# data_path = '/datasets/sentiment/aspects/results-brexit/results/brexit-news-all/'
data_path = '/root/sa/results/brexit-event-4516937-body-no-lemma/'

In [136]:
def load_serialized(f_path, f_name):
    with open(join(f_path, f_name)) as f:
        obj = cPickle.load(f)
    return obj

In [137]:
aspect_graph = load_serialized(data_path, 'aspects_graph') 

In [138]:
def graph_stats(graph, top_n=25):
    n_n = len(graph.nodes())
    n_e = len(graph.edges())
    print '#Nodes: {}'.format(n_n)
    print '#Edges: {}'.format(n_e)
    print 'Connections coverage: {}%'.format(( float(n_e) / (n_n * n_n ) * 100))
    
    degree_sequence=sorted(nx.degree(graph).values(),reverse=True) # degree sequence
    dmax=max(degree_sequence)

    print '-'*30
    print 'Highest degree of nodes'
    nd=sorted(aspect_graph.degree_iter(), key=itemgetter(1), reverse=True)[0:top_n]
    pprint([x for x in nd if len(x[0]) > 1]) # filter out aspect consists of one letter
    
    print '-'*30
    print 'Page Rank'
    pg = nx.pagerank(aspect_graph)
    pprint(sorted([(node, pagerank) for node, pagerank in pg.items()], key=lambda x:pg[x[0]], reverse=True)[:top_n])
    print '-'*30
    pprint('Unique Values of Page Rank: {}, unique nodes: {}'.format(len(set([pagerank for node, pagerank in pg.items()])), n_n))

In [139]:
graph_stats(aspect_graph)

#Nodes: 21600
#Edges: 5632390
Connections coverage: 1.20721664952%
------------------------------
Highest degree of nodes
[(u'eu', 31629),
 (u'britain', 31020),
 (u'brexit', 27704),
 (u'referendum', 25092),
 (u'europe', 24887),
 (u'vote', 23978),
 (u'people', 22832),
 (u'country', 22404),
 (u'uk', 22110),
 (u'cameron', 21706),
 (u'brussels', 20941),
 (u'years', 20874),
 (u'scotland', 20774),
 (u'time', 20111),
 (u'article', 19123),
 (u'london', 18671),
 (u'prime minister', 18600),
 (u'european union', 17973),
 (u'decision', 16873),
 (u'way', 16553),
 (u'leaders', 16551),
 (u'brexit vote', 15920),
 (u'countries', 15467),
 (u'world', 15171)]
------------------------------
Page Rank
[(u'britain', 0.031807100668532116),
 (u'vote', 0.012641978126903626),
 (u'eu', 0.010734990741155342),
 (u'country', 0.00971982725885178),
 (u'brexit', 0.007180940023793626),
 (u'scotland', 0.006202332707480315),
 (u'europe', 0.0060306415020623914),
 (u'referendum', 0.005814965026936518),
 (u'london', 0.005569

## Sentiment EDUs load

In [140]:
sent_edu = load_serialized(data_path, 'sentiment_filtered_edus')

## Aspects load 

In [143]:
aspect_edu = load_serialized(data_path, 'aspects_per_edu')

### Summarize sentiment per aspect

In [144]:
def summarize_aspect_sentiment(aspect_edu, sent_edu):
    doc_asp_sent = []
    for doc_id, aspects in aspect_edu.iteritems():
        for aspect in aspects:
            doc_asp_sent.append((aspect, sent_edu[doc_id]['sentiment'][0]))
    
    df = pd.DataFrame()
    df['aspect'] = [x[0] for x in doc_asp_sent]
    df['sentiment'] = [x[1] for x in doc_asp_sent]
    
    df_mean = df.groupby('aspect').mean().reset_index().sort('sentiment', ascending=False)
    df_mean.columns = ['aspect', 'mean_sent']
    df_sum = df.groupby('aspect').sum().reset_index().sort('sentiment', ascending=False)
    df_sum.columns = ['aspect', 'sum_sent']
    df_count = df.groupby('aspect').count().reset_index().sort('sentiment', ascending=False)
    df_count.columns = ['aspect', 'occurrences']
    
    df = pd.merge(df_mean, df_sum, right_index=True, left_index=True)
    df = pd.merge(df, df_count, right_index=True, left_index=True)
    
    df = df[[u'aspect', u'mean_sent', u'sum_sent', u'occurrences']]
    
    return doc_asp_sent, df

doc_asp_sent, df = summarize_aspect_sentiment(aspect_edu, sent_edu)



In [148]:
aspects_to_remove = ['day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years', 'monday', 'tuesday', 'wednesday', 
                     'thursday', 'friday', 'saturday', 'sunday', 'january', 'february', 'march', 'april', 'may', 'june', 'july',
                     'august', 'september', 'october', 'november', 'december']

In [151]:
df[(df.aspect.str.len() > 1) & (~df.aspect.isin(aspects_to_remove))].head(50)

Unnamed: 0,aspect,mean_sent,sum_sent,occurrences
1876,britain,0.393472,4894,12438
5997,eu,-0.542575,-2415,4451
20830,vote,0.117008,485,4145
16980,scotland,0.411281,1400,3404
3970,country,-0.596593,-1751,2935
2483,cameron,0.391127,961,2457
1451,bloc,-0.266639,-653,2449
16040,referendum,-0.161318,-377,2337
1624,brexit,-0.246844,-567,2297
6343,europe,0.292705,634,2166
