In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json 

from more_itertools import flatten

import plotly 
plotly.offline.init_notebook_mode(connected=True)

# `display` and `HTML` belong to IPython's public `IPython.display` API; importing
# them from the internal `IPython.core.display` module is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

%matplotlib inline

In [3]:
import pandas as pd

In [4]:
# Load the aspects JSON (the file name suggests aspects grouped per EDU).
# The encoding is given explicitly so the read does not depend on the platform's
# default locale encoding.
with open('../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu.json', encoding='utf-8') as f:
    aspects = json.load(f)

In [5]:
# Collapse the per-key aspect collections into one flat list; duplicates are kept
# because the following cells count occurrences.
aspects_flatten = [aspect for group in aspects.values() for aspect in group]

In [6]:
# One row per aspect mention, held in a single 'aspect' column.
aspects_df = pd.DataFrame(data=aspects_flatten, columns=['aspect'])

In [7]:
# Count how often each aspect occurs. The names that `value_counts()` gives to the
# resulting Series and its index differ between pandas versions (pandas >= 2.0 names
# the Series 'count' and the index 'aspect'), so normalise them explicitly: the counts
# live in a column called 'aspect' and the index (the aspect labels) is unnamed, which
# is what the following cells rely on.
aspects__unique_df = (
    aspects_df['aspect']
    .value_counts()
    .rename('aspect')
    .rename_axis(None)
    .to_frame()
)

In [8]:
# Keep only aspects whose occurrence count (stored in the 'aspect' column at this
# point) is at least 10.
aspects__unique__filtered_df = aspects__unique_df.loc[aspects__unique_df['aspect'] >= 10]

In [9]:
# Persist the frequency-filtered aspect counts as an Excel sheet next to the input.
aspects__unique__filtered_df.to_excel(
    '../results/reviews_Cell_Phones_and_Accessories/aspects_per_edu_filtered_min_10_freq.xlsx'
)

In [10]:
# Move the aspect labels out of the index into a regular column and name the two
# columns explicitly. The frame is rebound to the same name, so the conversion is
# guarded: without the check, re-running this cell would call reset_index() a second
# time, produce an extra column, and fail on the two-name column assignment.
if list(aspects__unique__filtered_df.columns[:2]) != ['aspect', 'count']:
    aspects__unique__filtered_df = aspects__unique__filtered_df.reset_index()
    aspects__unique__filtered_df.columns = ['aspect', 'count']
aspects__unique__filtered_df

Unnamed: 0,aspect,count
0,price,6383
1,motorola,4246
2,battery,3925
3,amazon,2805
4,sound quality,2636
5,charger,2605
6,screen,2506
7,quality,2350
8,sound,2317
9,use,2169


In [11]:
import spacy
# Load the spaCy package 'en_vectors_web_lg'; `nlp(text).vector` from this pipeline
# is used below as the embedding of each aspect.
# NOTE(review): this package name looks specific to spaCy 2.x — confirm the pinned version.
nlp = spacy.load('en_vectors_web_lg')

In [12]:
def get_vectors(text):
    """Return the `.vector` of `text` after running it through the global `nlp` pipeline."""
    doc = nlp(text)
    return doc.vector

In [13]:
# Attach one embedding vector per aspect string.
aspects__unique__filtered_df['embedding'] = aspects__unique__filtered_df['aspect'].apply(get_vectors)

In [14]:
from sklearn.cluster import AgglomerativeClustering

In [15]:
# Agglomerative clustering of the aspect embeddings into 25 groups; the resulting
# label of each row is stored in a new 'cluster' column.
hierarchical_cluster = AgglomerativeClustering(n_clusters=25)
hierarchical_cluster.fit(aspects__unique__filtered_df['embedding'].tolist())
aspects__unique__filtered_df['cluster'] = hierarchical_cluster.labels_

In [16]:
aspects__unique__filtered_df

Unnamed: 0,aspect,count,embedding,cluster
0,price,6383,"[-1.1277, 0.4237, 0.31249, -0.3565, 0.26942, -...",16
1,motorola,4246,"[-0.063224, 0.23633, 0.44686, 0.28599, 0.35976...",3
2,battery,3925,"[-0.014736, 0.63635, 0.61387, 0.17026, -0.4460...",13
3,amazon,2805,"[-0.73095, 0.45252, 0.1357, 0.25915, -0.14606,...",24
4,sound quality,2636,"[-0.253425, 0.68264997, -0.23280498, -0.327435...",21
5,charger,2605,"[0.023015, 0.2852, 0.20656, -0.044744, 0.10165...",13
6,screen,2506,"[0.65734, 0.015723, 0.054131, 0.11986, 0.2765,...",14
7,quality,2350,"[-0.61027, 0.73804, -0.25383, -0.37005, -0.108...",21
8,sound,2317,"[0.10342, 0.62726, -0.21178, -0.28482, 0.38483...",10
9,use,2169,"[-0.039594, -0.0633, -0.29835, -0.05685, -0.52...",11


In [17]:
from sklearn.manifold import TSNE

def get_tsne(df: pd.DataFrame, intent_col: str = 'aspect', tooltip_col: str = 'aspect', tsne=None) -> pd.DataFrame:
    """Project the row embeddings of `df` to 2-D and return a plotting-ready frame.

    Args:
        df: frame with an ``embedding`` column (one vector per row), a ``cluster``
            column, and the columns named by ``intent_col`` and ``tooltip_col``.
        intent_col: name of the column copied into the output's ``intent`` column.
        tooltip_col: name of the column copied into the output's ``tooltip`` column.
        tsne: optional reducer exposing ``fit_transform``. When ``None``, a ``TSNE``
            with ``n_components=2``, ``init='random'``, ``random_state=0`` and a
            perplexity of at most 50 is created.

    Returns:
        A DataFrame with the columns ``x``, ``y``, ``intent``, ``tooltip`` and
        ``cluster``, carrying the index of ``df``.
    """
    if tsne is None:
        # t-SNE requires perplexity < n_samples, so cap the default of 50 for small
        # frames instead of failing; frames with more than 50 rows are unaffected.
        perplexity = max(1, min(50, len(df) - 1))
        tsne = TSNE(n_components=2, init='random', random_state=0, perplexity=perplexity)
    tsne_coords = tsne.fit_transform(df.embedding.tolist())
    return pd.DataFrame(dict(
        x=tsne_coords[:, 0],
        y=tsne_coords[:, 1],
        intent=df[intent_col],
        tooltip=df[tooltip_col],
        cluster=df.cluster
    ))

In [18]:
# Project the aspect embeddings to 2-D using get_tsne's default t-SNE settings.
aspects__unique__filtered_df_tsne = get_tsne(aspects__unique__filtered_df)
# Spot-check five random rows; sample() is unseeded here, so the rows differ per run.
aspects__unique__filtered_df_tsne.sample(5)

Unnamed: 0,x,y,intent,tooltip,cluster
505,18.705282,10.254844,john,john,15
131,-20.27183,-16.143173,key,key,2
830,-6.697741,-18.620546,mic quality,mic quality,21
0,18.501896,-9.187131,price,price,16
518,-17.405077,6.95237,usb 2.0,usb 2.0,20


In [19]:
from collections import namedtuple
from itertools import product

import pandas as pd
from plotly.offline import iplot

# A (marker symbol, marker color) pair. The typename matches the name it is bound to,
# so the repr is not misleading and instances can be pickled by name.
SymbolColor = namedtuple('SymbolColor', 'symbol, color')

# RGB colors cycled through when styling clusters.
COLOR_PALETTE = [
    'rgb(253,174,97)',
    'rgb(215,48,39)',
    'rgb(166,217,106)',
    'rgb(26,152,80)',
    'rgb(0,139,139)',
    'rgb(0,191,255)',
    'rgb(0,0,128)',
    'rgb(138,43,226)',
    'rgb(0,0,0)',
]

# Every combination of a symbol code in 0..32 with a palette color. The symbol code
# varies slowest, so consecutive entries share a symbol and step through the palette.
# NOTE(review): 33 is presumably the number of numeric marker symbols used with
# plotly — confirm.
SYMBOL_COLOR = [
    SymbolColor(symbol, color)
    for symbol, color
    in product(range(33), COLOR_PALETTE)
]

def draw_embeddings(df: pd.DataFrame):
    """Render a scatter plot of 2-D points with one trace per cluster.

    Args:
        df: frame with the columns ``x``, ``y``, ``tooltip`` and ``cluster``. The rows
            are grouped by ``cluster``; each group becomes a markers-only trace whose
            symbol and color come from ``SYMBOL_COLOR`` (indexed by the cluster value
            modulo the table length).

    Returns:
        None. The figure (1500x1500, fixed margins) is displayed through ``iplot``.
    """
    layout = {
        'autosize': False,
        'width': 1500,
        'height': 1500,
        'margin': {
            'l': 50,
            'r': 50,
            'b': 100,
            't': 100,
            'pad': 4
        }
    }

    data = []
    for cluster, sub_df in df.groupby(by='cluster'):
        # Resolve the marker style once per cluster; the modulo wraps around when
        # there are more clusters than predefined styles.
        style = SYMBOL_COLOR[cluster % len(SYMBOL_COLOR)]
        data.append({
            'x': sub_df.x,
            'y': sub_df.y,
            'text': sub_df.tooltip,
            'marker': {
                'symbol': style.symbol,
                'color': style.color,
                'size': 15
            },
            'mode': 'markers',
            # Group keys are typically numpy integers; a trace name is a string.
            'name': str(cluster)
        })

    iplot({
        'data': data,
        'layout': layout
    })

In [20]:
draw_embeddings(aspects__unique__filtered_df_tsne)

In [21]:
import qgrid
# Wrap the t-SNE frame in a qgrid widget with its toolbar enabled; the widget is
# displayed by the next cell.
qgrid_widget = qgrid.show_grid(aspects__unique__filtered_df_tsne, show_toolbar=True)

In [22]:
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…