# Explore Data
**Author:** Jane Hung  
**Date:** 1 Mar 2020  
**Citations:**  
@inproceedings{xu_bert2019,
    title = "BERT Post-Training for Review Reading Comprehension and Aspect-based Sentiment Analysis",
    author = "Xu, Hu and Liu, Bing and Shu, Lei and Yu, Philip S.",
    booktitle = "Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics",
    year = "2019",
}  
https://drive.google.com/file/d/1NGH5bqzEx6aDlYJ7O3hepZF4i_p4iMR8/view

## Initialize environment

In [2]:
import pandas as pd
import numpy as np
import os
import sys
import json
import pprint
import tensorflow as tf
from time import time
import io
import re

import nltk

import pickle
from csv import reader

import matplotlib as mpl
if os.environ.get('DISPLAY','') == '':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from tensorflow.keras import layers
from tensorflow.keras.backend import sparse_categorical_crossentropy
from tensorflow.keras.layers import Dense, Flatten

from datetime import datetime

from transformers import BertTokenizer, TFBertModel

from sklearn.metrics import log_loss

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

no display found. Using non-interactive Agg backend


## Helper functions

In [3]:
def read_json(filename):
    f = open(filename,'r')
    data = json.loads(f.read())
    print('\n',filename)
    pprint.pprint(dict(list(data.items())[:1]))
    return(data)

## Import data

### Training Data

In [4]:
ae_laptop_train = read_json('../data/hu-data/ae/laptop/train.json')
ae_rest_train = read_json('../data/hu-data/ae/rest/train.json')


asc_laptop_train = read_json('../data/hu-data/asc/laptop/train.json')
asc_rest_train = read_json('../data/hu-data/asc/rest/train.json')


 ../data/hu-data/ae/laptop/train.json
{'0': {'label': ['B',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'B',
                 'I',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O'],
       'sentence': ['Keyboard',
                    'is',
                    'great',
                    'but',
                    'primary',
                    'and',
                    'secondary',
                    'control',
                    'buttons',
                    'could',
                    'be',
                    'more',
                    'durable',
                    '.']}}

 ../data/hu-data/ae/rest/train.json
{'0': {'label': ['O', 'O', 'O', 'B'],
       'sentence': ['I', 'LOVE', 'their', 'Thai']}}

 ../data/hu-data/asc/laptop/train.json
{'327_0': {'id': '327_0',
           'polarity': 'positive',
           'sent

### Dev data

In [5]:
ae_laptop_dev  = read_json('../data/hu-data/ae/laptop/dev.json')
ae_rest_dev = read_json('../data/hu-data/ae/rest/dev.json')


asc_laptop_dev = read_json('../data/hu-data/asc/laptop/dev.json')
asc_rest_dev = read_json('../data/hu-data/asc/rest/dev.json')


 ../data/hu-data/ae/laptop/dev.json
{'0': {'label': ['O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O',
                 'O'],
       'sentence': ['I',
                    'have',
                    'had',
                    'this',
                    'laptop',
                    'for',
                    'a',
                    'few',
                    'months',
                    'now',
                    'and',
                    'i',
                    'would',
                    'say',
                    'im',
                    'pretty',
                    'satisfied',
                    '.']}}

 ../data/hu-data/ae/rest/dev.json
{'0': {'label': ['O',
           

Q: How do we get from the ASC data back to the AE data?

In [6]:
asc_laptop_train['327_0']
ae_laptop_train['400']

{'polarity': 'positive',
 'term': 'use',
 'id': '327_0',
 'sentence': 'Also it is very good for college students who just need a reliable, easy to use computer.'}

{'label': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'sentence': ['If',
  'you',
  'could',
  'stretch',
  'by',
  'a',
  'few',
  '100',
  'dollars',
  'I',
  'highly',
  'recommend',
  'you',
  'should',
  'replace',
  'your',
  'Windows',
  'laptop',
  'with',
  'this',
  'one',
  '.']}

## Play with BERT

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [8]:
tokenizer.tokenize(asc_laptop_train['327_0']['sentence'])

['Also',
 'it',
 'is',
 'very',
 'good',
 'for',
 'college',
 'students',
 'who',
 'just',
 'need',
 'a',
 'reliable',
 ',',
 'easy',
 'to',
 'use',
 'computer',
 '.']

## Play with AE baseline 

In [9]:
# tag with universal POS. Especially for nouns
nltk.pos_tag(ae_laptop_train['0']['sentence'],tagset='universal')

[('Keyboard', 'NOUN'),
 ('is', 'VERB'),
 ('great', 'ADJ'),
 ('but', 'CONJ'),
 ('primary', 'ADJ'),
 ('and', 'CONJ'),
 ('secondary', 'ADJ'),
 ('control', 'NOUN'),
 ('buttons', 'NOUN'),
 ('could', 'VERB'),
 ('be', 'VERB'),
 ('more', 'ADV'),
 ('durable', 'ADJ'),
 ('.', '.')]

In [10]:
ae_laptop_dev_df = pd.DataFrame.from_dict(ae_laptop_dev,orient='index')
ae_laptop_dev_df

Unnamed: 0,label,sentence
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[I, have, had, this, laptop, for, a, few, mont..."
1,"[O, O, O, O, B, I, O, O, O, O, O, O, B, O, O, ...","[Additional, caveat, :, the, base, installatio..."
2,"[O, O, O, O, B, O, O, O, O, B, O, O, O, O, O, ...","[it, is, of, high, quality, ,, has, a, killer,..."
3,"[O, B, O, O, O, O, O, O, O, O, O, O, O, O]","[The, screen, gets, smeary, and, dusty, very, ..."
4,"[O, O, O, O, O, O, O, O, O, O, O]","[I, previously, owned, an, HP, desktop, and, a..."
...,...,...
145,"[O, O, O, O, O]","[The, benefits, were, immediate, !]"
146,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[All-, in-, all, ,, I, would, definitely, reco..."
147,"[O, O, O, O, O]","[just, chill, and, enjoy, .]"
148,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[My, son, and, his, family, have, a, hard, tim..."


In [11]:
def pos_ae(tokenized_sentence):
    """
    Tag sentences using POS tagger
    """
    pos_sent = tokenized_sentence.apply(lambda sent:nltk.pos_tag(sent,tagset='universal'))
    
    
    # tag with BIO terminology
    ae_tag = lambda sent:['O' if token[1] != 'NOUN' 
                          else 'B' if ((token[1]=='NOUN') & (sent[ind-1][1]!='NOUN')) 
                          else 'I' for ind,token in enumerate(sent)]

    return(pos_sent.apply(ae_tag))

# since the POS tagger is based on the words themselves and not context.
ae_laptop_dev_df['predictions'] = pos_ae(ae_laptop_dev_df['sentence'])
ae_laptop_dev_df.head()

def convert_int(tagged_tokens):
    """
    Convert B,I,O tags to integers
    """
    return(tagged_tokens.apply(lambda sent: [0 if token=='O' else 1 if token=='B' else 2 for token in sent]))

convert_int(ae_laptop_dev_df['predictions'])


Unnamed: 0,label,sentence,predictions
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[I, have, had, this, laptop, for, a, few, mont...","[O, O, O, O, B, O, O, O, B, O, O, O, O, O, O, ..."
1,"[O, O, O, O, B, I, O, O, O, O, O, O, B, O, O, ...","[Additional, caveat, :, the, base, installatio...","[O, B, O, O, B, I, O, O, O, B, O, O, B, O, O, ..."
2,"[O, O, O, O, B, O, O, O, O, B, O, O, O, O, O, ...","[it, is, of, high, quality, ,, has, a, killer,...","[O, O, O, O, B, O, O, O, B, I, O, O, O, O, O, ..."
3,"[O, B, O, O, O, O, O, O, O, O, O, O, O, O]","[The, screen, gets, smeary, and, dusty, very, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[O, O, O, O, O, O, O, O, O, O, O]","[I, previously, owned, an, HP, desktop, and, a...","[O, O, O, O, B, I, O, O, B, I, O]"


0      [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
1      [0, 1, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...
2      [0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ...
3             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4                      [0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0]
                             ...                        
145                                      [0, 1, 0, 0, 0]
146           [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]
147                                      [0, 1, 0, 1, 0]
148    [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...
149    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: predictions, Length: 150, dtype: object

## Explore AE classifier-based NP Chunker

In [12]:
# TODO try a more sophisticated method for chunking

In [74]:
ae_laptop_train['15']

sentence = nltk.pos_tag(ae_laptop_train['15']['sentence'])
sentence

def regex_parser(tokenized_sentence):
    grammar = r"""
      NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
          {<NNP>+}                # chunk sequences of proper nouns
    """
    cp = nltk.RegexpParser(grammar)

    result = cp.parse(tokenized_sentence)
    return(result)
tree = regex_parser(sentence)

{'label': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 'sentence': ['Toshiba',
  'is',
  'aware',
  'of',
  'the',
  'issue',
  'but',
  'unless',
  'the',
  'extended',
  'warrenty',
  'is',
  'bought',
  'Toshiba',
  'will',
  'do',
  'nothing',
  'about',
  'it',
  '.']}

[('Toshiba', 'NNP'),
 ('is', 'VBZ'),
 ('aware', 'JJ'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('issue', 'NN'),
 ('but', 'CC'),
 ('unless', 'IN'),
 ('the', 'DT'),
 ('extended', 'JJ'),
 ('warrenty', 'NN'),
 ('is', 'VBZ'),
 ('bought', 'VBN'),
 ('Toshiba', 'NNP'),
 ('will', 'MD'),
 ('do', 'VB'),
 ('nothing', 'NN'),
 ('about', 'IN'),
 ('it', 'PRP'),
 ('.', '.')]

In [75]:
print(tree)
[(ind, node) for ind, node in enumerate(tree)]
[(subtree[0],'B') if 'NP' in subtree.label() else (subtree[0],'O')for subtree in tree.subtrees() ]

(S
  (NP Toshiba/NNP)
  is/VBZ
  aware/JJ
  of/IN
  (NP the/DT issue/NN)
  but/CC
  unless/IN
  (NP the/DT extended/JJ warrenty/NN)
  is/VBZ
  bought/VBN
  (NP Toshiba/NNP)
  will/MD
  do/VB
  (NP nothing/NN)
  about/IN
  it/PRP
  ./.)


[(0, Tree('NP', [('Toshiba', 'NNP')])),
 (1, ('is', 'VBZ')),
 (2, ('aware', 'JJ')),
 (3, ('of', 'IN')),
 (4, Tree('NP', [('the', 'DT'), ('issue', 'NN')])),
 (5, ('but', 'CC')),
 (6, ('unless', 'IN')),
 (7, Tree('NP', [('the', 'DT'), ('extended', 'JJ'), ('warrenty', 'NN')])),
 (8, ('is', 'VBZ')),
 (9, ('bought', 'VBN')),
 (10, Tree('NP', [('Toshiba', 'NNP')])),
 (11, ('will', 'MD')),
 (12, ('do', 'VB')),
 (13, Tree('NP', [('nothing', 'NN')])),
 (14, ('about', 'IN')),
 (15, ('it', 'PRP')),
 (16, ('.', '.'))]

[(Tree('NP', [('Toshiba', 'NNP')]), 'O'),
 (('Toshiba', 'NNP'), 'B'),
 (('the', 'DT'), 'B'),
 (('the', 'DT'), 'B'),
 (('Toshiba', 'NNP'), 'B'),
 (('nothing', 'NN'), 'B')]

In [76]:
from nltk.corpus import conll2000
conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(conll2000.chunked_sents('test.txt', chunk_types=['NP'])))
print([tree])
print(cp.evaluate([tree]))
nltk.chunk.util.tree2conllstr(tree)
[el[2][0] for el in nltk.chunk.util.tree2conlltags(tree)]

[Tree('S', [Tree('NP', [('Rockwell', 'NNP'), ('International', 'NNP'), ('Corp.', 'NNP')]), Tree('NP', [("'s", 'POS'), ('Tulsa', 'NNP'), ('unit', 'NN')]), ('said', 'VBD'), Tree('NP', [('it', 'PRP')]), ('signed', 'VBD'), Tree('NP', [('a', 'DT'), ('tentative', 'JJ'), ('agreement', 'NN')]), ('extending', 'VBG'), Tree('NP', [('its', 'PRP$'), ('contract', 'NN')]), ('with', 'IN'), Tree('NP', [('Boeing', 'NNP'), ('Co.', 'NNP')]), ('to', 'TO'), ('provide', 'VB'), Tree('NP', [('structural', 'JJ'), ('parts', 'NNS')]), ('for', 'IN'), Tree('NP', [('Boeing', 'NNP')]), Tree('NP', [("'s", 'POS'), ('747', 'CD'), ('jetliners', 'NNS')]), ('.', '.')]), Tree('S', [Tree('NP', [('Rockwell', 'NNP')]), ('said', 'VBD'), Tree('NP', [('the', 'DT'), ('agreement', 'NN')]), ('calls', 'VBZ'), ('for', 'IN'), Tree('NP', [('it', 'PRP')]), ('to', 'TO'), ('supply', 'VB'), Tree('NP', [('200', 'CD'), ('additional', 'JJ'), ('so-called', 'JJ'), ('shipsets', 'NNS')]), ('for', 'IN'), Tree('NP', [('the', 'DT'), ('planes', 'NNS')

ChunkParse score:
    IOB Accuracy:  67.7%%
    Precision:     48.1%%
    Recall:        37.4%%
    F-Measure:     42.1%%
[Tree('S', [Tree('NP', [('Toshiba', 'NNP')]), ('is', 'VBZ'), ('aware', 'JJ'), ('of', 'IN'), Tree('NP', [('the', 'DT'), ('issue', 'NN')]), ('but', 'CC'), ('unless', 'IN'), Tree('NP', [('the', 'DT'), ('extended', 'JJ'), ('warrenty', 'NN')]), ('is', 'VBZ'), ('bought', 'VBN'), Tree('NP', [('Toshiba', 'NNP')]), ('will', 'MD'), ('do', 'VB'), Tree('NP', [('nothing', 'NN')]), ('about', 'IN'), ('it', 'PRP'), ('.', '.')])]
ChunkParse score:
    IOB Accuracy: 100.0%%
    Precision:    100.0%%
    Recall:       100.0%%
    F-Measure:    100.0%%


'Toshiba NNP B-NP\nis VBZ O\naware JJ O\nof IN O\nthe DT B-NP\nissue NN I-NP\nbut CC O\nunless IN O\nthe DT B-NP\nextended JJ I-NP\nwarrenty NN I-NP\nis VBZ O\nbought VBN O\nToshiba NNP B-NP\nwill MD O\ndo VB O\nnothing NN B-NP\nabout IN O\nit PRP O\n. . O'

['B',
 'O',
 'O',
 'O',
 'B',
 'I',
 'O',
 'O',
 'B',
 'I',
 'I',
 'O',
 'O',
 'B',
 'O',
 'O',
 'B',
 'O',
 'O',
 'O']

## Explore AE evaluation - CE

In [14]:
# only using 0,1 because there aren't many very large token phrases
log_loss(convert_int(pd.DataFrame(ae_laptop_dev_df.iloc[0]['label'])),convert_int(pd.DataFrame(ae_laptop_dev_df.iloc[0]['predictions'])),labels=[0,1])

3.837730665815654

## Explore AE evaluation - SemEval14

In [15]:
# TODO need to explore how we want to move forward with all sentences rather than just 1.
# Should try to implement the SemEval14 evaluation criteria bc this is best practice

## Play with ASC baseline

In [16]:
asc_laptop_dev_df = pd.DataFrame.from_dict(asc_laptop_dev,orient='index')
asc_laptop_dev_df.head()

Unnamed: 0,polarity,term,id,sentence
1113_0,negative,safe mode,1113_0,Not even safe mode boots.
2595_0,positive,Keyboard,2595_0,Keyboard was also very nice and had a solid feel.
1039_0,negative,Keyboard,1039_0,Keyboard is plastic and spongey feeling.
315_0,positive,quality,315_0,I would recommend this laptop to anyone lookin...
1284_0,negative,screen,1284_0,"Thus, when you carry it at a slanted angle, th..."


In [17]:
analyzer = SentimentIntensityAnalyzer()
pos_neg_tag_lst = []
for ind,sentence in enumerate(asc_laptop_dev_df.sentence):
    vs = analyzer.polarity_scores(sentence)
    pos_neg_tag = 'negative' if vs['compound'] <= -0.05 else 'positive' if vs['compound'] >= 0.05 else 'neutral' 
    if ind <10: print("{:-<65} {} ({})".format(sentence, str(vs['compound']),pos_neg_tag))
    pos_neg_tag_lst.append(pos_neg_tag)
asc_laptop_dev_df['predictions'] = pos_neg_tag_lst

Not even safe mode boots.---------------------------------------- -0.3412 (negative)
Keyboard was also very nice and had a solid feel.---------------- 0.5709 (positive)
Keyboard is plastic and spongey feeling.------------------------- 0.128 (positive)
I would recommend this laptop to anyone looking to get a new laptop who is willing to spend a little more money to get great quality! 0.784 (positive)
Thus, when you carry it at a slanted angle, the screen will "topple" or "slide" down, if you understand what I mean. 0.0 (neutral)
When I called Sony the Customer Service was Great.--------------- 0.6249 (positive)
I also did not like the loud noises it made or how the bottom of the computer would get really hot. -0.2755 (negative)
I also did not like the loud noises it made or how the bottom of the computer would get really hot. -0.2755 (negative)
Also, one of the users mentioned how the edges on the macbook is sharp, if you have money to spend on one of the incase shells, it doesn't seem 

## Explore ASC evaluation - accuracy

In [18]:
asc_laptop_dev_df.head()
(asc_laptop_dev_df.polarity == asc_laptop_dev_df.predictions).value_counts(normalize=True)

Unnamed: 0,polarity,term,id,sentence,predictions
1113_0,negative,safe mode,1113_0,Not even safe mode boots.,negative
2595_0,positive,Keyboard,2595_0,Keyboard was also very nice and had a solid feel.,positive
1039_0,negative,Keyboard,1039_0,Keyboard is plastic and spongey feeling.,positive
315_0,positive,quality,315_0,I would recommend this laptop to anyone lookin...,positive
1284_0,negative,screen,1284_0,"Thus, when you carry it at a slanted angle, th...",neutral


True     0.613333
False    0.386667
dtype: float64

In [19]:
log_loss(asc_laptop_dev_df.polarity,asc_laptop_dev_df.predictions)

ValueError: could not convert string to float: 'negative'