In [182]:
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
import spacy
import os.path

In [183]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [184]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


In [185]:
#Randomly generate five indexes
review_random_idx = np.array(np.random.rand(5)*len(review_df), dtype=np.int32)

In [186]:
#Extract out those sentences 
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [187]:
print(review_random_df['text'])

0    Everyone at Auto Tint Express was super friend...
1    For a waffle house this place is a little bett...
2    I know that I'm in the minority... but I reall...
3    The view is amazing but the food is all crap. ...
4    We have used Happy Endings services twice now....
Name: text, dtype: object


In [188]:
print(review_random_df['text'].loc[0])

Everyone at Auto Tint Express was super friendly, they helped me pick a good shade to go with for my girls 2014 corolla. Everything came out amazing and the waiting room is super nice. Free drinks and TV to watch with a comfortable couch. Definitely check them out for any vehicle you have!


# Method 1 using NLTK

In [189]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [190]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [191]:
#Export the output as the resulting json
review_random_df.to_json(r'output/reviewTagging5_1.json', orient='records', lines=True)

# Method 2 using Spacy

In [192]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [193]:
doc = nlp(review_random_df['text'].to_string())
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [194]:
print(doc)
sentences = list(doc.sents)
print(len(sentences))
# doc.to_json(r'output/reviewTagging5_2_doc.json')

0    Everyone at Auto Tint Express was super friend...
1    For a waffle house this place is a little bett...
2    I know that I'm in the minority... but I reall...
3    The view is amazing but the food is all crap. ...
4    We have used Happy Endings services twice now....
11


In [195]:
#Get each of the line ready for pos tagging
doc_0 = nlp(review_random_df.loc[0]['text'])
doc_1 = nlp(review_random_df.loc[1]['text'])
doc_2 = nlp(review_random_df.loc[2]['text'])
doc_3 = nlp(review_random_df.loc[3]['text'])
doc_4 = nlp(review_random_df.loc[2]['text'])

In [196]:
#checking of the sentence obtained
print(doc_1)
sentence_1 = list(doc_1.sents)
print(len(sentence_1))

For a waffle house this place is a little better than the others,, Staff is exceptional, and the place is always VERY Clean.... Awesome job...
2


In [197]:
for token in doc_0:
    print(token.text,token.pos_)

Everyone PRON
at ADP
Auto PROPN
Tint PROPN
Express PROPN
was AUX
super ADV
friendly ADJ
, PUNCT
they PRON
helped VERB
me PRON
pick VERB
a DET
good ADJ
shade NOUN
to PART
go VERB
with ADP
for ADP
my DET
girls NOUN
2014 NUM
corolla NOUN
. PUNCT
Everything PRON
came VERB
out ADP
amazing ADJ
and CCONJ
the DET
waiting NOUN
room NOUN
is AUX
super ADV
nice ADJ
. PUNCT
Free ADJ
drinks NOUN
and CCONJ
TV NOUN
to PART
watch VERB
with ADP
a DET
comfortable ADJ
couch NOUN
. PUNCT
Definitely ADV
check VERB
them PRON
out ADP
for ADP
any DET
vehicle NOUN
you PRON
have AUX
! PUNCT


In [198]:
for token in doc_1:
    print(token.text,token.pos_)

For ADP
a DET
waffle NOUN
house NOUN
this DET
place NOUN
is AUX
a DET
little ADJ
better ADJ
than SCONJ
the DET
others NOUN
, PUNCT
, PUNCT
Staff PROPN
is AUX
exceptional ADJ
, PUNCT
and CCONJ
the DET
place NOUN
is AUX
always ADV
VERY ADV
Clean ADJ
.... PUNCT
Awesome ADJ
job NOUN
... PUNCT


In [199]:
for token in doc_2:
    print(token.text,token.pos_)

I PRON
know VERB
that SCONJ
I PRON
'm AUX
in ADP
the DET
minority NOUN
... PUNCT
but CCONJ
I PRON
really ADV
do AUX
not PART
like VERB
The DET
Pretzel PROPN
Shop PROPN
. PUNCT
My DET
co NOUN
- NOUN
workers NOUN
often ADV
pick VERB
up ADP
pretzels NOUN
from ADP
here ADV
for ADP
birthdays NOUN
or CCONJ
other ADJ
office NOUN
events NOUN
, PUNCT
and CCONJ
everyone PRON
spends VERB
the DET
morning NOUN
raving VERB
about ADP
how ADV
delicious ADJ
the DET
pretzels NOUN
are AUX
and CCONJ
how ADV
they PRON
can VERB
not PART
stop VERB
eating VERB
them PRON
. PUNCT
Am AUX
I PRON
missing VERB
something PRON
? PUNCT
I PRON
just ADV
sit VERB
quietly ADV
at ADP
my DET
desk NOUN
and CCONJ
raise VERB
an DET
eyebrow NOUN
at ADP
all DET
this DET
chatter NOUN
. PUNCT
I PRON
've AUX
had AUX
them PRON
at ADP
least ADJ
5 NUM
- SYM
8 NUM
times NOUN
and CCONJ
can VERB
not PART
say VERB
that SCONJ
I PRON
've AUX
ever ADV
enjoyed VERB
them PRON
. PUNCT
The DET
regular ADJ
pretzels NOUN
are AUX
either CCONJ
over 

In [200]:
for token in doc_1:
    print(token.text,token.pos_)

For ADP
a DET
waffle NOUN
house NOUN
this DET
place NOUN
is AUX
a DET
little ADJ
better ADJ
than SCONJ
the DET
others NOUN
, PUNCT
, PUNCT
Staff PROPN
is AUX
exceptional ADJ
, PUNCT
and CCONJ
the DET
place NOUN
is AUX
always ADV
VERY ADV
Clean ADJ
.... PUNCT
Awesome ADJ
job NOUN
... PUNCT


In [201]:
for token in doc_3:
    print(token.text,token.pos_)

The DET
view NOUN
is AUX
amazing ADJ
but CCONJ
the DET
food NOUN
is AUX
all DET
crap ADJ
. PUNCT
The DET
wine NOUN
is AUX
over ADV
priced VERB
and CCONJ
the DET
food NOUN
is AUX
nothing PRON
to PART
write VERB
about ADP
. PUNCT
The DET
Server PROPN
did AUX
not PART
even ADV
know VERB
how ADV
to PART
carry VERB
plates NOUN
or CCONJ
serve VERB


In [202]:
for token in doc_4:
    print(token.text,token.pos_)

I PRON
know VERB
that SCONJ
I PRON
'm AUX
in ADP
the DET
minority NOUN
... PUNCT
but CCONJ
I PRON
really ADV
do AUX
not PART
like VERB
The DET
Pretzel PROPN
Shop PROPN
. PUNCT
My DET
co NOUN
- NOUN
workers NOUN
often ADV
pick VERB
up ADP
pretzels NOUN
from ADP
here ADV
for ADP
birthdays NOUN
or CCONJ
other ADJ
office NOUN
events NOUN
, PUNCT
and CCONJ
everyone PRON
spends VERB
the DET
morning NOUN
raving VERB
about ADP
how ADV
delicious ADJ
the DET
pretzels NOUN
are AUX
and CCONJ
how ADV
they PRON
can VERB
not PART
stop VERB
eating VERB
them PRON
. PUNCT
Am AUX
I PRON
missing VERB
something PRON
? PUNCT
I PRON
just ADV
sit VERB
quietly ADV
at ADP
my DET
desk NOUN
and CCONJ
raise VERB
an DET
eyebrow NOUN
at ADP
all DET
this DET
chatter NOUN
. PUNCT
I PRON
've AUX
had AUX
them PRON
at ADP
least ADJ
5 NUM
- SYM
8 NUM
times NOUN
and CCONJ
can VERB
not PART
say VERB
that SCONJ
I PRON
've AUX
ever ADV
enjoyed VERB
them PRON
. PUNCT
The DET
regular ADJ
pretzels NOUN
are AUX
either CCONJ
over 

# Method 3 Using Unigram Tagger trained with Brown Corpus

In [203]:
#Define new functions to tokenize the sentences generated
def tokenizer(sents, num):
    words = dict.fromkeys((i for i in range(num)), [])
    for i in range(num):
        words[i] = word_tokenize(sents[i])
    return words

In [204]:
print(review_random_df['text'])

0    Everyone at Auto Tint Express was super friend...
1    For a waffle house this place is a little bett...
2    I know that I'm in the minority... but I reall...
3    The view is amazing but the food is all crap. ...
4    We have used Happy Endings services twice now....
Name: text, dtype: object


In [205]:
#Tokenize the sentences using the function definte previously
words = tokenizer(review_random_df['text'], 5)

In [206]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [207]:
# Unigram tagger trained using Brown corpus
def tagger_unigram(words, num):
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    ut = nltk.UnigramTagger(brown_tagged_sents)
    #create a new dictionary & key
    pos_2 = dict.fromkeys((i for i in range(num)), [])
    #use i to iterate through the sentence index first, then use j to iterate through the words in sentences
    for i,j in words.items():
        pos_2[i] = ut.tag(j)
    return pos_2

In [208]:
#Call unigram
unigram_pos_tags = tagger_unigram(words, 5)

In [209]:
print("\nResults of unigram tagger:")
print(unigram_pos_tags[0])


Results of unigram tagger:
[('Everyone', None), ('at', 'IN'), ('Auto', None), ('Tint', None), ('Express', None), ('was', 'BEDZ'), ('super', 'JJ'), ('friendly', 'JJ'), (',', ','), ('they', 'PPSS'), ('helped', 'VBN'), ('me', 'PPO'), ('pick', 'VB'), ('a', 'AT'), ('good', 'JJ'), ('shade', 'NN'), ('to', 'TO'), ('go', 'VB'), ('with', 'IN'), ('for', 'IN'), ('my', 'PP$'), ('girls', 'NNS'), ('2014', None), ('corolla', None), ('.', '.'), ('Everything', 'PN'), ('came', 'VBD'), ('out', 'RP'), ('amazing', 'JJ'), ('and', 'CC'), ('the', 'AT'), ('waiting', 'VBG'), ('room', 'NN'), ('is', 'BEZ'), ('super', 'JJ'), ('nice', 'JJ'), ('.', '.'), ('Free', None), ('drinks', None), ('and', 'CC'), ('TV', 'NN'), ('to', 'TO'), ('watch', 'VB'), ('with', 'IN'), ('a', 'AT'), ('comfortable', 'JJ'), ('couch', 'NN'), ('.', '.'), ('Definitely', None), ('check', 'NN'), ('them', 'PPO'), ('out', 'RP'), ('for', 'IN'), ('any', 'DTI'), ('vehicle', 'NN'), ('you', 'PPSS'), ('have', 'HV'), ('!', '.')]
