In [24]:
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
import spacy
import os.path

In [25]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [26]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


In [27]:
#Randomly generate five indexes
review_random_idx = np.array(np.random.rand(5)*len(review_df), dtype=np.int32)

In [28]:
#Extract out those sentences 
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [29]:
print(review_random_df['text'])

0    Update: Seriously disappointed that my gel pol...
1    Five stars hands down!\n\nI am a proud Canadia...
2    I stopped into the Wok Box Fresh Asian Kitchen...
3    The best pretzels in town!\n\nWhen any of my c...
4    Just went there for the first time. I got the ...
Name: text, dtype: object


In [30]:
print(review_random_df['text'].loc[0])

Update: Seriously disappointed that my gel polish chipped and peeled after ONLY two weeks. I've never had gels last such a short period of time. All the gel manis I've gotten have lasted 3-4 weeks, usually leaning on the 4 week end. Not sure if it's their application process or the quality of their products, but sadly I won't be returning now with gels only lasting 2 weeks.


# Method 1 using NLTK

In [31]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [32]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [33]:
#Export the output as the resulting json
review_random_df.to_json(r'output/reviewTagging5_1.json', orient='records', lines=True)

# Method 2 using Spacy

In [34]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [35]:
doc = nlp(review_random_df['text'].to_string())
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [36]:
print(doc)
sentences = list(doc.sents)
#print(len(sentences))
# doc.to_json(r'output/reviewTagging5_2_doc.json')

0    Update: Seriously disappointed that my gel pol...
1    Five stars hands down!\n\nI am a proud Canadia...
2    I stopped into the Wok Box Fresh Asian Kitchen...
3    The best pretzels in town!\n\nWhen any of my c...
4    Just went there for the first time. I got the ...


In [37]:
#Get each of the line ready for pos tagging
doc_0 = nlp(review_random_df.loc[0]['text'])
doc_1 = nlp(review_random_df.loc[1]['text'])
doc_2 = nlp(review_random_df.loc[2]['text'])
doc_3 = nlp(review_random_df.loc[3]['text'])
doc_4 = nlp(review_random_df.loc[4]['text'])

In [38]:
#checking of the sentence obtained
print(doc_1)
sentence_1 = list(doc_1.sents)
print(len(sentence_1))

Five stars hands down!

I am a proud Canadian and have had my fair share of poutines, but last night I felt like my taste buds had finally been awakened.

Word on the street is they are getting a food truck! Look out Food Truck Frenzy, things are about to get cheesy!
5


In [39]:
result_list = [doc_0,doc_1,doc_2,doc_3,doc_4]
counter = 0
for element in result_list:
    counter += 1
    final_result = []
    for token in element:
        result = []
        result.append(token.text)
        result.append(token.pos_)
        final_result.append(result)
    print("===================")
    print("POS Tag for sentence", str(counter))
    print(final_result)

POS Tag for sentence 1
[['Update', 'NOUN'], [':', 'PUNCT'], ['Seriously', 'ADV'], ['disappointed', 'VERB'], ['that', 'SCONJ'], ['my', 'PRON'], ['gel', 'NOUN'], ['polish', 'NOUN'], ['chipped', 'VERB'], ['and', 'CCONJ'], ['peeled', 'VERB'], ['after', 'ADP'], ['ONLY', 'ADV'], ['two', 'NUM'], ['weeks', 'NOUN'], ['.', 'PUNCT'], ['I', 'PRON'], ["'ve", 'AUX'], ['never', 'ADV'], ['had', 'VERB'], ['gels', 'NOUN'], ['last', 'VERB'], ['such', 'DET'], ['a', 'DET'], ['short', 'ADJ'], ['period', 'NOUN'], ['of', 'ADP'], ['time', 'NOUN'], ['.', 'PUNCT'], ['All', 'DET'], ['the', 'DET'], ['gel', 'NOUN'], ['manis', 'NOUN'], ['I', 'PRON'], ["'ve", 'AUX'], ['gotten', 'VERB'], ['have', 'AUX'], ['lasted', 'VERB'], ['3', 'NUM'], ['-', 'SYM'], ['4', 'NUM'], ['weeks', 'NOUN'], [',', 'PUNCT'], ['usually', 'ADV'], ['leaning', 'VERB'], ['on', 'ADP'], ['the', 'DET'], ['4', 'NUM'], ['week', 'NOUN'], ['end', 'NOUN'], ['.', 'PUNCT'], ['Not', 'PART'], ['sure', 'ADJ'], ['if', 'SCONJ'], ['it', 'PRON'], ["'s", 'AUX'], ['t

# Method 3 Using Unigram Tagger trained with Brown Corpus

In [40]:
#Define new functions to tokenize the sentences generated
def tokenizer(sents, num):
    words = dict.fromkeys((i for i in range(num)), [])
    for i in range(num):
        words[i] = word_tokenize(sents[i])
    return words

In [41]:
print(review_random_df['text'])

0    Update: Seriously disappointed that my gel pol...
1    Five stars hands down!\n\nI am a proud Canadia...
2    I stopped into the Wok Box Fresh Asian Kitchen...
3    The best pretzels in town!\n\nWhen any of my c...
4    Just went there for the first time. I got the ...
Name: text, dtype: object


In [42]:
#Tokenize the sentences using the function definte previously
words = tokenizer(review_random_df['text'], 5)

In [43]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [44]:
# Unigram tagger trained with Brown corpus dataset
def tagger_unigram(words, num):
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    #create a set of dictionary & key for words and taggs
    pos_2 = dict.fromkeys((i for i in range(num)), [])
    #use i to iterate through the sentence index first, then use j to iterate through the words in sentences
    for sentence,word in words.items():
        pos_2[sentence] = unigram_tagger.tag(word)
    return pos_2

In [45]:
#Call unigram
unigram_pos_tags = tagger_unigram(words, 5)

In [46]:
print("\nResults of unigram tagger:")
print(unigram_pos_tags[0])


Results of unigram tagger:
[('Update', None), (':', ':'), ('Seriously', None), ('disappointed', None), ('that', 'CS'), ('my', 'PP$'), ('gel', None), ('polish', None), ('chipped', 'VBD'), ('and', 'CC'), ('peeled', None), ('after', 'IN'), ('ONLY', None), ('two', 'CD'), ('weeks', 'NNS'), ('.', '.'), ('I', 'PPSS'), ("'ve", None), ('never', 'RB'), ('had', 'HVD'), ('gels', None), ('last', 'AP'), ('such', 'JJ'), ('a', 'AT'), ('short', 'JJ'), ('period', 'NN'), ('of', 'IN'), ('time', 'NN'), ('.', '.'), ('All', 'ABN'), ('the', 'AT'), ('gel', None), ('manis', None), ('I', 'PPSS'), ("'ve", None), ('gotten', 'VBN'), ('have', 'HV'), ('lasted', 'VBD'), ('3-4', None), ('weeks', 'NNS'), (',', ','), ('usually', 'RB'), ('leaning', None), ('on', 'IN'), ('the', 'AT'), ('4', 'CD'), ('week', 'NN'), ('end', 'NN'), ('.', '.'), ('Not', '*'), ('sure', 'JJ'), ('if', 'CS'), ('it', 'PPS'), ("'s", None), ('their', 'PP$'), ('application', 'NN'), ('process', 'NN'), ('or', 'CC'), ('the', 'AT'), ('quality', 'NN'), ('of