In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
import spacy
import os.path

In [2]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [3]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


In [4]:
#Randomly generate five indexes
review_random_idx = np.array(np.random.rand(5)*len(review_df), dtype=np.int32)

In [5]:
#Extract out those sentences 
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [6]:
print(review_random_df['text'])

0    This place IS the quintessential hole in the w...
1    Better than expected.   Less wait time than th...
2    First let me say I will never come back to thi...
3    Kandahar kabab and Chicken breast kabab are my...
4    Perk-Cup is definitely a convenient spot to dr...
Name: text, dtype: object


In [7]:
print(review_random_df['text'].loc[0])

This place IS the quintessential hole in the wall restaurant that you always heard about but never been to before. Amazingly enough it is located right off of Central Avenue which is a very busy road but because of the exterior of you would drive right past it and think nothing of it.

 DON'T DO THAT AGAIN!!! Stop HERE and eat! 

Cheese steak Sub JUST DO IT

This place has a chicken parmesan pasta that you could literally eat with a fork if you were 105 years the chicken breast is so tender and succulent. 
One of the best pizzas I've had aside from an actual slice from NY.


# Method 1 using NLTK

In [8]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [10]:
#Export the output as the resulting json
review_random_df.to_json(r'output/reviewTagging5_1.json', orient='records', lines=True)

# Method 2 using Spacy

In [11]:
import en_core_web_sm

nlp = en_core_web_sm.load()

In [12]:
doc = nlp(review_random_df['text'].to_string())
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [13]:
print(doc)
sentences = list(doc.sents)
print(len(sentences))
# doc.to_json(r'output/reviewTagging5_2_doc.json')

0    This place IS the quintessential hole in the w...
1    Better than expected.   Less wait time than th...
2    First let me say I will never come back to thi...
3    Kandahar kabab and Chicken breast kabab are my...
4    Perk-Cup is definitely a convenient spot to dr...
11


In [14]:
#Get each of the line ready for pos tagging
doc_0 = nlp(review_random_df.loc[0]['text'])
doc_1 = nlp(review_random_df.loc[1]['text'])
doc_2 = nlp(review_random_df.loc[2]['text'])
doc_3 = nlp(review_random_df.loc[3]['text'])
doc_4 = nlp(review_random_df.loc[2]['text'])

In [15]:
#checking of the sentence obtained
print(doc_1)
sentence_1 = list(doc_1.sents)
print(len(sentence_1))

Better than expected.   Less wait time than the Val Vista/ Williams Field location.  Thorough cleaning for the basic
3


In [16]:
result_list = [doc_0,doc_1,doc_2,doc_3,doc_4]
counter = 0
for element in result_list:
    counter += 1
    final_result = []
    for token in element:
        result = []
        result.append(token.text)
        result.append(token.pos_)
        final_result.append(result)
    print("===================")
    print("POS Tag for sentence", str(counter))
    print(final_result)

POS Tag for sentence 1
[['This', 'DET'], ['place', 'NOUN'], ['IS', 'VERB'], ['the', 'DET'], ['quintessential', 'ADJ'], ['hole', 'NOUN'], ['in', 'ADP'], ['the', 'DET'], ['wall', 'PROPN'], ['restaurant', 'NOUN'], ['that', 'DET'], ['you', 'PRON'], ['always', 'ADV'], ['heard', 'VERB'], ['about', 'ADP'], ['but', 'CCONJ'], ['never', 'ADV'], ['been', 'AUX'], ['to', 'ADP'], ['before', 'ADV'], ['.', 'PUNCT'], ['Amazingly', 'ADV'], ['enough', 'ADV'], ['it', 'PRON'], ['is', 'AUX'], ['located', 'VERB'], ['right', 'ADV'], ['off', 'ADP'], ['of', 'ADP'], ['Central', 'PROPN'], ['Avenue', 'PROPN'], ['which', 'DET'], ['is', 'AUX'], ['a', 'DET'], ['very', 'ADV'], ['busy', 'ADJ'], ['road', 'NOUN'], ['but', 'CCONJ'], ['because', 'SCONJ'], ['of', 'ADP'], ['the', 'DET'], ['exterior', 'NOUN'], ['of', 'ADP'], ['you', 'PRON'], ['would', 'VERB'], ['drive', 'VERB'], ['right', 'ADV'], ['past', 'ADP'], ['it', 'PRON'], ['and', 'CCONJ'], ['think', 'VERB'], ['nothing', 'PRON'], ['of', 'ADP'], ['it', 'PRON'], ['.', 'PU

# Method 3 Using Unigram Tagger trained with Brown Corpus

In [17]:
#Define new functions to tokenize the sentences generated
def tokenizer(sents, num):
    words = dict.fromkeys((i for i in range(num)), [])
    for i in range(num):
        words[i] = word_tokenize(sents[i])
    return words

In [18]:
print(review_random_df['text'])

0    This place IS the quintessential hole in the w...
1    Better than expected.   Less wait time than th...
2    First let me say I will never come back to thi...
3    Kandahar kabab and Chicken breast kabab are my...
4    Perk-Cup is definitely a convenient spot to dr...
Name: text, dtype: object


In [19]:
#Tokenize the sentences using the function definte previously
words = tokenizer(review_random_df['text'], 5)

In [20]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [21]:
# Unigram tagger trained using Brown corpus
def tagger_unigram(words, num):
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    ut = nltk.UnigramTagger(brown_tagged_sents)
    #create a new dictionary & key
    pos_2 = dict.fromkeys((i for i in range(num)), [])
    #use i to iterate through the sentence index first, then use j to iterate through the words in sentences
    for i,j in words.items():
        pos_2[i] = ut.tag(j)
    return pos_2

In [22]:
#Call unigram
unigram_pos_tags = tagger_unigram(words, 5)

In [23]:
print("\nResults of unigram tagger:")
print(unigram_pos_tags[0])


Results of unigram tagger:
[('This', 'DT'), ('place', 'NN'), ('IS', None), ('the', 'AT'), ('quintessential', None), ('hole', 'NN'), ('in', 'IN'), ('the', 'AT'), ('wall', 'NN'), ('restaurant', 'NN'), ('that', 'CS'), ('you', 'PPSS'), ('always', 'RB'), ('heard', 'VBN'), ('about', 'IN'), ('but', 'CC'), ('never', 'RB'), ('been', 'BEN'), ('to', 'TO'), ('before', 'IN'), ('.', '.'), ('Amazingly', None), ('enough', 'AP'), ('it', 'PPS'), ('is', 'BEZ'), ('located', 'VBN'), ('right', 'JJ'), ('off', 'RP'), ('of', 'IN'), ('Central', 'JJ-TL'), ('Avenue', 'NN-TL'), ('which', 'WDT'), ('is', 'BEZ'), ('a', 'AT'), ('very', 'QL'), ('busy', 'JJ'), ('road', 'NN'), ('but', 'CC'), ('because', 'CS'), ('of', 'IN'), ('the', 'AT'), ('exterior', None), ('of', 'IN'), ('you', 'PPSS'), ('would', 'MD'), ('drive', 'NN'), ('right', 'JJ'), ('past', 'NN'), ('it', 'PPS'), ('and', 'CC'), ('think', 'VB'), ('nothing', 'PN'), ('of', 'IN'), ('it', 'PPS'), ('.', '.'), ('DO', None), ("N'T", None), ('DO', None), ('THAT', None), ('