In [54]:
import codecs
import itertools as it
import os

import gensim
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy #https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence
from nltk import wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
#refer to https://www.kaggle.com/residentmario/sentiment-analysis-and-collocation-of-reviews
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize

In [2]:
# Load the raw reviews from boston/reviews.csv and preview the first rows.
df_reviews = pd.read_csv('boston/reviews.csv')
df_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [3]:
# The reviews are written in several languages;
# we are only interested in the English ones.
def get_language_likelihood(input_text):
    """Score every NLTK stopword language against ``input_text``.

    The text is lower-cased and tokenized with ``wordpunct_tokenize``; a
    language's score is the number of distinct tokens that also appear in
    that language's stopword list.

    Parameters
    ----------
    input_text : str
        Text whose language should be guessed.

    Returns
    -------
    dict
        Maps each language name from ``stopwords.fileids()`` to its
        stopword-overlap count (0 when nothing matches).
    """
    # Build the token set once; it is the same for every language.
    unique_tokens = set(wordpunct_tokenize(input_text.lower()))

    # fileids() is the public accessor for the corpus' file list, used here
    # instead of reaching into the private _fileids attribute.
    return {
        language: len(unique_tokens.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }
 
def get_language(input_text):
    """Return the most likely language of the given text.

    Parameters
    ----------
    input_text : str
        Text to classify.

    Returns
    -------
    str
        The language with the highest score from
        ``get_language_likelihood``. On a tie the language that comes first
        in that dictionary wins, so a text that matches no stopwords at all
        is labelled with the first language in the dictionary.
    """
    likelihoods = get_language_likelihood(input_text)
    # max() returns the first key with the highest score, which is the same
    # element a stable descending sort would put at position 0, without
    # sorting the whole dictionary.
    return max(likelihoods, key=likelihoods.get)

In [4]:
# Drop the rows that contain missing values and work on an independent copy,
# so adding columns to it later does not touch df_reviews.
df_reviews_no_na = df_reviews.dropna().copy()

In [5]:
# Add a 'language' column holding get_language's guess for each comment.
df_reviews_no_na['language'] = df_reviews_no_na['comments'].apply(get_language)

In [6]:
# Language detection takes a long time to run, so save the result to disk.
df_reviews_no_na.to_csv('boston/reviews_no_na_add_language_type.csv',index=False)

In [7]:
# Reload the saved reviews (with their 'language' column) from disk.
df_reviews_lang = pd.read_csv('boston/reviews_no_na_add_language_type.csv')

In [8]:
# Count how many reviews were assigned to each detected language.
df_reviews_lang['language'].value_counts()

english        64001
french          1285
arabic          1050
spanish          630
german           535
italian          191
dutch            184
portuguese       101
azerbaijani       99
danish            58
russian           27
swedish           18
norwegian         16
greek              8
romanian           7
finnish            5
hungarian          3
turkish            3
indonesian         1
Name: language, dtype: int64

In [9]:
# Most reviews are English and that is the only language we are interested in,
# so keep just those rows (as an independent copy).
is_english = df_reviews_lang['language'].eq('english')
df_review_english = df_reviews_lang.loc[is_english].copy()

In [22]:
# Let's clean the data: each review will be split into sentences and then words.
# The English model must be installed first:  python -m spacy download en
# Load it by package name rather than an absolute, user-specific Windows path
# so the notebook also runs on other machines.
nlp = spacy.load('en_core_web_sm')

In [24]:
# Parse the first English review with spaCy as a sample to explore.
# .iloc selects by position, so this still works when index label 0 is not
# among the English rows (a plain [0] is a label lookup and would raise).
sample_parsed_review = nlp(df_review_english['comments'].iloc[0])

In [25]:
# Show the text of the parsed sample review.
print(sample_parsed_review)

My stay at islam's place was really cool! Good location, 5min away from subway, then 10min from downtown. The room was nice, all place was clean. Islam managed pretty well our arrival, even if it was last minute ;) i do recommand this place to any airbnb user :)


In [26]:
# Print each sentence spaCy detected in the sample, numbered from 1.
for position, sent_span in enumerate(sample_parsed_review.sents, start=1):
    print('Sentence {}:'.format(position))
    print(sent_span)
    print()

Sentence 1:
My stay at islam's place was really cool!

Sentence 2:
Good location, 5min away from subway, then 10min from downtown.

Sentence 3:
The room was nice, all place was clean.

Sentence 4:
Islam managed pretty well our arrival, even if it was last minute ;) i do recommand this place to any airbnb user :)



In [27]:
# Print each named entity found in the sample with its label, numbered from 1.
for position, ent in enumerate(sample_parsed_review.ents, start=1):
    heading = 'Entity {}:'.format(position)
    print(heading, ent, '-', ent.label_)

Entity 1: 5min - CARDINAL
Entity 2: 10min - CARDINAL
Entity 3: Islam - ORG
Entity 4: last minute - TIME


In [55]:
# Print the lemma of every token in the sample review, one per line.
for tok in sample_parsed_review:
    lemma = tok.lemma_
    print(lemma)

-PRON-
stay
at
islam
's
place
be
really
cool
!
good
location
,
5min
away
from
subway
,
then
10min
from
downtown
.
the
room
be
nice
,
all
place
be
clean
.
islam
manage
pretty
well
-PRON-
arrival
,
even
if
-PRON-
be
last
minute
;)
i
do
recommand
this
place
to
any
airbnb
user
:)


In [32]:
# Pair each token's verbatim text with its part-of-speech tag.
token_text = [tok.orth_ for tok in sample_parsed_review]
token_pos = [tok.pos_ for tok in sample_parsed_review]
list(zip(token_text, token_pos))

[('My', 'ADJ'),
 ('stay', 'NOUN'),
 ('at', 'ADP'),
 ('islam', 'NOUN'),
 ("'s", 'PART'),
 ('place', 'NOUN'),
 ('was', 'VERB'),
 ('really', 'ADV'),
 ('cool', 'ADJ'),
 ('!', 'PUNCT'),
 ('Good', 'ADJ'),
 ('location', 'NOUN'),
 (',', 'PUNCT'),
 ('5min', 'NOUN'),
 ('away', 'ADV'),
 ('from', 'ADP'),
 ('subway', 'NOUN'),
 (',', 'PUNCT'),
 ('then', 'ADV'),
 ('10min', 'VERB'),
 ('from', 'ADP'),
 ('downtown', 'NOUN'),
 ('.', 'PUNCT'),
 ('The', 'DET'),
 ('room', 'NOUN'),
 ('was', 'VERB'),
 ('nice', 'ADJ'),
 (',', 'PUNCT'),
 ('all', 'DET'),
 ('place', 'NOUN'),
 ('was', 'VERB'),
 ('clean', 'ADJ'),
 ('.', 'PUNCT'),
 ('Islam', 'PROPN'),
 ('managed', 'VERB'),
 ('pretty', 'ADV'),
 ('well', 'ADV'),
 ('our', 'ADJ'),
 ('arrival', 'NOUN'),
 (',', 'PUNCT'),
 ('even', 'ADV'),
 ('if', 'ADP'),
 ('it', 'PRON'),
 ('was', 'VERB'),
 ('last', 'ADJ'),
 ('minute', 'NOUN'),
 (';)', 'PUNCT'),
 ('i', 'PRON'),
 ('do', 'VERB'),
 ('recommand', 'ADV'),
 ('this', 'DET'),
 ('place', 'NOUN'),
 ('to', 'ADP'),
 ('any', 'DET'),
 (

In [33]:
# Collect the entity type and the IOB tag of every token in the sample.
token_entity_type = [tok.ent_type_ for tok in sample_parsed_review]
token_entity_iob = [tok.ent_iob_ for tok in sample_parsed_review]


In [35]:
# Show the tokens next to their entity type and inside/outside/begin tag
# as a table, one row per token.
tmp_list = list(zip(token_text, token_entity_type, token_entity_iob))
pd.DataFrame(tmp_list,
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,My,,O
1,stay,,O
2,at,,O
3,islam,,O
4,'s,,O
5,place,,O
6,was,,O
7,really,,O
8,cool,,O
9,!,,O


In [38]:
# Show each token's text next to its lemma and its shape, one row per token.
token_lemma = [tok.lemma_ for tok in sample_parsed_review]
token_shape = [tok.shape_ for tok in sample_parsed_review]
lemma_shape_rows = list(zip(token_text, token_lemma, token_shape))
pd.DataFrame(lemma_shape_rows,
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,My,-PRON-,Xx
1,stay,stay,xxxx
2,at,at,xx
3,islam,islam,xxxx
4,'s,'s,'x
5,place,place,xxxx
6,was,be,xxx
7,really,really,xxxx
8,cool,cool,xxxx
9,!,!,!


In [40]:
# NOTE(review): this cell rebuilds the same two entity lists and the same
# entity table as the earlier cells; it looks redundant.
token_entity_type = [token.ent_type_ for token in sample_parsed_review]
token_entity_iob = [token.ent_iob_ for token in sample_parsed_review]

# One row per token: text, entity type, inside/outside/begin tag.
pd.DataFrame(list(zip(token_text, token_entity_type, token_entity_iob)),
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,My,,O
1,stay,,O
2,at,,O
3,islam,,O
4,'s,,O
5,place,,O
6,was,,O
7,really,,O
8,cool,,O
9,!,,O


In [41]:
# Gather a tuple of lexical attributes for every token in the sample review;
# the order of the fields matches the column names given below.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in sample_parsed_review]

# One row per token, one column per attribute.
df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

# Make the flag columns easier to scan: truthy values become 'Yes' and
# falsy values become an empty string.
df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,My,-20.0,,,,,Yes
1,stay,-20.0,,,,,Yes
2,at,-20.0,Yes,,,,Yes
3,islam,-20.0,,,,,Yes
4,'s,-20.0,,,,,Yes
5,place,-20.0,,,,,Yes
6,was,-20.0,Yes,,,,Yes
7,really,-20.0,Yes,,,,Yes
8,cool,-20.0,,,,,Yes
9,!,-20.0,,Yes,,,Yes


In [56]:
def punct_space(token):
    """
    Tell whether a token should be discarded because it is
    pure punctuation or pure whitespace.

    Returns ``token.is_punct`` when that is truthy, otherwise
    ``token.is_space``.
    """
    is_punctuation = token.is_punct
    return is_punctuation if is_punctuation else token.is_space

In [59]:

def lemmatized_sentence_corpus():
    """
    Generator over every sentence of every English review.

    Each review in ``df_review_english['comments']`` is parsed with the
    module-level ``nlp`` pipeline. For each sentence it yields one string:
    the lemmas of the tokens that are not punctuation or whitespace
    (per ``punct_space``), joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the parsed docs in input order, avoiding one nlp() call per review.
    for parsed_rev in nlp.pipe(df_review_english['comments']):
        for sent in parsed_rev.sents:
            yield u' '.join(token.lemma_ for token in sent
                            if not punct_space(token))

In [61]:
# Write every lemmatized sentence to a text file, one sentence per line.
# Parsing the whole corpus takes a while, so the step is skipped when the
# file already exists (instead of hand-toggling an `if False:`); delete the
# file to force a rebuild, e.g. after an interrupted run left it incomplete.
if not os.path.exists('review_sentences_in_all.txt'):
    with codecs.open('review_sentences_in_all.txt','w',encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus():
            f.write(sentence+'\n')

In [62]:
# Wrap the sentence file in a LineSentence iterable; printing its items
# below shows each sentence as a list of tokens.
reveiw_sentences = LineSentence('review_sentences_in_all.txt')

In [63]:
# Print the first six sentences as a sanity check of the sentence file.
for sent in it.islice(reveiw_sentences, 6):
    print(sent)

['-PRON-', 'stay', 'at', 'islam', "'s", 'place', 'be', 'really', 'cool']
['good', 'location', '5min', 'away', 'from', 'subway', 'then', '10min', 'from', 'downtown']
['the', 'room', 'be', 'nice', 'all', 'place', 'be', 'clean']
['islam', 'manage', 'pretty', 'well', '-PRON-', 'arrival', 'even', 'if', '-PRON-', 'be', 'last', 'minute', 'i', 'do', 'recommand', 'this', 'place', 'to', 'any', 'airbnb', 'user']
['great', 'location', 'for', 'both', 'airport', 'and', 'city', 'great', 'amenity', 'in', 'the', 'house', 'plus']
['islam', 'be', 'always', 'very', 'helpful', 'even', 'though', '-PRON-', 'be', 'away']


In [65]:
# Train the first-order (bigram) phrase model on the unigram sentences and
# save it. Training is skipped when a saved model already exists (instead of
# hand-toggling an `if False:`); delete 'bigram_review_all' to retrain.
if not os.path.exists('bigram_review_all'):
    bigram_model = Phrases(reveiw_sentences)
    bigram_model.save('bigram_review_all')
# Always continue with the model as stored on disk.
bigram_model = Phrases.load('bigram_review_all')

In [68]:
# Apply the bigram model to every sentence and write the result to a text
# file, one sentence per line. Skipped when the file already exists; delete
# it to force a rebuild.
if not os.path.exists('bigram_review_sentences_all.txt'):
    with codecs.open('bigram_review_sentences_all.txt','w',encoding='utf_8') as f:
        for sentence in reveiw_sentences:
            bigram_sent = u' '.join(bigram_model[sentence])
            f.write(bigram_sent + '\n')

In [69]:
# Wrap the bigram sentence file in a LineSentence iterable.
rev_bigram_sentences = LineSentence('bigram_review_sentences_all.txt')

In [70]:
# Run phrase detection a second time, on the bigram sentences, to get the
# trigram model, and save it. Training is skipped when a saved model already
# exists; delete 'trigram_review_all' to retrain.
if not os.path.exists('trigram_review_all'):
    trigram_model = Phrases(rev_bigram_sentences)
    trigram_model.save('trigram_review_all')
# Always continue with the model as stored on disk.
trigram_model = Phrases.load('trigram_review_all')

In [72]:
# Apply the trigram model to every bigram sentence and save the result, one
# sentence per line. Skipped when the file already exists; delete it to
# force a rebuild.
if not os.path.exists('trigram_review_sentences_all.txt'):
    with codecs.open('trigram_review_sentences_all.txt','w',encoding='utf_8') as f:
        for sentence in rev_bigram_sentences:
            trigram_sent = u' '.join(trigram_model[sentence])
            f.write(trigram_sent + '\n')

In [73]:
# Wrap the trigram sentence file in a LineSentence iterable.
rev_trigram_sentences = LineSentence('trigram_review_sentences_all.txt')

In [80]:
# Create a file with one fully processed review per line: lemmatized,
# phrase-merged and with stop words removed.
# Skipped when the file already exists (instead of hand-toggling an
# `if False:`); delete the file to force a rebuild.
if not os.path.exists('trigram_review_one_review_per_line.txt'):
    # Look the stop-word set up once instead of on every term.
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    with codecs.open('trigram_review_one_review_per_line.txt','w',encoding='utf_8') as f:
        for rev in df_review_english['comments']:
            parsed_rev = nlp(rev)
            unigram_rev = [token.lemma_ for token in parsed_rev if not punct_space(token)]
            # apply the first-order and second-order phrase models
            bigram_rev = bigram_model[unigram_rev]
            trigram_rev = trigram_model[bigram_rev]
            # remove stop words
            trigram_rev = [term for term in trigram_rev
                           if term not in stop_words]

            # write the review as a single line
            trigram_rev = u' '.join(trigram_rev)
            f.write(trigram_rev+'\n')
            

In [76]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings

In [83]:
%%time
# Build the term dictionary from the one-review-per-line file, drop the
# terms rejected by filter_extremes(no_below=10, no_above=0.4), and save it.
# Skipped when a saved dictionary already exists; delete the file to rebuild.
if not os.path.exists('traigram_review_dict.dict'):
    trigram_rev = LineSentence('trigram_review_one_review_per_line.txt')
    trigram_dic = Dictionary(trigram_rev)
    trigram_dic.filter_extremes(no_below=10,no_above=0.4)
    trigram_dic.compactify()
    trigram_dic.save('traigram_review_dict.dict')
# Always continue with the dictionary as stored on disk.
trigram_dic = Dictionary.load('traigram_review_dict.dict')
    

Wall time: 4.99 ms


<gensim.corpora.dictionary.Dictionary at 0x23a6a249ef0>

In [82]:
def trigram_bow_generator(filepath):
    """
    Lazily read the reviews stored in ``filepath`` through
    ``LineSentence`` and yield, for each one, the result of
    ``trigram_dic.doc2bow``.
    """
    reviews = LineSentence(filepath)
    yield from (trigram_dic.doc2bow(tokens) for tokens in reviews)

In [84]:
# Serialize the bag-of-words corpus to disk in Matrix Market format.
# Skipped when the corpus file already exists; delete it to rebuild.
if not os.path.exists('trigram_review_bow_corpus_all.mm'):
    MmCorpus.serialize('trigram_review_bow_corpus_all.mm',
                      trigram_bow_generator('trigram_review_one_review_per_line.txt'))
# Always continue with the corpus as stored on disk.
trigram_bow_corpus = MmCorpus('trigram_review_bow_corpus_all.mm')

In [88]:
# Train a 50-topic LDA model on the bag-of-words corpus and save it.
# Training is skipped when a saved model already exists, so re-running the
# notebook no longer retrains on every run; delete 'lda_review_model_all'
# to retrain.
# NOTE(review): no random seed is passed, so a retrained model may not
# reproduce the topics shown below.
if not os.path.exists('lda_review_model_all'):
    # Warnings raised during training are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        lda = LdaMulticore(trigram_bow_corpus,
                          num_topics=50,
                          id2word=trigram_dic,
                          workers=3)
        lda.save('lda_review_model_all')
# Always continue with the model as stored on disk.
lda = LdaMulticore.load('lda_review_model_all')

In [91]:
def explore_topic(topic_number, topn=25):
    """
    Print a formatted table of the top terms of one LDA topic.

    Parameters
    ----------
    topic_number : int
        Topic id passed to the module-level ``lda`` model.
    topn : int, optional
        How many terms to list (default 25).

    Prints a header line, a blank line, then one ``term  frequency`` row
    per term with the frequency shown to three decimals. Returns ``None``.
    """

    print( u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Forward the caller's topn; it used to be ignored in favour of a
    # hard-coded 25.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [92]:
# Print the top terms of topic 0.
explore_topic(topic_number=0)

term                 frequency

good                 0.030
room                 0.024
nice                 0.022
place                0.015
bed                  0.015
need                 0.013
clean                0.012
house                0.010
flat                 0.010
apartment            0.010
boston               0.010
comfortable          0.010
time                 0.010
host                 0.010
use                  0.009
location             0.008
kitchen              0.008
bathroom             0.008
steve                0.007
night                0.007
lot                  0.007
perfect              0.006
helpful              0.005
bit                  0.005
's                   0.005


In [None]:
#word2vector
if True:
    review