In [1]:
import pandas as pd
import os
import re
import numpy as np
import pickle
from nltk import FreqDist
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## TOPIC MODELLING

In [2]:
# Load the hotel review table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
final_concat_reviews = pd.read_pickle(
    "final_concat_reviews.pkl",
    compression="infer",
)

In [207]:
# Preview the first rows of the loaded table.
final_concat_reviews.head()

Unnamed: 0,offering_id,name,latitude,longitude,adj_word,category,pros,cons,pro_words,con_words,num_review
0,72572,BEST WESTERN PLUS Pioneer Square Hotel,47.601616,-122.335257,[make unplanned visit due passport expensive p...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...","[(732, staff helpful), (727, staff friendly), ...","[(749, staff wonderful), (738, staff helpful p...","staff helpful, staff friendly, room clean, lig...","staff wonderful, staff helpful parking, rate r...",250
1,72579,BEST WESTERN Loyal Inn,47.618108,-122.341253,[night room room bf clean comfortable room tv ...,"[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...","[(240, staff helpful), (212, room spacious), (...","[(242, staff room), (231, smoking room), (230,...","staff helpful, room spacious, room clean, grea...","staff room, smoking room, size room, size doub...",108
2,72586,BEST WESTERN PLUS Executive Inn,47.620324,-122.345896,[beautiful night photography ask room view sta...,"[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[(320, staff helpful), (308, spacious clean), ...","[(347, wide open), (336, view lot), (270, room...","staff helpful, spacious clean, room clean, gre...","wide open, view lot, room key, previous night,...",134
3,100504,Hotel Monaco Seattle - a Kimpton Hotel,47.60692,-122.334114,[wonderful feeling little guilty great price p...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(1863, wine hour), (1693, th floor), (1613, s...","[(1650, stayed business), (1636, staff service...","wine hour, th floor, staff helpful, staff frie...","stayed business, staff service, staff customer...",466
4,100505,Warwick Seattle Hotel,47.613938,-122.340963,[short weekend able get free room airline plea...,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...","[(1127, staff helpful), (1124, staff great), (...","[(1183, time reservation), (1033, second floor...","staff helpful, staff great, room view, room re...","time reservation, second floor, room room, roo...",353


In [3]:
# Keep only the identifier columns and the pro/con phrase text.
documents = final_concat_reviews.loc[
    :,
    ["offering_id", "name", "pro_words", "con_words"],
]

In [4]:
# Load the pickled stop words from disk.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open('stop_words.pkl', 'rb') as stop_words_file:
    stop_words = pickle.load(stop_words_file)

In [5]:
from nltk.stem import WordNetLemmatizer 

# Helper used to preprocess (lemmatize) the stop words
def lemma_words(text):
    """Lemmatize every whitespace-separated token of ``text``.

    The string is split on whitespace, each token is passed through
    ``WordNetLemmatizer.lemmatize`` and the results are joined back together
    with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token) for token in text.split())

In [6]:
# Lemmatize each stop word with lemma_words.
stop_words = list(map(lemma_words, stop_words))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF matrix over the "pro" phrases, using 2- and 3-grams only.
tf = TfidfVectorizer(
    analyzer="word",
    stop_words=stop_words,
    ngram_range=(2, 3),
    min_df=5,
    max_df=0.5,
)
tf_word = tf.fit_transform(documents.pro_words)

  'stop_words.' % sorted(inconsistent))


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw-count matrix over the "pro" phrases, using 2- and 3-grams only.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(2, 3),
    min_df=5,
    max_df=0.5,
)
doc_word_cv = vectorizer.fit_transform(documents.pro_words)

  'stop_words.' % sorted(inconsistent))


In [21]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 5 latent components (LSA).
# random_state is fixed (same seed as the LDA cells) because TruncatedSVD's
# default randomized solver would otherwise give different components per run.
lsa = TruncatedSVD(n_components=5, random_state=100)
tf_topic_lsa = lsa.fit_transform(tf_word)

In [22]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every component of ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's feature weights.
    feature_names : sequence
        Feature labels, indexable by the column positions of ``components_``.
    no_top_words : int
        How many of the largest-weight features to list per topic.
    topic_names : sequence, optional
        Optional display names per topic; a missing or falsy entry falls back
        to the numeric topic index.
    """
    for topic_no, weights in enumerate(model.components_):
        label = topic_names[topic_no] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_no)

        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(", ".join([feature_names[pos] for pos in ranked]))

In [23]:
# Print the 8 highest-weighted n-grams of each LSA component fitted on the TF-IDF matrix.
display_topics(lsa, tf.get_feature_names(), 8)


Topic  0
staff friendly, helpful staff, staff helpful staff, helpful staff friendly, room clean, location front, location front desk, great location front

Topic  1
shuttle service, free shuttle, staff helpful shuttle, helpful shuttle, helpful shuttle service, desk free shuttle, shuttle free, free shuttle free

Topic  2
helpful staff friendly, staff helpful staff, helpful staff, staff friendly, friendly room, staff friendly room, shuttle service, complimentary breakfast

Topic  3
great location front, floor staff, floor staff helpful, location front desk, location front, th floor staff, th floor, room service

Topic  4
light rail great, rail great, short walk, rail great location, light rail, location good, great location good, short walk room


In [24]:
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into 5 non-negative topics.
# random_state is fixed (same seed as the LDA cells) so that any randomness in
# NMF's initialisation is repeatable across runs.
tf_nmf_model = NMF(n_components=5, random_state=100)
tf_doc_topic_nmf = tf_nmf_model.fit_transform(tf_word)

In [25]:
# Print the 10 highest-weighted n-grams of each NMF topic fitted on the TF-IDF matrix.
display_topics(tf_nmf_model, tf.get_feature_names(), 10)


Topic  0
helpful staff friendly, staff helpful staff, helpful staff, staff friendly, friendly room, staff friendly room, room clean, friendly room clean, free breakfast breakfast, friendly staff

Topic  1
shuttle service, free shuttle, helpful shuttle, staff helpful shuttle, helpful shuttle service, desk free shuttle, front desk free, desk free, free shuttle free, shuttle free

Topic  2
staff helpful room, helpful room, clean comfortable, room clean, staff clean comfortable, staff clean, comfortable breakfast, clean comfortable breakfast, desk staff, friendly staff

Topic  3
location front, location front desk, great location front, floor staff, floor staff helpful, th floor staff, th floor, room service, location great, location great great

Topic  4
room spacious, short walk, light rail great, rail great, rail great location, walk room, short walk room, light rail, location good, room spacious room


In [26]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 5-topic LDA model on the TF-IDF matrix.
lda_model_sklearn = LatentDirichletAllocation(
    n_components=5,            # number of topics
    max_iter=200,              # maximum number of learning iterations
    learning_method="online",
    random_state=100,          # fixed seed
    batch_size=128,            # documents used in each learning iteration
    evaluate_every=-1,         # -1: do not compute perplexity while fitting
    n_jobs=-1,                 # use all available CPUs
)
tf_doc_topic_lda = lda_model_sklearn.fit_transform(tf_word)

In [277]:
# lda_model_sklearn.components_

In [27]:
# This LDA model was fitted on tf_word, so its components must be labelled with
# the TF-IDF vectorizer's vocabulary rather than the CountVectorizer's. The two
# only line up while both vectorizers are configured identically.
display_topics(lda_model_sklearn, tf.get_feature_names(), 8)


Topic  0
staff friendly, location front desk, location front, helpful staff, staff helpful staff, helpful staff friendly, great location front, room clean

Topic  1
free shuttle, shuttle service, helpful shuttle, staff helpful shuttle, helpful shuttle service, service room, complimentary breakfast, shuttle service room

Topic  2
staff helpful room, helpful room, corner room, clean comfortable, room nice, great value, room large, rental car

Topic  3
free internet, corner room, room clean, room nice, clean comfortable, great value, short walk room, walk room

Topic  4
breakfast breakfast, free breakfast breakfast, rail great location, light rail great, rail great, living room, helpful staff, staff helpful staff


In [28]:
# Rebuild the raw-count matrix over the "pro" phrases (2- and 3-grams only)
# for the count-based topic models that follow.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(2, 3),
    min_df=5,
    max_df=0.5,
)
doc_word_cv = vectorizer.fit_transform(documents.pro_words)

  'stop_words.' % sorted(inconsistent))


In [29]:

from sklearn.decomposition import TruncatedSVD

# Reduce the raw-count matrix to 5 latent components (LSA).
# NOTE: this rebinds `lsa`, which previously held the TF-IDF based model.
# random_state is fixed (same seed as the LDA cells) because TruncatedSVD's
# default randomized solver would otherwise give different components per run.
lsa = TruncatedSVD(n_components=5, random_state=100)
cv_topic_lsa = lsa.fit_transform(doc_word_cv)

In [31]:
# Print the 8 highest-weighted n-grams of each LSA component fitted on the count matrix.
display_topics(lsa, vectorizer.get_feature_names(), 8)


Topic  0
room clean, staff friendly, helpful staff, staff helpful staff, helpful staff friendly, location front, location front desk, great location front

Topic  1
shuttle service, free shuttle, helpful shuttle service, helpful shuttle, staff helpful shuttle, desk free, front desk free, desk free shuttle

Topic  2
helpful room, staff helpful room, room clean, location great, clean comfortable, helpful room clean, light rail, room service

Topic  3
location front desk, location front, great location front, floor staff, floor staff helpful, th floor staff, th floor, room service

Topic  4
desk staff, clean comfortable, staff helpful room, helpful room, free breakfast, front desk staff, comfortable breakfast, clean comfortable breakfast


In [32]:
from sklearn.decomposition import NMF

# Factorize the raw-count matrix into 5 non-negative topics.
# random_state is fixed (same seed as the LDA cells) so that any randomness in
# NMF's initialisation is repeatable across runs.
cv_nmf_model = NMF(n_components=5, random_state=100)
cv_doc_topic_nmf = cv_nmf_model.fit_transform(doc_word_cv)

In [33]:
# Print the 10 highest-weighted n-grams of each NMF topic fitted on the count matrix.
display_topics(cv_nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
staff friendly, staff helpful staff, helpful staff, helpful staff friendly, friendly room, staff friendly room, room clean, friendly room clean, free breakfast breakfast, friendly staff

Topic  1
shuttle service, free shuttle, desk free, front desk free, staff helpful shuttle, helpful shuttle, helpful shuttle service, desk free shuttle, free breakfast, free shuttle free

Topic  2
location front, location front desk, great location front, floor staff helpful, floor staff, th floor, th floor staff, location great, room service, room clean

Topic  3
clean comfortable, room clean, staff helpful room, helpful room, desk staff, staff clean comfortable, staff clean, clean comfortable breakfast, comfortable breakfast, friendly staff

Topic  4
light rail, light rail great, rail great, rail great location, short walk, short walk room, walk room, location good, great location good, location great


In [34]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 5-topic LDA model on the raw-count matrix.
# NOTE: this rebinds `lda_model_sklearn`, replacing the TF-IDF based model.
lda_model_sklearn = LatentDirichletAllocation(
    n_components=5,            # number of topics
    max_iter=200,              # maximum number of learning iterations
    learning_method="online",
    random_state=100,          # fixed seed
    batch_size=128,            # documents used in each learning iteration
    evaluate_every=-1,         # -1: do not compute perplexity while fitting
    n_jobs=-1,                 # use all available CPUs
)
cv_doc_topic_lda = lda_model_sklearn.fit_transform(doc_word_cv)

In [293]:
# lda_model_sklearn.components_

In [35]:
# Print the 8 highest-weighted n-grams of each LDA topic fitted on the count matrix.
display_topics(lda_model_sklearn, vectorizer.get_feature_names(), 8)


Topic  0
location front desk, location front, great location front, staff friendly, helpful staff, staff helpful staff, helpful staff friendly, floor staff helpful

Topic  1
shuttle service, free shuttle, helpful shuttle, staff helpful shuttle, helpful shuttle service, front desk free, desk free, service room

Topic  2
clean comfortable, room clean, helpful room, staff helpful room, light rail, clean comfortable breakfast, comfortable breakfast, staff clean comfortable

Topic  3
room nice, room clean, clean comfortable, room service room, service room, room service, corner room, light rail

Topic  4
helpful staff, staff helpful staff, helpful staff friendly, staff friendly, breakfast breakfast, free breakfast breakfast, room clean, front desk free


## COSINE SIMILARITY

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# Pairwise cosine similarity between the hotels' LDA topic vectors.
# With a single argument the matrix is compared against itself.
cosine_sim_lda_cv = cosine_similarity(cv_doc_topic_lda)

In [38]:
# Series of hotel names, used to translate between a name and its row.
indices = pd.Series(final_concat_reviews["name"])

In [39]:
# Sanity check: show the hotel name stored under label 0.
indices[0]

'BEST WESTERN PLUS Pioneer Square Hotel'

In [40]:
def recommendations(title, cosine_sim = cosine_sim_lda_cv, top_n = 5):
    """Return the names of the hotels most similar to ``title``.

    Parameters
    ----------
    title : str
        Hotel name, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row ``k`` is expected to describe the
        hotel at position ``k`` of ``indices``. Defaults to
        ``cosine_sim_lda_cv``.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` hotel names ordered from most to least similar. The
        queried row itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``indices``.
    """
    # Resolve the hotel by position rather than by index label, so the lookup
    # stays aligned with the rows of cosine_sim whatever index `indices` has.
    matches = np.flatnonzero((indices == title).values)
    if matches.size == 0:
        raise IndexError("hotel %r not found in indices" % (title,))
    # With duplicate names the first occurrence is used.
    idx = int(matches[0])

    # Similarity of every hotel to the queried one, best match first.
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # Remove the queried hotel explicitly instead of assuming it sorted first:
    # another hotel with an equal score could otherwise take the first slot
    # and the hotel would end up recommending itself.
    score_series = score_series.drop(idx)
    top_positions = score_series.iloc[:top_n].index

    # Map row positions back to hotel names in one vectorized lookup.
    return indices.iloc[top_positions].tolist()

In [41]:
# Example: hotels most similar to one named hotel.
recommendations('BEST WESTERN PLUS Pioneer Square Hotel')

['Hotel Max',
 'Inn at Queen Anne',
 'Executive Hotel Pacific',
 'The Sorrento Hotel',
 'Residence Inn Seattle Downtown/Lake Union']

In [43]:
# One list of recommended hotel names per hotel, in table order.
recommendation_list = [
    recommendations(hotel_name) for hotel_name in final_concat_reviews.name
]

In [44]:
# Attach the recommendations as a new column (this modifies final_concat_reviews in place).
final_concat_reviews['recommendation']=recommendation_list

In [45]:
# Inspect the first row, which now includes the recommendation column.
final_concat_reviews.iloc[0]

offering_id                                                   72572
name                         BEST WESTERN PLUS Pioneer Square Hotel
latitude                                                    47.6016
longitude                                                  -122.335
adj_word          [make unplanned visit due passport expensive p...
category          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...
pros              [(732, staff helpful), (727, staff friendly), ...
cons              [(749, staff wonderful), (738, staff helpful p...
pro_words         staff helpful, staff friendly, room clean, lig...
con_words         staff wonderful, staff helpful parking, rate r...
num_review                                                      250
recommendation    [Hotel Max, Inn at Queen Anne, Executive Hotel...
Name: 0, dtype: object

In [47]:
# Export the table to a CSV file (not a pickle).
# NOTE(review): to_csv is documented to return None when given a path, so
# export_csv carries no data; the name is kept in case anything refers to it.
export_csv = final_concat_reviews.to_csv(
    'Seattle_Hotel_Recommendation.csv',
    index=None,
    header=True,
)