In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
import re
import string
import nltk
from gensim import corpora, models, similarities, matutils
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data

In [3]:
df.head()

Unnamed: 0,business_id,cool,funny,stars,useful,text_clean,text_noun
0,--9e1ONYQuAa-CB_Rrw7Tw,0.631389,0.471562,4.087113,0.942405,what can i say wowzers probably one of the be...,wowzers steak houses service dinner seafood to...
1,0AQnRQw34IQW9-1gJkYnMA,0.307479,0.323176,2.943675,0.598338,me and my best friend stayed at the monte car...,friend monte carlo diablo s hotel didn feel ve...
2,0NmTwqYEQiKErDv4a55obg,0.801971,0.573477,4.120968,1.252688,stars goes to the truffled mushroom with pole...,stars mushroom stars basil spaghetti ambience ...
3,2weQS-RnoOBhb1KsHKyoSQ,0.767622,0.734573,3.637749,1.20914,over one hour wait for sunday brunch not a hu...,hour wait brunch selection breakfast items des...
4,4GXII-GU7S0ZyU6ElkhscQ,0.496835,0.787975,2.352848,1.131329,yeah this place is a friggin joke i live hear...,yeah place joke i hear pool sulfar pool deck f...


# 2. Text preprocessing

In [None]:
def preprocess(df):
    alphabet = lambda x: re.sub(r"""[^a-z]+""", ' ', x)  # remove non-alphabets
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower()) # remove punctuation and lower the case
    enter = lambda x: x.replace('\n','')  # remove \n

    df['text_clean'] = df.text.map(punc_lower).map(enter).map(alphabet)

    # Only keep nouns
    noun=[]
    for item in df.text_clean:     
        tokens = nltk.word_tokenize(item)
        tags = nltk.pos_tag(tokens)
        nouns = [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
        noun.append(nouns)

    df['text_noun'] = noun
    for i in range(len(df.text_noun)):     
        df.text_noun[i] = ' '.join(df.text_noun[i])

    # Remove some common nouns
    mapping = { 'food':'', 'place':'', 'time':'', 'vegas':'', 'people':'','strip':'','service':'','night':''}

    for i in range(len(df.text_noun)):
        for k, v in mapping.items():
            df.text_noun[i] = df.text_noun[i].replace(k, v)
    return df

df = preprocess(df)

In [9]:
df.head()

Unnamed: 0,business_id,cool,funny,stars,useful,text_clean,text_noun
0,--9e1ONYQuAa-CB_Rrw7Tw,0.631389,0.471562,4.087113,0.942405,what can i say wowzers probably one of the be...,wowzers steak houses dinner sea tower wedge w...
1,0AQnRQw34IQW9-1gJkYnMA,0.307479,0.323176,2.943675,0.598338,me and my best friend stayed at the monte car...,friend monte carlo diablo s hotel didn feel d...
2,0NmTwqYEQiKErDv4a55obg,0.801971,0.573477,4.120968,1.252688,stars goes to the truffled mushroom with pole...,stars mushroom stars basil spaghetti ambience ...
3,2weQS-RnoOBhb1KsHKyoSQ,0.767622,0.734573,3.637749,1.20914,over one hour wait for sunday brunch not a hu...,hour wait brunch selection breakfast items des...
4,4GXII-GU7S0ZyU6ElkhscQ,0.496835,0.787975,2.352848,1.131329,yeah this place is a friggin joke i live hear...,yeah joke i hear pool sulfar pool deck items...


# 3. Word embedding

## 3.1 CountVectorizer

In [2]:
cv = CountVectorizer(stop_words='english')
cv.fit(df.text_noun)
X = cv.transform(df.text_noun).transpose()
pd.DataFrame(X.toarray(), cv.get_feature_names()).head()

## 3.2 TF-IDF

In [3]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(df.text_noun)
X2 = tfidf.transform(df.text_noun).transpose()
pd.DataFrame(X2.toarray(), tfidf.get_feature_names()).head()

# 4. Topic modeling

In [None]:
def topic_search(data, model, num_topics, passes):
    corpus = matutils.Sparse2Corpus(data)
    id2word = dict((v, k) for k, v in model.vocabulary_.items())
    lda = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=passes)
    lda_corpus = lda[corpus]  # Transform the docs from the word space to the topic space (like "transform" in sklearn)
    lda_docs = [doc for doc in lda_corpus]
    return lda.print_topics(), lda_docs

In [19]:
# Using CounterVectorizer
topics, topic_vectors = topic_search(X, cv, 5, 5) 

In [25]:
topics

[(0,
  '0.042*"room" + 0.036*"hotel" + 0.012*"rooms" + 0.011*"casino" + 0.009*"pool" + 0.007*"day" + 0.007*"stay" + 0.007*"buffet" + 0.007*"view" + 0.006*"desk"'),
 (1,
  '0.029*"burger" + 0.015*"fries" + 0.012*"club" + 0.009*"burgers" + 0.009*"line" + 0.008*"drinks" + 0.007*"restaurant" + 0.007*"bar" + 0.007*"order" + 0.007*"table"'),
 (2,
  '0.014*"buffet" + 0.011*"restaurant" + 0.010*"dinner" + 0.009*"steak" + 0.007*"meal" + 0.007*"chicken" + 0.007*"menu" + 0.007*"table" + 0.006*"experience" + 0.006*"order"'),
 (3,
  '0.046*"room" + 0.028*"hotel" + 0.016*"rooms" + 0.011*"casino" + 0.010*"pool" + 0.008*"floor" + 0.007*"day" + 0.007*"view" + 0.007*"bathroom" + 0.007*"staff"'),
 (4,
  '0.018*"pizza" + 0.013*"chocolate" + 0.011*"cirque" + 0.009*"line" + 0.007*"stage" + 0.006*"seats" + 0.006*"buffet" + 0.006*"way" + 0.006*"shows" + 0.005*"music"')]

In [None]:
# Using TF-IDF
topics, topic_vectors = topic_search(X2, tfidf, 5, 5)

# 5. Categorizing business based on topics

In [23]:
def matching(topic_vectors):
    topic_number = []
    for i in range(len(topic_vectors)):
        a=[]
        for j in range(len(topic_vectors[i])):
            a.append(topic_vectors[i][j][1])
        topic_number.append(topic_vectors[i][a.index(max(a))][0])
    df_topics=pd.DataFrame(pd.DataFrame({'business_id': df.business_id, 'topic_number': topic_number}))

    categories = [list(df_business.loc[df_business.business_id == item].categories) for item in df_topics.business_id]
    df_topics['categories'] = categories
    return df_topics

df_topics = matching(topic_vectors)

# 6. Matching business data to review data

In [102]:
def merge(df, df_business):
    bn = [df_business.loc[df_business.business_id == df.business_id[i]].name.iloc[0] for i in range(len(df))]
    bc = [df_business.loc[df_business.business_id == df.business_id[i]].categories.iloc[0] for i in range(len(df))]
    ba = [df_business.loc[df_business.business_id == df.business_id[i]].address.iloc[0] for i in range(len(df))]
    bla = [df_business.loc[df_business.business_id == df.business_id[i]].latitude.iloc[0] for i in range(len(df))]
    blo = [df_business.loc[df_business.business_id == df.business_id[i]].longitude.iloc[0] for i in range(len(df))]

    df['business_name'] = bn
    df['business_categories'] = bc
    df['address'] = ba
    df['latitude'] = bla
    df['longitude'] = blo
    return df

df = merge(df, df_business)

# 7. Recommendation system

In [None]:
df_wide = pd.pivot_table(df, values=["stars"], index=["business_name", "user_id"], aggfunc=np.mean).unstack()
df_wide = df_wide.fillna(0) # replace NaN with 0
dists = cosine_similarity(df_wide)
dists = pd.DataFrame(dists, columns=df_wide.index)
dists.index = dists.columns

In [None]:
# Recommend based on the similarity of inputs and outputs
def get_similar(businesses, n=None):

    businesses = [business for business in businesses if business in dists.columns]
    busi_summed = dists[businesses].apply(lambda row: np.sum(row), axis=1).sort_values(ascending=False)
    ranked_busi = busi_summed.index[busi_summed.index.isin(businesses)==False]
    ranked_business = ranked_busi.tolist()
    if n is None:
        return ranked_business
    else:
        return ranked_business[:n]

In [None]:
for i, business in enumerate(get_similar(["Bacchanal Buffet"], 10)):
    print("%d) %s" % (i+1, business))