In [45]:
import collections
import contractions
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import pyLDAvis.sklearn
import re
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import unicodedata

### Create the Las Vegas Pizza dataset and write to CSV

In [None]:
# # read in yelp reviews and business data
# reviews = pd.read_csv('../Assets/yelp_academic_dataset_review.csv')
# biz = pd.read_csv('../Assets/yelp_academic_dataset_business.csv')

# # filter businesses to only include places in Las Vegas that have "pizza" in their category
# lv_pizza_biz = biz.loc[(biz['city']=="Las Vegas") & 
#                        (biz['state']=='NV') &
#                        (biz['categories'].str.lower().str.contains('pizza')), :]

# # merge reviews to the LV_Pizza businesses
# lv_pizza_reviews = (lv_pizza_biz.loc[:, ['business_id', 'name', 'stars', 'state', 
#                                          'city', 'neighborhood','address', 'postal_code']]
#                                 .rename(columns={'stars': 'biz_stars'})
#                                 .merge(reviews, how='inner', on='business_id'))

# # Write to csv
# lv_pizza_reviews.to_csv('../Assets/lv_pizza_reviews.csv', index=False)


# # read in tips
# tips = pd.read_csv('../Assets/yelp_academic_dataset_tip.csv')

# # get tips for lv_pizza businesses
# biz_ids = lv_pizza_reviews.loc[:, ['business_id', 'name', 'stars', 'state', 
#                                   'city', 'neighborhood','address', 'postal_code']].drop_duplicates()
# lv_pizza_tips = biz_ids.merge(tips, how='inner', on='business_id')

# # write to csv
# lv_pizza_tips.to_csv('../Assets/lv_pizza_tips.csv', index=False)


### Prepare lv_pizza_reviews dataset

In [49]:
# read in lv_pizza_reviews and assign proper data types to columns
df = (pd.read_csv('../Assets/lv_pizza_reviews.csv')
        .assign(date=lambda x: pd.to_datetime(x['date'], errors='coerce')))
print(df.info())

tips = pd.read_csv('../Assets/lv_pizza_tips.csv')
print(tips.info())

low_rating = df.loc[df['stars'] <= 1, :]
high_rating = df.loc[df['stars'] >= 4, :]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81620 entries, 0 to 81619
Data columns (total 16 columns):
business_id     81620 non-null object
name            81620 non-null object
biz_stars       81620 non-null float64
state           81620 non-null object
city            81620 non-null object
neighborhood    75242 non-null object
address         81587 non-null object
postal_code     81620 non-null int64
review_id       81620 non-null object
text            81620 non-null object
date            81620 non-null datetime64[ns]
cool            81620 non-null int64
funny           81620 non-null int64
useful          81620 non-null int64
stars           81620 non-null int64
user_id         81620 non-null object
dtypes: datetime64[ns](1), float64(1), int64(5), object(9)
memory usage: 10.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304 entries, 0 to 96303
Data columns (total 12 columns):
business_id     96304 non-null object
name            96304 non-null object
stars  

### Predict Sentiment

In [113]:
def normalize_accented_characters(text):
    '''
    removes accents from characters and replaces with non-accented equivalent
    '''
    if type(text) != str:
        text = (unicodedata.normalize('NFKD', text.decode('utf-8'))
                           .encode('ascii', 'ignore'))
    return text

In [114]:
def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):
    '''
    analyzes sentiment of text using 
    '''
    # pre-process text
    review = normalize_accented_characters(review)
    
    # tokenize and POS tag text tokens
    text_tokens = nltk.word_tokenize(review)
    tagged_text = nltk.pos_tag(text_tokens)
    pos_score = neg_score = token_count = obj_score = 0
    
    # get wordnet synsets based on POS tags for NN, VB, JJ, RB
    # get sentiment scores if synsets are found
    for word, tag in tagged_text:  
        ss_set = None
        if 'NN' in tag and list(swn.senti_synsets(word, 'n')): # singular noun
            ss_set = list(swn.senti_synsets(word, 'n'))[0]
        elif 'VB' in tag and list(swn.senti_synsets(word, 'v')): # Verb
            ss_set = list(swn.senti_synsets(word, 'v'))[0]
        elif 'JJ' in tag and list(swn.senti_synsets(word, 'a')): # Adjective
            ss_set = list(swn.senti_synsets(word, 'a'))[0]
        elif 'RB' in tag and list(swn.senti_synsets(word, 'r')): # adverb
            ss_set = list(swn.senti_synsets(word, 'r'))[0]
        
        # if senti-synset is found
        if ss_set:
            # add scores for all found synsets
            pos_score += ss_set.pos_score()
            neg_score += ss_set.neg_score()
            obj_score += ss_set.obj_score()
            token_count += 1
        
        # if no words have synset, return None 
        if token_count == 0:
            return None
        
    # aggregate final scores
    final_score = pos_score - neg_score
    norm_final_score = round(float(final_score) / token_count, 2)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'
    if verbose:
        norm_obj_score = round(float(obj_score) / token_count, 2)
        norm_pos_score = round(float(pos_score) / token_count, 2)
        norm_neg_score = round(float(neg_score) / token_count, 2)
        # to display results in a nice table
        sentiment_frame = pd.DataFrame([[final_sentiment, norm_obj_score,
                                         norm_pos_score, norm_neg_score,
                                         norm_final_score]],
                                         columns=pd.MultiIndex(
                                             levels=[['SENTIMENT STATS:'],
                                                ['Predicted Sentiment',
                                                 'Objectivity',
                                                 'Positive', 'Negative',
                                                 'Overall']],
                                              labels=[[0,0,0,0,0],
                                                      [0,1,2,3,4]]))
        print(sentiment_frame)
    return final_sentiment

In [115]:
# takes a while to run
df['pred_sentiment'] = df['text'].apply(analyze_sentiment_sentiwordnet_lexicon)

### Split into a matched and mismatched dataframe

In [None]:
# only include rows that have sentiment assigned and stars !=3
df_sentiment = df.loc[(~df['pred_sentiment'].isna()) 
                      & (df['stars'] != 3) , :]

df_match = (df_sentiment.loc[((df['stars'] <= 2) & (df['pred_sentiment'] == 'negative'))
                            |((df['stars'] >=4) & (df['pred_sentiment'] == 'positive')), :]
                        .reset_index(drop=True))

df_mismatch = (df_sentiment.loc[((df['stars'] <= 2) & (df['pred_sentiment'] == 'positive'))
                            |((df['stars'] >=4) & (df['pred_sentiment'] == 'negative')), :]
                           .reset_index(drop=True))

print('{} of {} observations were assigned sentiment'.format(df_sentiment.index.size, df.index.size))
print('{} had sentiment matching review'.format(df_match.index.size))
print('{} had sentiment mismatched with review'.format(df_mismatch.index.size))

### Create preprocessing tools

In [36]:
# create custom list of stopwords
stopwords = set(stopwords.words('english')).union(set(ENGLISH_STOP_WORDS)).union(set(['pizza']))

In [None]:
# create custome tokenizer and stemmer
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search(r'[a-zA-Z\-][a-zA-Z\-]{2,}', token):  # character only words of length >= 2
            filtered_tokens.append(token)
    stems = [SnowballStemmer("english").stem(t) for t in filtered_tokens]
    return stems

### create count matrix

In [97]:
########
data = low_rating  # assign df to data
########

# converts collection of text documents (reviews) to a matrix of token counts
vectorizer = CountVectorizer(min_df=5,     # word must appear at least 5 times to be included
                             max_df=0.9,   # don't include words that are found in 90% of documents
                             stop_words=stopwords,       # remove stop words
                             lowercase=True,             # lowercase all words
                             strip_accents='unicode',    # remove accents
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{3,}', # character only words of length >= 2
#                              tokenizer=tokenize_and_stem,    # uses tokenize_and_stem function to tokenize
                             ngram_range=(1,1))          # consider single words and pairs of words

# learn the vocabulary dictionary and return the term-document matrix
count_matrix = vectorizer.fit_transform(data['text'])  

# get feature names
feature_names = vectorizer.get_feature_names()

### Run TF-IDF on count matrix

In [69]:
tfidf_transformer = TfidfTransformer()  # instatiate the object
tfidf_matrix = tfidf_transformer.fit_transform(count_matrix)  # transform count matrix to tf-idf matrix

### Run K-Means Clustering

In [96]:
# run kmeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters, n_jobs=-1) # instantiate kmeans object, with n clusters, and tell it to use all processors
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
data.loc[:,'cluster'] = clusters  # assign cluster results to dataframe
c = Counter(clusters) # returns count of each cluster

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


### Get details of each cluster

In [110]:
# get details of clusters
cluster_details = {}

# get cluster centroids
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get key features for each cluster
# and also list restaurants and reviews associated with that cluster
for cluster_num in range(num_clusters):
    cluster_details[cluster_num] = {}  # assign cluster number as dictionary
    cluster_details[cluster_num]['cluster_num'] = cluster_num  # assign cluster num
    key_features = [feature_names[index] for index in ordered_centroids[cluster_num, :20]] # top 20 features of cluster
    cluster_details[cluster_num]['key_features'] = key_features

    businesses = data.loc[data['cluster'] == cluster_num, 'name'].unique().tolist()  # list of biz in cluster
    review_ids = data.loc[data['cluster'] == cluster_num, 'review_id'].unique().tolist() # list of reviews in cluster
    cluster_details[cluster_num]['businesses'] = businesses  # assign businesses
    cluster_details[cluster_num]['review_ids'] = review_ids  # assign review_ids


### Visualize Clusters  - DON'T RUN.  DOESNT WORK!

In [112]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import MDS
# from sklearn.metrics.pairwise import cosine_similarity
# import random
# from matplotlib.font_manager import FontProperties

    
# # generate random color for clusters
# def generate_random_color():
#     color = '#%06x' % random.randint(0, 0xFFFFFF)
#     return color
    
# plot_size=(16,8)

# # define markers for clusters
# markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']

# # build cosine distance matrix
# cosine_distance = 1 - cosine_similarity(tfidf_matrix)

# # dimensionality reduction using MDS
# mds = MDS(n_components=2, dissimilarity="precomputed",
#           random_state=1)

# # get coordinates of clusters in new low-dimensional space
# plot_positions = mds.fit_transform(cosine_distance)
# x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]

# # build cluster plotting data
# cluster_color_map = {}
# cluster_name_map = {}

# for cluster_num, cluster_details in cluster_details.items():
#     # assign cluster features to unique label
#     cluster_color_map[cluster_num] = generate_random_color()
#     cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()

# # map each unique cluster label with its coordinates and movies
# cluster_plot_frame = pd.DataFrame({'x': x_pos,
#                                    'y': y_pos,
#                                    'label': data['cluster'].values.tolist(),
#                                    'stars': data['stars'].values.tolist()})
# grouped_plot_frame = cluster_plot_frame.groupby('label')

# # set plot figure size and axes
# fig, ax = plt.subplots(figsize=plot_size)
# ax.margins(0.05)

# # plot each cluster using co-ordinates and movie titles
# for cluster_num, cluster_frame in grouped_plot_frame:
#     marker = markers[cluster_num] if cluster_num < len(markers) \
#              else np.random.choice(markers, size=1)[0]
# ax.plot(cluster_frame['x'], cluster_frame['y'],
#         marker=marker, linestyle='', ms=12,
#         label=cluster_name_map[cluster_num],
#         color=cluster_color_map[cluster_num], mec='none')
# ax.set_aspect('auto')
# ax.tick_params(axis= 'x', which='both', bottom='off', top='off',
#                labelbottom='off')
# ax.tick_params(axis= 'y', which='both', left='off', top='off',
#                labelleft='off')
# fontP = FontProperties()
# fontP.set_size('small')
# ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01),
#           fancybox=True,
#           shadow=True, ncol=5, numpoints=1, prop=fontP)

# #add labels as the film titles
# for index in range(len(cluster_plot_frame)):
#     ax.text(cluster_plot_frame.ix[index]['x'],
#     cluster_plot_frame.ix[index]['y'],
#     cluster_plot_frame.ix[index]['stars'], size=8)

# # show the plot
# plt.show()

KeyboardInterrupt: 

### view highest score tfidf grams

In [84]:
# changes count matrix to dense (can see gram mentions per review)
grams = pd.DataFrame(count_matrix.todense(), index=data.index, columns=feature_names)
grams['review_id'] = data['review_id']

# format to long matrix
grams = (pd.melt(grams.reset_index(), 
                 id_vars=['index','review_id'],
                 value_name='gram_ct')
            .query('gram_ct > 0')  # only include where gram was mentioned
            .sort_values(['index','review_id']))

# changes tfidf matrix to dense (can see gram score per review)
tfidf = pd.DataFrame(tfidf_matrix.todense(), index=data.index, columns=feature_names)
tfidf['review_id'] = data['review_id']

# format to long matrix
tfidf = (pd.melt(tfidf.reset_index(), 
                 id_vars=['index','review_id'],
                 value_name='tfidf')
            .query('tfidf > 0'))  # only include where it has a tfidf score

In [85]:
gram_summary = (grams.merge(tfidf, 
                            on=['index','review_id','variable'])
                     .set_index('index'))

In [93]:
gram_summary.groupby(['variable'], as_index=False).agg({'tfidf': 'mean'}).sort_values(['tfidf'], ascending=False)

Unnamed: 0,variable,tfidf
3379,pero,0.797989
3281,panini,0.475119
1096,corned,0.469385
3023,mussels,0.445789
4349,slots,0.429744
3279,pancakes,0.429049
5202,wack,0.408475
2956,mirage,0.405808
808,cheep,0.399462
338,bagels,0.399060


### Apply LDA to count matrix

In [52]:
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=5,  # specify number of topics to create
                                      max_iter=10, 
                                      learning_method='online')  # in general, online is faster method

lda_Z = lda_model.fit_transform(count_matrix)

### Visualize LDA results

In [53]:
# create cool viz of LDA topics
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
