loganjtravis@gmail.com (Logan Travis)

In [3]:
%%capture --no-stdout

# Imports; captures errors to suppress warnings about changing
# import syntax

from lxml import html
import nltk
import numpy as np
import pandas as pd
import random
import re
import wikipediaapi


# import random
# import gensim.models as models, \
#        gensim.matutils as matutils, \
#        gensim.corpora as corpora
# import matplotlib.pyplot as plot
# import nltk

# import pandas as pd
# import scipy as sp
# from sklearn.cluster import KMeans
# from sklearn.feature_extraction.text import CountVectorizer, \
#                                             TfidfTransformer
# from sklearn.metrics.pairwise import cosine_similarity, \
#                                      euclidean_distances, \
#                                      paired_cosine_distances
# from sklearn.preprocessing import normalize

In [4]:
# Set random seeds for repeatability. NumPy keeps its own global
# generator, separate from the standard library's `random`, so both
# are seeded here.
random.seed(42)
np.random.seed(42)

In [5]:
# Set matplotlib to inline to preserve images in PDF
%matplotlib inline

# Summary

From course page [Week 3 > Task 3 Information > Task 3 Overview](https://www.coursera.org/learn/data-mining-project/supplement/7Ro4J/task-3-overview):

> The goal of this task is to mine the data set to discover the common/popular dishes of a particular cuisine. Typically when you go to try a new cuisine, you don’t know beforehand the types of dishes that are available for that cuisine. For this task, we would like to identify the dishes that are available for a cuisine by building a dish recognizer.
> 
> **Instructions**
>
> Before you begin, make sure you have downloaded the data set and any additional tools you wish to use, as described on the [Data Set and Toolkit Acquisition](https://www.coursera.org/learn/data-mining-project/supplement/Ij7rp/data-set-and-toolkit-acquisition) page.
> 
> Some questions to consider when building the dish recognizer are the following:
> 
> 1. What types of dishes are present in the reviews for a cuisine?
> 2. Are there any surprising dishes in the list you annotated?
> 3. What types of dishes were you able to find?

# Clean List of Mexican Dishes

*

In [6]:
# Set paths to data source, work in process ("WIP"), and output
PATH_SOURCE = "source/"
PATH_WIP = "wip/"
PATH_OUTPUT = "output/"

# Set file paths for the Mexican cuisine label files: the provided
# dish list, the entries to delete, and the entries whose `include`
# flag should be flipped
# NOTE(review): a later cell reads PATH_SOURCE_YELP_REVIEWS,
# PATH_SOURCE_YELP_REST_TO_CUISINES and PATH_SOURCE_YELP_CUISINES,
# none of which are defined here (its recorded output is a
# NameError) - TODO define them with the correct file names.
PATH_SOURCE_MEXICAN_LABELS = PATH_SOURCE + "labels/Mexican.label"
PATH_SOURCE_MEXICAN_TO_DEL = PATH_SOURCE + "labels/Mexican_TO_DEL.label"
PATH_SOURCE_MEXICAN_TO_FLIP = PATH_SOURCE + "labels/Mexican_TO_FLIP.label"

## Inspect Provided List

*

In [68]:
# Read initial dish list for Mexican cuisine
dfMexDishes = pd.read_csv(PATH_SOURCE_MEXICAN_LABELS, sep="\t", names=["dish", "include"])

In [69]:
# Make `dish` column the index
dfMexDishes.set_index("dish", inplace=True)

In [70]:
# Set `include` column to boolean data type
dfMexDishes.include = dfMexDishes.include.astype(np.bool_,)

In [71]:
# Print dish list shape and head
print("Dish list has shape {}.".format(dfMexDishes.shape))
dfMexDishes.head(5)

Dish list has shape (597, 1).


Unnamed: 0_level_0,include
dish,Unnamed: 1_level_1
fried egg,True
in n out,True
triple sec,True
mexican food,True
service stars,True


## Remove Non-Dishes

In [72]:
# Read dishes to drop for Mexican cuisine
wip = pd.read_csv(PATH_SOURCE_MEXICAN_TO_DEL, sep="\t", names=["dish", "include"], index_col=0)

In [73]:
# Removed dishes to drop from dish list
dfMexDishes.drop(wip.index, inplace=True)

In [74]:
# Print dish list shape and head
print("Dish list has shape {}.".format(dfMexDishes.shape))
dfMexDishes.head(5)

Dish list has shape (412, 1).


Unnamed: 0_level_0,include
dish,Unnamed: 1_level_1
beef tongue,True
refried beans,True
carne asada,True
rice pudding,True
tomato soup,True


## Flip Indicator for False-Negatives

In [75]:
# Read dishes whose `include` indicator should be flipped for Mexican cuisine
wip = pd.read_csv(PATH_SOURCE_MEXICAN_TO_FLIP, sep="\t", names=["dish", "include"], index_col=0)

In [76]:
# Flip the `include` indicator for the listed dishes
dfMexDishes.loc[wip.index, "include"] = ~dfMexDishes.loc[wip.index, "include"]

## Add Dishes from Wikipedia

*

In [77]:
# Get Wikipedia page "List of Mexican dishes" and parse as HTML
wp = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.HTML)
wpMexDishesPage = wp.page("List_of_Mexican_dishes")

In [78]:
# Define helper function to pretty-print sections
def printSections(sections, level=0):
    """Print a numbered, indented outline of `wikipediaapi` sections.

    Each section is printed as "<index>. <title>", indented four spaces
    per nesting `level`, followed immediately by its own subsections
    one level deeper.
    """
    indent = " " * (4 * level)
    for position, section in enumerate(sections):
        print("{}{:d}. {}".format(indent, position, section.title))
        # Recurse so nested sections appear directly under their parent
        printSections(section.sections, level + 1)

In [79]:
# Examine sections
printSections(wpMexDishesPage.sections)

0. Antojitos
1. Cheese  dishes
2. Egg dishes
3. Meat dishes
    0. Beef dishes
    1. Goat dishes
    2. Pork dishes
    3. Poultry dishes
    4. Other meat and protein dishes
4. Moles, sauces, dips and spreads
5. Rice dishes
6. Seafood dishes
7. Soups and stews
8. Vegetable dishes
9. Desserts and sweets
10. Beverages
    0. Non-alcoholic
    1. Alcoholic
11. See also
12. References
13. External links


In [80]:
# Get text from an example section
wpMexDishesPage.sections[0].text

'<p>Street food in Mexico, called <i>antojitos</i> is prepared by street vendors and at small traditional markets in Mexico. Most of them include corn as an ingredient.\n</p>\n<ul><li>Aguachile</li>\n<li>Bolillos (salty bread)</li>\n<li>Burrito</li>\n<li>Camote (Mexican sweet potato)</li>\n<li>Cemitas sandwiches</li>\n<li>Chalupa</li>\n<li>Chapulines and escamoles</li>\n<li>Charales, small fish, basically a type of smelt</li>\n<li>Chicharrón</li></ul>\n<ul><li>Chilaquiles</li>\n<li>Chimichangas (Tex-Mex mostly)</li>\n<li>Choriqueso</li>\n<li>Chorizo</li></ul>\n<ul><li>Cochinita pibil</li>\n<li>Cocido</li>\n<li>Cóctel de camarón and other seafood cocktails</li>\n<li>Corunda</li>\n<li>Curtido</li>\n<li>Elote</li>\n<li>Enchilada (red or green)</li>\n<li>Enfrijoladas</li>\n<li>Ensalada de fruta (fruit salad)</li>\n<li>Entomatadas</li>\n<li>Fajitas</li>\n<li>Filete de pescado</li></ul>\n<ul><li>Flautas</li>\n<li>Frijoles charros</li>\n<li>Fritada</li>\n<li>Gorditas</li>\n<li>Gringas</li>\n<

*

In [81]:
# Define helper function to get set of dishes from section text
def getDishesFromText(sectionText, removeTextAfter="[,-]", wordLimit=3):
    """Return the set of dish names found in a section's HTML text.

    Parameters
    ----------
    sectionText : str
        HTML for one section; dish names are read from the text nodes
        directly inside its <li> elements.
    removeTextAfter : str
        Regular-expression fragment matching the character(s) after
        which the rest of an item is discarded.
    wordLimit : int
        Maximum number of words kept per dish name.

    Returns
    -------
    set of str
        Cleaned dish names. May contain the empty string when an item
        has no word characters left after cleaning; callers filter it.
    """
    # `lxml` raises on an empty document, so there is nothing to parse
    if not sectionText or not sectionText.strip():
        return set()

    # Create an `lxml` element tree from HTML.
    tree = html.fromstring(sectionText)

    dishes = set()
    # Only text directly inside <li> is read; text wrapped in nested
    # tags (e.g. <i>) is not part of these text nodes.
    for text in tree.xpath("//li/text()"):
        # Remove parentheticals
        text = re.sub(r"\(.*?\)", "", text)

        # Remove text after passed characters
        text = re.sub("(?<={}).*$".format(removeTextAfter), "", text)

        # Trim to word limit. Collecting the words directly (instead of
        # splitting on non-word runs) avoids a leading empty token
        # using up one of the `wordLimit` slots when the text starts
        # with whitespace or punctuation.
        dishes.add(" ".join(re.findall(r"\w+", text)[:wordLimit]))

    # Return set of dishes
    return dishes

In [82]:
# Define helper function to recursively get dishes from all
# sections
def getDishesFromSection(section):
    """Recursively collect dish names under a `wikipediaapi` section.

    A section without subsections is parsed with `getDishesFromText`.
    For any other section only its subsections are searched; text
    attached directly to the parent section is not parsed.
    """
    children = section.sections
    if len(children) == 0:
        return getDishesFromText(section.text)

    # Combine the dishes found in every subsection
    return set().union(*(getDishesFromSection(child) for child in children))

In [83]:
# Get Mexican dishes from Wikipedia page "List of Mexican dishes"
wpMexDishes = set()
for section in wpMexDishesPage.sections[:11]:
    wpMexDishes.update(getDishesFromSection(section))

In [84]:
# Lowercase first so the junk filter and de-duplication are
# case-insensitive (duplicate names would become duplicate index
# labels in the later merge), remove empty items and known bad
# elements, then format for inclusion in common phrases
wpMexDishes = [(dish, 1) for dish in sorted( \
                   {name.lower() for name in wpMexDishes} \
                   - {"is of", "or", "", "as a", "where these"})]

In [85]:
# Convert to dataframe and merge
dfMexDishesFromWP = pd.DataFrame(wpMexDishes, columns=["dish", "include"])

In [86]:
# Make `dish` column the index
dfMexDishesFromWP.set_index("dish", inplace=True)

In [87]:
# Set `include` column to boolean data type
dfMexDishesFromWP.include = dfMexDishesFromWP.include.astype(np.bool_,)

In [88]:
# Merge original dish list and Wikipedia dish list then
# fill missing values as False. The outer merge leaves NaN for
# dishes present in only one list, so the columns are no longer
# boolean; cast back explicitly because the later `|` and `~`
# operations need a true boolean dtype.
dfMexDishes = dfMexDishes.merge(dfMexDishesFromWP, how="outer", \
                                left_index=True, right_index=True, \
                                suffixes=["_initial", "_from_wp"])
dfMexDishes = dfMexDishes.fillna(False).astype(np.bool_)

In [89]:
# Determine final inclusion from initial list or from Wikipedia
dfMexDishes["include_combined"] = dfMexDishes.include_initial | dfMexDishes.include_from_wp

In [105]:
# Count dishes to include and common phrases to exclude
# (message typos fixed: "include", "phrases")
print("Found {:,} dishes to include and {:,} common phrases to exclude as dishes.".format(\
        int(dfMexDishes.include_combined.sum()), \
        int((~dfMexDishes.include_combined).sum())))

Found 243 dishes to incldue and 391 common phrase to exclude as dishes.


# Get Yelp Review Data Set

I cleaned the Yelp review data and extracted cuisines from the business data set in separate notebooks. I load the saved data here to shorten this report.

In [62]:
# Read saved data
# NOTE(review): the three PATH_SOURCE_YELP_* constants used here are
# not defined in any earlier cell of this notebook, which is why the
# recorded output is a NameError - TODO define them with the other
# paths.
# NOTE(review): `pd.read_pickle` unpickles arbitrary objects; only
# load files from a trusted source.
dfYelpReviews = pd.read_pickle(PATH_SOURCE_YELP_REVIEWS)
dfYelpRestToCuis = pd.read_pickle(PATH_SOURCE_YELP_REST_TO_CUISINES)
cuisines = pd.read_csv(PATH_SOURCE_YELP_CUISINES, names=["cuisine"])

NameError: name 'PATH_SOURCE_YELP_REVIEWS' is not defined

In [63]:
# Join (inner) reviews to restaurants
dfYelpReviews = dfYelpReviews.join(dfYelpRestToCuis, \
                                   on="business_id", \
                                   how="inner", \
                                   rsuffix="_business")

NameError: name 'dfYelpReviews' is not defined

# Determine Cuisine Similarity via Term Frequencies

I calculate the term frequencies across cuisines both with and without inverse-document frequency as an initial comparison. These methods take comparatively less time and also pre-compute data necessary for more advanced similarity comparisons like my seeded-LDA topic approach.

## TF Parameters

I found the settings below worked well when extracting topics for the week one assignment:

* Limit maximum terms to 10,000. This is an extreme upper limit. The SciKit Learn `TfidfVectorizer` class ([link to documentation](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)) never yielded more than 5,000 terms based on my other parameters.
* Exclude terms appearing in more than 50% of documents. These add little value for differentiating topics.
* Exclude terms appearing in less than 0.1% of documents. I tested many settings for this parameter ranging down to 2 documents and up to 10% of all documents. The Yelp reviews include numerous limited-use terms (e.g., people and place names) and I found it difficult to interpret the topics with too many present.

In [None]:
# Set token limit
MAX_FEATURES = 10000

# Set document frequency ceiling; topic analysis will ignore
# words found in more documents
MAX_DF = 0.5

# Set document frequency floor; topic analysis will ignore
# words found in fewer document
MIN_DF = 0.001

In [None]:
class MyTokenizer:
    """Callable string tokenizer that lemmatizes NLTK word tokens."""

    def __init__(self):
        """Create the WordNet lemmatizer reused by every call."""
        self.wnl = nltk.stem.WordNetLemmatizer()

    def __call__(self, document):
        """Return the lemmatized tokens of the string `document`."""
        words = nltk.word_tokenize(document)
        return list(map(self.wnl.lemmatize, words))

## Excessive Data Set Size

I worked on a 50% sample of the Yelp reviews for restaurants. The full data set proved too large for my machine's memory.

In [None]:
# Set working dataframe to a 50% sample of the full data set;
# too large otherwise. `random_state` is fixed so the sample is
# repeatable: pandas draws it with NumPy's generator, which the
# earlier `random.seed` call does not seed.
df = dfYelpReviews.sample(frac=0.5, random_state=42)

In [None]:
# Reset index to simple integer; simplifies document selection
# in TF matrix
df.reset_index(inplace=True)

In [None]:
# Create TF vectorizer 
tf = CountVectorizer(max_features=MAX_FEATURES, max_df=MAX_DF, \
                     min_df=MIN_DF, stop_words="english", \
                     tokenizer=MyTokenizer())

In [None]:
%%time

# Calculate term frequencies
docTerms = tf.fit_transform(df.text)

In [None]:
# Print token matrix shape
print("Found {0[1]:,} tokens in {0[0]:,} documents".format(docTerms.shape))

## Cuisine TF Cosine Similarity

In [None]:
# Sort cuisines alphabetically
cuisines = cuisines.sort_values(by="cuisine").reset_index(drop=True)

In [None]:
%%time

# Sum term frequencies across documents for each cuisine. The rows
# are collected in a list and stacked once at the end, rather than
# re-stacking the growing matrix on every iteration. The empty
# float matrix stays as the first block so the result has the same
# dtype, and the same (0, n_terms) shape when there are no cuisines.
cuisineRows = [sp.sparse.coo_matrix((0, docTerms.shape[1]))]
for c in cuisines.cuisine:
    # Row labels of reviews whose categories contain this cuisine;
    # presumably these equal row positions in `docTerms` because the
    # dataframe index was reset before vectorizing - verify
    idxs = df[df.categories.apply(lambda cats: c in cats)].index
    # `.sum(axis=0)` returns a dense 1 x n_terms matrix; convert it to
    # sparse so every stacked block is a sparse matrix
    cuisineRows.append(sp.sparse.csr_matrix(docTerms[idxs, ].sum(axis=0)))
cuisineTerms = sp.sparse.vstack(cuisineRows)

In [None]:
# Convert to CSR sparse matrix to facilitate row-wise operations
cuisineTerms = cuisineTerms.tocsr()

In [None]:
# Normalize term frequencies by cuisine for initial graph
# NOTE(review): with `axis=0` scikit-learn's `normalize` rescales each
# column (term) independently rather than each row (cuisine). If
# per-cuisine normalization was intended this should be `axis=1` -
# TODO confirm which was meant before changing it.
cuisineTermsL2Norm = normalize(cuisineTerms, axis=0)

In [None]:
# Define graphing function for reusability
def graphSimilarity(matrix, labels, title, display_labels=True, colors=None, \
                    matrix_under=None, alpha_over=0.2):
    """Graph a similarity matrix as a color-mapped image.

    Parameters
    ----------
    matrix : 2-D array
        Values drawn with `Axes.matshow`.
    labels : sequence
        Tick labels applied to both axes when `display_labels` is true.
    title : str
        Text shown as the x-axis label.
    display_labels : bool
        Whether to draw the tick labels.
    colors : colormap, optional
        Color map for `matrix`, or for `matrix_under` when an underlay
        is given. Defaults to "viridis".
    matrix_under : 2-D array, optional
        Underlay drawn beneath `matrix` using `colors` on a fixed
        0-to-1 scale; `matrix` is then drawn on top with the
        "binary_r" color map.
    alpha_over : float
        Opacity of `matrix` when drawn over an underlay.

    Returns
    -------
    tuple
        The `(fig, ax)` pair created by `plot.subplots()`.
    """
    fig, ax = plot.subplots()
    fig.set_size_inches(8, 8)

    # Generate matrix image with or without underlay. The fallback must
    # test `colors` itself: `not None` is always true, which made the
    # default color map unreachable.
    cmap = colors if colors is not None else plot.get_cmap("viridis")
    if matrix_under is None:
        cax = ax.matshow(matrix, cmap=cmap)
    else:
        cmap_over = plot.get_cmap("binary_r")
        ax.matshow(matrix_under, cmap=cmap, vmin=0, vmax=1)
        cax = ax.matshow(matrix, cmap=cmap_over, alpha=alpha_over)

    # Configure axes
    ax.xaxis.tick_top()
    if display_labels:
        plot.xticks(range(matrix.shape[0]), labels, rotation=90)
        plot.yticks(range(matrix.shape[0]), labels)

    # Add title (as x-axis label) and color bar; the color bar
    # describes the top-most image
    ax.set_xlabel(title)
    fig.colorbar(cax)

    # Return plot
    return fig, ax

In [None]:
# Plot cosine similarity for all cuisine TFs
matrix = cosine_similarity(cuisineTermsL2Norm)
labels = cuisines.cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF (L2 Norm) Cosine Similarity", \
                       display_labels=False)
plot.show()

While quick, calculating cosine similarity from intra-topic normalized term frequencies yields poor results. Many cuisines appear unrelated to all others as indicated by the dark bands (i.e., cosine similarity near zero).

In [None]:
# Sample cuisines for detailed view
SAMPLE_20 = sorted(random.sample(range(cuisines.shape[0]), 20))

In [None]:
# Plot cosine similarity for random 20 cuisine TFs
matrix = cosine_similarity(cuisineTermsL2Norm[SAMPLE_20, :])
labels = cuisines.iloc[SAMPLE_20, ].cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF (L2 Norm) Cosine Similarity - Random Subset", \
                       display_labels=True)
plot.show()

Zooming in on a sample of cuisines shows some expected similarities: Delis to Bagels and Asian Fusion to Sushi Bars. Still, most cuisines appear unrelated to each other.

## Cuisine TF-IDF Cosine Similarity

Applying IDF weighting should improve the cuisine comparison. Note that "document" in IDF does not mean the individual reviews. It instead means the topics. I essentially create an imaginary document summing all the terms seen in reviews for a given topic.

I considered calculating TF-IDF against the reviews. That would yield better per-term weighting. However, I do not know how to consolidate the review-driven TF-IDF results across topics.

In [None]:
# Create TF-IDF transformer from counts
tfToTfidf = TfidfTransformer(norm="l2", use_idf=True)

In [None]:
# Calculate TF-IDF for topics
cuisineTermsIDF = tfToTfidf.fit_transform(cuisineTerms)

In [None]:
# Plot cosine similarity for all cuisines TF-IDF
matrix = cosine_similarity(cuisineTermsIDF)
labels = cuisines.cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF-IDF Cosine Similarity",
                       display_labels=False)
plot.show()

Immediately we see the IDF produces a better similarity spread. Many cuisines relate closely to one another - as expected in the real world - while some retain their distinctness.

In [None]:
# Plot cosine similarity for random 20 cuisine TF-IDFs
matrix = cosine_similarity(cuisineTermsIDF[SAMPLE_20, :])
labels = cuisines.iloc[SAMPLE_20, ].cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF-IDF Cosine Similarity - Random Subset", \
                       display_labels=True)
plot.show()

The same sampled cuisines reveal new similarities after applying IDF: Asian Fusion to Caribbean and Ethiopian to Tapas/Small Plates. The IDF also highlights the not-quite-cuisines like Bagels, Donuts, Farmers Market, and Patisserie/Cake Shop.

Not all results make intuitive sense. Delis, a not-quite-cuisine, displays moderate similarity with most cuisines. So too does Wineries. Their reviews may exhibit enough common terms to explain the similarities. As an example, many restaurants sell alcohol so might relate to Wineries regardless of cuisine.

I still think I can improve the cuisine map.

## Cuisine TF-IDF Euclidean Distance

I perform a quick sanity check by comparing TF-IDF across cuisines using Euclidean distance. It produces similar results to cosine similarity (with color scale reversed). That makes sense since all the TF-IDF vectors point into the same sector (i.e., all positive values) and have similar magnitudes due to normalization.

In [None]:
# Get reversed viridis color map
cmap_viridis_r = plot.get_cmap("viridis_r")

In [None]:
# Plot euclidian distance (reverse color map) for all 
# cuisine TF-IDF
matrix = euclidean_distances(cuisineTermsIDF)
labels = cuisines.cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF-IDF Euclidean Distance", \
                       display_labels=False, colors=cmap_viridis_r)
plot.show()

In [None]:
# Plot euclidian distance (reverse color map) for random 
# 20 cuisine TF-IDFs
matrix = euclidean_distances(cuisineTermsIDF[SAMPLE_20, :])
labels = cuisines.iloc[SAMPLE_20, ].cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine TF-IDF Euclidean Distance - Random Subset", \
                       display_labels=True, colors=cmap_viridis_r)
plot.show()

# Determine Cuisine Similarity via LDA

Improving the cuisine map requires improving the text-based representation of each cuisine. LDA offers one potential method: Attempt to discover topics that match cuisines. I say "attempt" because LDA is an unsupervised algorithm; I cannot provide it the cuisines as ground truths.

I researched extensions of LDA with supervised or "guided" learning. See [Google Scholar Articles Related to Labeled LDA](https://scholar.google.com/scholar?um=1&ie=UTF-8&lr&q=related:632kE3NGZGxZRM:scholar.google.com/). Unfortunately, I found myself time constrained so chose instead to seed the regular LDA with cuisine TFs as a-priori probabilities.

In [None]:
# Set number of topics
NUM_TOPICS = cuisines.shape[0]

In [None]:
# Calculate TF-IDF for terms
docTermsIDF = tfToTfidf.fit_transform(docTerms)

In [None]:
# Convert GenSim corpus from token matrix
corpus = matutils.Sparse2Corpus(docTermsIDF, documents_columns=False)

In [None]:
# Create a GenSim dictionary for documents; Note: Passes the
# vectorizer tokens as a single "document".
# Newer scikit-learn releases removed `get_feature_names` in favor of
# `get_feature_names_out`, so use whichever this installation provides.
if hasattr(tf, "get_feature_names_out"):
    featureNames = list(tf.get_feature_names_out())
else:
    featureNames = tf.get_feature_names()
dictionary = corpora.Dictionary([featureNames])

In [None]:
%%time
%%capture --no-stdout

# Find topic using LDA with a-priori from cuisine TF
lda = models.ldamulticore.LdaMulticore(corpus, \
                                       num_topics=NUM_TOPICS,
                                       id2word=dict(dictionary.items()), \
                                       eta=cuisineTerms.todense().tolist()) # Memory intensive

In [None]:
# Get topic distribution for each cuisine
rows, cols, vals = [], [], []
cuisineTerms = cuisineTerms.tocsr()
for i in range(cuisines.shape[0]):
    # Use cuisine i's own summed term counts as its bag of words.
    # Reading row 0 on every pass gave all cuisines the topic
    # distribution of the first cuisine.
    row = cuisineTerms.getrow(i)
    bow = list(zip(row.indices, row.data))
    for c, v in lda.get_document_topics(bow, minimum_probability=0):
        rows.append(i)
        cols.append(c)
        vals.append(v)
# State the shape explicitly instead of inferring it from the largest
# index seen, so the matrix always has one row per cuisine and one
# column per topic even if some topic is never returned
cuisineTopics = sp.sparse.csr_matrix((vals, (rows, cols)), \
                                     shape=(cuisines.shape[0], NUM_TOPICS))

In [None]:
# Plot cosine similarity for all cuisine LDA topics
matrix = cosine_similarity(cuisineTopics)
labels = cuisines.cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine LDA Topic Cosine Similarity", \
                       display_labels=False)
plot.show()

Note the scale of the colorbar. It reaches a maximum of just 0.0001 suggesting the seeded-LDA produced topics nearly matching cuisines (i.e. nearly orthogonal). I think the overall effect improves on TF-IDF similarity but only slightly. Notably, the seeded-LDA approach removes many of the dark bands, finding similarities for cuisines that previously appeared completely dissimilar from all others.

In [None]:
# Plot cosine similarity for random 20 cuisine 
# LDA topics
matrix = cosine_similarity(cuisineTopics[SAMPLE_20, :])
labels = cuisines.iloc[SAMPLE_20, ].cuisine
_, _ = graphSimilarity(matrix, labels, \
                       "Cuisine LDA Topic Cosine Similarity - Random Subset", \
                       display_labels=True)
plot.show()

Closer inspection of the sample cuisines shows a "flattened" similarity matrix. Strong dis/similarities appear like Asian Fusion to Tapas/Small Plates (strongly similar) and Delis to Singaporean (strongly dissimilar). Still, the flattened matrix would benefit from better defining such dis/similarities.

# Cluster Similar Cuisines

I improve the seeded-LDA comparison of cuisines by clustering them and ordering them around those clusters. This took a lot of experimentation. I tried as few as two clusters and as many as 16. I tried various overlays on the similarity matrix to highlight clusters. I also tried both cosine similarity and Euclidean distance again when ordering cuisines within each cluster.

Four clusters and cosine similarity yielded the best cuisine map.

## Cluster Cuisine Topics with K-Means

In [None]:
# Create and train K-Means clustering instance
cuisineTopicClustersKM = KMeans(n_clusters=4, random_state=42).fit(cuisineTopics)

In [None]:
# Add cluster labels and score to cuisines dataframe
cuisines["cluster_km"] = cuisineTopicClustersKM.labels_

In [None]:
# Calculate distance to cluster centroid
def getCentroidDistance(r):
    """Return the cosine distance from a cuisine to its cluster centroid.

    `r` is one row of `cuisines`; its index label (`r.name`) selects the
    matching row of `cuisineTopics` and its `cluster_km` value selects
    the K-Means centroid to compare against.
    """
    # `toarray()` yields a plain ndarray; `todense()` returns an
    # `np.matrix`, which recent scikit-learn releases reject
    topics = cuisineTopics.getrow(r.name).toarray()
    centroid = cuisineTopicClustersKM.cluster_centers_[r["cluster_km"]].reshape(1, -1)
    return paired_cosine_distances(topics, centroid)[0]

cuisines["centroid_dist_km"] = cuisines.apply(getCentroidDistance, axis=1)

In [None]:
# Create sorted index by cluster ID and distance to
# cluster centroid
sortedIdx = cuisines.sort_values(by=["cluster_km", "centroid_dist_km"]).index.values

In [None]:
# Get categorical color map for overlay
cmap_tab10 = plot.get_cmap("tab10")

In [None]:
# Plot cosine similarity for all clustered cuisine 
# LDA topics
matrix = cosine_similarity(cuisineTopics[sortedIdx])
labels = cuisines.cuisine[sortedIdx]
clusterCols = np.array([cuisines.cluster_km[sortedIdx].values / 5, ] * cuisines.shape[0])
_, _ = graphSimilarity(matrix, labels, \
                       "Clustered Cuisine LDA Topic Cosine Similarity", \
                       display_labels=False, colors=cmap_tab10, \
                       matrix_under=clusterCols, alpha_over=0.8)

# Show plot
plot.show()

Each vertical band of color represents a cluster of similar cuisines. The resulting squares along the diagonal highlight the most similar cuisines, pushing dissimilarities toward the edges of the graph.

In [None]:
# Sort sample
SORTED_SAMPLE_20 = [i for i in sortedIdx if i in SAMPLE_20]

In [None]:
# Plot cosine similarity for random 20 clustered cuisine
# LDA topics
matrix = cosine_similarity(cuisineTopics[SORTED_SAMPLE_20, :])
labels = cuisines.cuisine[SORTED_SAMPLE_20]
clusterCols = np.array([cuisines.cluster_km[SORTED_SAMPLE_20].values / 5, ] * 20)
_, ax = graphSimilarity(matrix, labels, \
                        "Clustered Cuisine LDA Topic Cosine Similarity - Random Subset", \
                        display_labels=True, colors=cmap_tab10, \
                        matrix_under=clusterCols, alpha_over=0.8)

# Show plot
plot.show()

Zooming in shows the value of clustering. Cuisines with strong similarities in the real world appear with each other. This makes comparison between them visually easier. Clustering also reveals a weakness in using review text to represent cuisines: It includes significant non-cuisine information. The {Asian Fusion, Dive Bars, Tapas/Small Plates, Breweries} cluster more likely reflects 'trendy dining' than it does cuisine.