In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

In [2]:
# Load text file and resolve formatting issues
with open("yelp.csv", "r", encoding="utf-8") as file:
   comments = [re.sub(r'[^a-zA-Z\s]', '', line.strip().lower()) for line in file.readlines() if line.strip()]

In [3]:
# Convert text to binary term-document matrix for top 5 terms
vectorizer = CountVectorizer(stop_words='english', max_features=5, binary=True)
binary_matrix = vectorizer.fit_transform(comments).toarray()
terms = vectorizer.get_feature_names_out()

In [4]:
# Convert matrix to DataFrame for top 5 terms
binary_df = pd.DataFrame(binary_matrix, columns=terms)
print("Binary Term-Document Matrix (Top 5 terms):")
print(binary_df)
# Top 5 terms are dish, food, good, great, pizza

Binary Term-Document Matrix (Top 5 terms):
    dish  food  good  great  pizza
0      0     0     0      0      0
1      0     1     1      0      0
2      0     1     0      1      0
3      0     1     1      1      1
4      0     0     0      0      0
5      0     0     0      0      0
6      0     0     0      0      1
7      0     0     0      0      0
8      0     0     1      1      1
9      0     1     0      0      1
10     0     0     0      0      0
11     1     1     1      0      1
12     0     0     0      0      0
13     0     0     0      0      1
14     0     0     0      0      0
15     0     0     1      0      0
16     1     0     0      0      1
17     0     1     1      1      0
18     1     0     0      0      1
19     1     0     1      0      1
20     1     0     1      1      0


In [5]:
# Compute linkage for hierarchical clustering
linkage_matrix = linkage(binary_matrix, method='ward')

In [8]:
cluster = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
labels = cluster.fit_predict(binary_matrix)

In [9]:
results_df = pd.DataFrame({
    'Comment': comments,
    'Cluster': labels
})
print("\nCustomer Comments with Cluster Labels:")
print(results_df.sort_values(by='Cluster'))

# Cluster 0 is general comments about the food - including negative
# Cluster 1 includes comments about the food including positive terms like good or great


Customer Comments with Cluster Labels:
                                              Comment  Cluster
0   love the fooddo not have any complaints favori...        0
18  this place is famous for their deep dish pizza...        0
16  love this place super friendly staff and the p...        0
14  amazing pasta and meatballs i was so happy to ...        0
13                     the stuffed pizza is excellent        0
12  fantastico i had the manicotti and it was mout...        0
11  real deep dish pan pizza chicago style worth f...        0
19  the only good dish i have had here was the piz...        0
9   food  we ordered the chicken alfredo pizza mus...        0
10  the manicotti is made with a homemade crepe in...        0
7   way way way overrated i stopped going here aft...        0
6   a definite five stars for the pizza and their ...        0
5                                   best gnocchi ever        0
4   the special was tortellini with pesto sauce  s...        0
3   really grea