In [1]:
##### Variables #####
# Number of k-means clusters to fit on the per-user review text.
K_CLUSTERS_USER = 20
# Number of k-means clusters to fit on the per-restaurant review text.
K_CLUSTERS_RESTAURANT = 20
# How many top-weighted terms are printed/stored for each cluster.
TERMS_PER_CLUSTER_RESTAURANT = 10
# Number of components kept by the TruncatedSVD (LSA) step.
REDUCED_D = 300

In [2]:
# import libraries
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
import logging
from time import time
import numpy as np
import sqlite3
import pandas as pd
from stop_words import get_stop_words
from collections import Counter
from pandas import merge

%matplotlib notebook
from matplotlib import pyplot as plt

ImportError: No module named 'stop_words'

In [None]:
# Load the review table, keep only prolific reviewers, and build one
# stop-word-free text document per user and per business.

# Read everything into memory, then release the database handle right away.
conn = sqlite3.connect("yelp2.db")
try:
    df = pd.read_sql_query("select text, user_id, business_id, stars from review;", conn)
finally:
    conn.close()

# Keep only the reviews written by users with at least 50 reviews.
user_id_valcounts = df['user_id'].value_counts()
unique_users = user_id_valcounts[user_id_valcounts >= 50].index.tolist()
# .copy() detaches the filtered frame so later column assignments are unambiguous.
df = df[df['user_id'].isin(unique_users)].copy() #unique_users:  558, unique restaurants:  1620
unique_business = list(set(df['business_id']))

# Collect the reviews per business and per user. Reviews are gathered in lists
# and joined with a space, and newlines become spaces, so that the last word of
# one review/line is never fused with the first word of the next one.
reviews_by_business = {}
reviews_by_user = {}
for business_id, user_id, text in zip(df['business_id'], df['user_id'], df['text']):
    review = str(text).replace('\n', ' ')
    reviews_by_business.setdefault(str(business_id), []).append(review)
    reviews_by_user.setdefault(str(user_id), []).append(review)
dict_business = {key: ' '.join(reviews) for key, reviews in reviews_by_business.items()}
dict_user = {key: ' '.join(reviews) for key, reviews in reviews_by_user.items()}

en_stop = get_stop_words('en')


def _strip_stop_words(text, stop_words):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    Matching is exact (case-sensitive); the surviving tokens are re-joined
    with single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)


# A set gives constant-time membership tests; the list form is kept in en_stop.
en_stop_set = set(en_stop)

# rawdata_* hold [id, cleaned_text] pairs; data_* hold the cleaned texts in the
# same order, which is the order the cluster labels will come back in.
rawdata_user = [[user_id, _strip_stop_words(text, en_stop_set)]
                for user_id, text in dict_user.items()]
rawdata_busi = [[business_id, _strip_stop_words(text, en_stop_set)]
                for business_id, text in dict_business.items()]
data_u = [entry[1] for entry in rawdata_user]
data_b = [entry[1] for entry in rawdata_busi]

   
    


In [None]:
# Users Clustered
#
# TF-IDF -> LSA (TruncatedSVD + L2 normalisation) -> MiniBatchKMeans over the
# per-user documents in data_u, then the cluster of each user is written back
# onto every review row of df as 'user_cluster'.

#########################################

print("Extracting features from the training dataset using a sparse vectorizer")
MAX_FEATURES = 100000
vectorizer = TfidfVectorizer(max_df=0.7, max_features=MAX_FEATURES,
                             min_df=100, stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(data_u)
print("n_samples: %d, n_features: %d" % X.shape)

#########################################

print("Performing dimensionality reduction using LSA")
svd = TruncatedSVD(REDUCED_D)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_X = lsa.fit_transform(X)
explained_variance = svd.explained_variance_ratio_.sum()
print("(Reduced) n_samples: %d, n_features: %d" % lsa_X.shape)
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

#########################################

print("Train a kmeans classifier")
km = MiniBatchKMeans(n_clusters=K_CLUSTERS_USER, init='k-means++', n_init=5,
                     init_size=100000, batch_size=1000, verbose=True)
km.fit(lsa_X)
print("Top %d terms per cluster:" %TERMS_PER_CLUSTER_RESTAURANT)
# Map the centroids back to term space and rank the terms of each centroid
# from highest to lowest weight.
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
# get_feature_names() is gone in recent scikit-learn; fall back to it only
# when the newer accessor is not available.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# Iterate over the *user* cluster count: this model was fitted with
# K_CLUSTERS_USER clusters, not K_CLUSTERS_RESTAURANT.
for i in range(K_CLUSTERS_USER):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :TERMS_PER_CLUSTER_RESTAURANT]:
        print(' %s' % terms[ind], end='')
    print()

#########################################

clusters = km.labels_.tolist()
print("Number of users in each cluster: ", Counter(clusters))

# km.labels_ is in the same order as data_u / rawdata_user, so pair each user
# id with its label once and map it onto the review rows in a single pass.
cluster_by_user = {str(entry[0]): cluster
                   for entry, cluster in zip(rawdata_user, clusters)}
user_cluster_column = df['user_id'].astype(str).map(cluster_by_user)
if user_cluster_column.isnull().any():
    raise ValueError("Some reviews belong to users that were not clustered")
# assign() returns a new frame (replacing the column if it already exists),
# so re-running this cell does not fail.
df = df.assign(user_cluster=user_cluster_column.astype(int))

#########################################

# One row per user. Every review of a user carries the same label, so the
# first value of each group is that user's cluster.
user_id_cluster = df.groupby('user_id')['user_cluster'].first()
label = user_id_cluster.tolist()
unique_users.sort()
# Take the ids from the grouped index so ids and labels are aligned by
# construction; use a list for `columns` because a set has no defined order.
user_cluster = {'user_id': user_id_cluster.index.tolist(), 'user_cluster': label}
df_user_cluster = pd.DataFrame(user_cluster, columns=['user_id', 'user_cluster'])
print('User Cluster Count: ', df_user_cluster['user_cluster'].value_counts())

# Natural join on the shared columns (user_id, user_cluster).
combined = merge(df, df_user_cluster)

#########################################

In [None]:
# Restaurants Clustered
#
# TF-IDF -> LSA (TruncatedSVD + L2 normalisation) -> MiniBatchKMeans over the
# per-restaurant documents in data_b, then the cluster of each restaurant is
# written back onto every review row of df as 'restaurant_cluster'.

#########################################
print("Extracting features from the training dataset using a sparse vectorizer")
MAX_FEATURES = 100000
vectorizer2 = TfidfVectorizer(max_df=0.7, max_features=MAX_FEATURES,
                             min_df=100, stop_words='english',
                             use_idf=True)
X2 = vectorizer2.fit_transform(data_b)
print("n_samples: %d, n_features: %d" % X2.shape)

#########################################

print("Performing dimensionality reduction using LSA")
svd = TruncatedSVD(REDUCED_D)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_X2 = lsa.fit_transform(X2)
explained_variance = svd.explained_variance_ratio_.sum()
print("(Reduced) n_samples: %d, n_features: %d" % lsa_X2.shape)
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

#########################################

print("Train a kmeans classifier")
km2 = MiniBatchKMeans(n_clusters=K_CLUSTERS_RESTAURANT, init='k-means++', n_init=5,
                     init_size=100000, batch_size=1000, verbose=True)
km2.fit(lsa_X2)
print("Top %d terms per cluster:" %TERMS_PER_CLUSTER_RESTAURANT)
# Map the centroids back to term space and rank the terms of each centroid
# from highest to lowest weight.
original_space_centroids = svd.inverse_transform(km2.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
# get_feature_names() is gone in recent scikit-learn; fall back to it only
# when the newer accessor is not available.
if hasattr(vectorizer2, 'get_feature_names_out'):
    terms2 = vectorizer2.get_feature_names_out()
else:
    terms2 = vectorizer2.get_feature_names()
# terms_cluster[i] is the list of top terms of cluster i (exported later).
terms_cluster = []
for i in range(K_CLUSTERS_RESTAURANT):
    print("Cluster %d:" % i, end='')
    top_terms = [terms2[ind] for ind in order_centroids[i, :TERMS_PER_CLUSTER_RESTAURANT]]
    for term in top_terms:
        print(' %s' % term, end='')
    terms_cluster.append(top_terms)
    print()

#########################################

clusters2 = km2.labels_.tolist()
print("Number of restaurants in each cluster: ", Counter(clusters2))

# km2.labels_ is in the same order as data_b / rawdata_busi, so pair each
# business id with its label once and map it onto the review rows in a single
# pass (the previous nested scan was also executed twice).
cluster_by_business = {str(entry[0]): int(cluster)
                       for entry, cluster in zip(rawdata_busi, clusters2)}
restaurant_cluster_column = df['business_id'].astype(str).map(cluster_by_business)
if restaurant_cluster_column.isnull().any():
    raise ValueError("Some reviews belong to restaurants that were not clustered")
# assign() returns a new frame (replacing the column if it already exists),
# so re-running this cell does not fail. Labels are stored as integers.
df = df.assign(restaurant_cluster=restaurant_cluster_column.astype(int))

#########################################

# One row per restaurant. Every review of a restaurant carries the same label,
# so the first value of each group is that restaurant's cluster.
restaurant_id_cluster = df.groupby('business_id')['restaurant_cluster'].first()
label2 = restaurant_id_cluster.tolist()
unique_business.sort()
# Take the ids from the grouped index so ids and labels are aligned by
# construction; use a list for `columns` because a set has no defined order.
restaurant_cluster = {'business_id': restaurant_id_cluster.index.tolist(),
                      'restaurant_cluster': label2}
df_restaurant_cluster = pd.DataFrame(restaurant_cluster,
                                     columns=['business_id', 'restaurant_cluster'])
print('Restaurant Cluster Count: ', df_restaurant_cluster['restaurant_cluster'].value_counts())

# Natural join on the shared columns (business_id, restaurant_cluster).
combined = merge(df, df_restaurant_cluster)

#########################################

In [None]:
# Export the top terms of each restaurant cluster and the clustered reviews.
cluster_ids = list(range(K_CLUSTERS_RESTAURANT))
# NOTE(review): the 'restuarant_cluster' key is misspelled, but it is the CSV
# header that existing consumers read, so it is kept unchanged here.
terms_c = {'restuarant_cluster': cluster_ids, 'terms': terms_cluster}
# `columns` is a list so the column order of the CSV is deterministic
# (a set has no defined order).
df_term_cluster = pd.DataFrame(terms_c, columns=['restuarant_cluster', 'terms'])
df_term_cluster.to_csv('restaurant_cluster_terms.csv')
combined.to_csv('preclustered.csv')