In [1]:
!export PATH=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin:/u3/cs451/packages/spark/bin:/u3/cs451/packages/hadoop/bin:/u3/cs451/packages/maven/bin:/u3/cs451/packages/scala/bin:$PATH
!export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre

In [2]:
import findspark
findspark.init('/u/cs451/packages/spark')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from operator import add
from pyspark.ml import Pipeline, PipelineModel

from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF

from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
business_df = spark.read.parquet("Data/yelp_business.parquet")

business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: string (nullable = true)



In [5]:
review_df = spark.read.parquet("Data/yelp_review.parquet")

review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [6]:
user_df = spark.read.parquet("Data/yelp_users.parquet")

user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [7]:
# Create views
business_df.createOrReplaceTempView("businesse")
user_df.createOrReplaceTempView("user")
review_df.createOrReplaceTempView("review")

In [8]:
review_text = spark.sql("SELECT business_id, review_text FROM review")
review_text.show(3)

+--------------------+--------------------+
|         business_id|         review_text|
+--------------------+--------------------+
|AakkkTuGZA2KBodKi...|I cannot believe ...|
|YvrylyuWgbP90RgMq...|You can't really ...|
|y-Iw6dZflNix4BdwI...|Good selection of...|
+--------------------+--------------------+
only showing top 3 rows



In [9]:
review_text.count()

376593

In [10]:
review_text = spark.sql("SELECT business_id, review_text FROM review")
review_text_rdd = review_text.rdd
review_by_business_rdd = review_text_rdd.map(tuple).reduceByKey(add)  
review_by_business_df = spark.createDataFrame(review_by_business_rdd)
review_by_business_df = review_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
review_by_business_df.count()

7965

In [11]:
review_by_business_df.show(3)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|ybbcg01-j7tKJ_oLE...|Came to this plac...|
|LOEZ8zDGly7s-laA1...|Bricco is a local...|
|D5oLn4j7eezCAoOsu...|My first time at ...|
+--------------------+--------------------+
only showing top 3 rows



In [12]:
model_path="model/"
# regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')
# stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
# countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
# iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
# word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
# vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')

In [13]:
#pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

# fit the model
#pipeline_mdl = pipeline.fit(review_by_business_df)

#save the pipeline model
#pipeline_mdl.write().overwrite().save(model_path + 'pipe_txt')

In [14]:
pipeline_mdl = PipelineModel.load(model_path + 'pipe_txt')

In [15]:
review_by_business_trf_df = pipeline_mdl.transform(review_by_business_df)

In [16]:
# show the transformed review data

review_by_business_trf_df.select( 'text', 'nostopwrd', 'idf_vec', 'word_vec', 'comb_vec').show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           nostopwrd|             idf_vec|            word_vec|            comb_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Came to this plac...|[came, place, bas...|(146294,[0,1,2,3,...|[-0.0136668954378...|(146394,[0,1,2,3,...|
|Bricco is a local...|[bricco, local, j...|(146294,[0,1,2,3,...|[-0.0295332199922...|(146394,[0,1,2,3,...|
|My first time at ...|[first, time, new...|(146294,[0,1,2,3,...|[-0.0603261812018...|(146394,[0,1,2,3,...|
|Nice traditional ...|[nice, traditiona...|(146294,[0,1,2,3,...|[0.03592066404262...|(146394,[0,1,2,3,...|
|I wrote a review ...|[wrote, review, b...|(146294,[0,1,2,3,...|[-0.1295336404974...|(146394,[0,1,2,3,...|
|I have only order...|[ordered, deliver...|(146294,[0,1,2,3,...|[0.05704669608598...|(146394,[0,1,2,3,...|
|Good place for a ...|[good, place, j

In [17]:
review_by_business_trf_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwrd: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeature: vector (nullable = true)
 |-- idf_vec: vector (nullable = true)
 |-- word_vec: vector (nullable = true)
 |-- comb_vec: vector (nullable = true)



In [18]:
def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2))

In [19]:
all_business_vecs = review_by_business_trf_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [20]:
all_business_vecs[1]

('LOEZ8zDGly7s-laA1OBIEw',
 DenseVector([-0.0295, -0.1481, 0.0735, 0.0825, 0.1092, 0.0526, -0.3987, -0.2843, -0.0263, -0.2307, -0.3078, 0.056, -0.1377, -0.0417, -0.0354, 0.3381, 0.1244, 0.1929, 0.1569, -0.1047, -0.1248, -0.0394, -0.1009, 0.0016, -0.285, 0.2201, -0.1459, 0.1574, -0.1333, -0.0187, 0.2506, 0.4209, 0.1846, -0.0107, -0.1077, 0.0573, 0.0993, 0.0945, -0.0222, -0.0426, 0.1358, 0.19, -0.3823, -0.3172, 0.0372, 0.1264, 0.0935, -0.2257, 0.0113, -0.1187, 0.0394, -0.0145, -0.0002, 0.0503, 0.1003, 0.1514, -0.1465, -0.0981, 0.1123, 0.1144, -0.1436, 0.0811, -0.0536, -0.0803, -0.1645, 0.1468, 0.0319, -0.0721, 0.0433, 0.0021, 0.1139, -0.0014, -0.2012, -0.1106, -0.1581, 0.0053, 0.0468, 0.1427, 0.0782, 0.0608, 0.0336, 0.1902, 0.0101, 0.0649, 0.1216, 0.0444, -0.007, -0.0167, 0.0059, -0.1692, 0.1947, -0.1836, -0.0397, 0.3697, -0.146, -0.045, 0.1135, 0.0358, -0.0655, 0.004]))

In [21]:
all_business_vecs[1][0]

'LOEZ8zDGly7s-laA1OBIEw'

In [22]:
all_business_vecs[1][1]

DenseVector([-0.0295, -0.1481, 0.0735, 0.0825, 0.1092, 0.0526, -0.3987, -0.2843, -0.0263, -0.2307, -0.3078, 0.056, -0.1377, -0.0417, -0.0354, 0.3381, 0.1244, 0.1929, 0.1569, -0.1047, -0.1248, -0.0394, -0.1009, 0.0016, -0.285, 0.2201, -0.1459, 0.1574, -0.1333, -0.0187, 0.2506, 0.4209, 0.1846, -0.0107, -0.1077, 0.0573, 0.0993, 0.0945, -0.0222, -0.0426, 0.1358, 0.19, -0.3823, -0.3172, 0.0372, 0.1264, 0.0935, -0.2257, 0.0113, -0.1187, 0.0394, -0.0145, -0.0002, 0.0503, 0.1003, 0.1514, -0.1465, -0.0981, 0.1123, 0.1144, -0.1436, 0.0811, -0.0536, -0.0803, -0.1645, 0.1468, 0.0319, -0.0721, 0.0433, 0.0021, 0.1139, -0.0014, -0.2012, -0.1106, -0.1581, 0.0053, 0.0468, 0.1427, 0.0782, 0.0608, 0.0336, 0.1902, 0.0101, 0.0649, 0.1216, 0.0444, -0.007, -0.0167, 0.0059, -0.1692, 0.1947, -0.1836, -0.0397, 0.3697, -0.146, -0.045, 0.1135, 0.0358, -0.0655, 0.004])

In [23]:
def getSimilarBusinesses(b_ids, sim_bus_limit=10):
    
    schema = StructType([   
                            StructField("business_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_business_id", StringType(), True)
                        ])
    
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id][0]
        #input_vec = reviews_by_business_trf_df.select('word_vec')\
                    #.filter(reviews_by_business_trf_df['business_id'] == b_id)\
                    #.collect()[0][0]

        similar_business_rdd = spark.sparkContext.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_business_vecs)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(sim_bus_limit)
        similar_business_df = similar_business_df.withColumn('input_business_id', lit(b_id))
        
        similar_businesses_df = similar_businesses_df \
                                    .union(similar_business_df)
        
    
    return similar_businesses_df


In [24]:
def getBusinessDetails(in_bus):
    
    a = in_bus.alias("a")
    b = business_df.alias("b")
    
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.business_name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')])

In [26]:
# test with two restaurants

bids = ['Dl2vgi5W_nbe-A97D0zgfA', 'RtUvSWO_UZ8V3Wpj0n077w']

print('\ninput restaurants details:')
business_df.select('business_id','business_name', 'categories') \
    .filter(business_df.business_id.isin(bids) == True).show(truncate=False)
    
# get top 10 similar business
sims = getBusinessDetails(getSimilarBusinesses(bids))

print('Top 10 similar restaurants for each input restaurant are:"')
sims.select('input_business_id','business_name', 'score','categories').toPandas()


input restaurants details:
+----------------------+----------------------+----------------------------------------------------------------------------+
|business_id           |business_name         |categories                                                                  |
+----------------------+----------------------+----------------------------------------------------------------------------+
|Dl2vgi5W_nbe-A97D0zgfA|Tasty Hut             |Restaurants, Chinese                                                        |
|RtUvSWO_UZ8V3Wpj0n077w|KINKA IZAKAYA ORIGINAL|Bars, Restaurants, Japanese, Tapas/Small Plates, Tapas Bars, Nightlife, Pubs|
+----------------------+----------------------+----------------------------------------------------------------------------+

Top 10 similar restaurants for each input restaurant are:"


Unnamed: 0,input_business_id,business_name,score,categories
0,Dl2vgi5W_nbe-A97D0zgfA,Papa Spicy,0.934741,"Restaurants, Chinese"
1,Dl2vgi5W_nbe-A97D0zgfA,Kushimaru,0.933984,"Japanese, Restaurants"
2,Dl2vgi5W_nbe-A97D0zgfA,New Regime Restaurant,0.932391,"Restaurants, Chinese"
3,Dl2vgi5W_nbe-A97D0zgfA,Qin Tang Taste,0.928421,"Chinese, Restaurants"
4,Dl2vgi5W_nbe-A97D0zgfA,Silver Star BBQ,0.926816,"Restaurants, Chinese"
5,Dl2vgi5W_nbe-A97D0zgfA,The Only Cuisine Corp,0.918891,"Restaurants, Chinese"
6,Dl2vgi5W_nbe-A97D0zgfA,L's Chinese Eatery,0.91857,"Restaurants, Chinese"
7,Dl2vgi5W_nbe-A97D0zgfA,Shaanxi Legend,0.91821,"Restaurants, Chinese"
8,Dl2vgi5W_nbe-A97D0zgfA,Harbin Bbq,0.918076,"Chinese, Barbeque, Restaurants"
9,Dl2vgi5W_nbe-A97D0zgfA,Verdant Garden Chinese Restaurant,0.917669,"Restaurants, Chinese"


In [27]:
def getContentRecoms(u_id, sim_bus_limit=10):
    
    # select restaurants previously reviewed (3+) by the user
    query = """
    SELECT distinct business_id FROM review
    where stars >= 3.5 
    and user_id = "{}"
    """.format(u_id)
    sqlContext = SQLContext(spark.sparkContext)
    usr_rev_bus = sqlContext.sql(query)
    
    # from these get sample of 5 restaurants
    usr_rev_bus = usr_rev_bus.sample(False, 0.5).limit(5)

    usr_rev_bus_det = getBusinessDetails(usr_rev_bus)
    
    # show the sample details
    print('\nBusinesses previously reviewed by user:')
    usr_rev_bus_det.select(['business_id', 'business_name', 'categories']).show(truncate = False)

    bus_list = [i.business_id for i in usr_rev_bus.collect()]

    # get restaurants similar to the sample
    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_limit)

    # filter out those have been reviewd before by the user
    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    j = s.join(r, col("s.business_id") == col("r.business_id"), 'left_outer') \
         .where(col("r.business_id").isNull()) \
         .select([col('s.business_id'),col('s.score')])

    a = j.orderBy("score", ascending = False).limit(sim_bus_limit)

    return getBusinessDetails(a)

In [28]:
# test recoms for a user

u_id = 'Wc5L6iuvSNF5WGBlqIO8nw'

content_recom_df = getContentRecoms(u_id)

print("Businesses recommended to user based on his previously reviewd businesses:")
content_recom_df.toPandas()


Businesses previously reviewed by user:
+----------------------+----------------------------+--------------------------------------------------------------------------------------------------+
|business_id           |business_name               |categories                                                                                        |
+----------------------+----------------------------+--------------------------------------------------------------------------------------------------+
|c78Pat78fVUBFPXYeVvbaQ|Odd Seoul                   |Bars, Dive Bars, Korean, Restaurants, Asian Fusion, Nightlife                                     |
|9jYnZymuaW-XpMIS75YxgQ|The Beaver                  |Bars, American (New), Canadian (New), Cafes, Restaurants, Diners, Nightlife, Gay Bars             |
|F_oPMHJrH42R67xp5eKtQA|Yummy Korean Food Restaurant|Korean, Restaurants                                                                               |
|xwmwTpzw9XSwK-UT5Ka3HQ|Bellwoods Brewery

Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,uChTCA6MsLAciDRklpO-Fw,0.976159,Makkal Chon,"Restaurants, Greek, Korean",4.0,275,43.744641,-79.296277
1,JtrBa5-T6QLXv3y5_ACTkg,0.975671,Doo Roo Ae,"Restaurants, Korean",4.0,52,43.664074,-79.415611
2,ZCrK07xb6w5Vi1vathV0NQ,0.978606,Bapbo Korean Restaurant,"Korean, Japanese, Restaurants",3.0,124,43.655573,-79.384943
3,hWaptsAiWkxqCX9SFfJDkg,0.978762,BIWON,"Restaurants, Korean",3.0,72,43.652998,-79.396783
4,gd-fV04gMKMz724XM32xQA,0.981712,Bandit Brewery,"Brewpubs, Food, Restaurants, Breakfast & Brunc...",3.5,82,43.652485,-79.449621
5,rO3lZpVSoRMhhd0AEJBjlg,0.98875,Sunrise House,"Restaurants, Korean",4.0,182,43.664068,-79.415668
6,d3olNIBeuH4Eeqc3mkX8LA,0.975217,Seoul Restaurant,"Restaurants, Asian Fusion, Korean",4.0,70,43.664416,-79.414042
7,rhyjGfqYlCJoi8Zeulg6QA,0.987929,Kimchi Korea House,"Restaurants, Korean",3.5,231,43.655256,-79.385475
8,_MA98TVmvVIy-XdI0poc7w,0.982398,Mom's Korean Food,"Korean, Restaurants",3.0,89,43.664706,-79.413836
9,SNkkuchbVtUzCwyENcai_g,0.980332,Danji,"Japanese, Chinese, Korean, Restaurants",3.5,59,43.6653,-79.384899


In [29]:
def getKeyWordsRecoms(key_words, sim_bus_limit):
    
    print('\nBusinesses similar to key words: "' + key_words + '"')
    
    input_words_df = spark.sparkContext.parallelize([(0, key_words)]).toDF(['business_id', 'text'])
    
    # transform the the key words to vectors
    input_words_df = pipeline_mdl.transform(input_words_df)
    
    # choose word2vec vectors
    input_key_words_vec = input_words_df.select('word_vec').collect()[0][0]
    
    # get similarity
    sim_bus_byword_rdd = spark.sparkContext.parallelize((i[0], float(CosineSim(input_key_words_vec, i[1]))) for i in all_business_vecs)

    sim_bus_byword_df = spark.createDataFrame(sim_bus_byword_rdd) \
         .withColumnRenamed('_1', 'business_id') \
         .withColumnRenamed('_2', 'score') \
         .orderBy("score", ascending = False)
    
    # return top 10 similar businesses
    a = sim_bus_byword_df.limit(sim_bus_limit)
    return getBusinessDetails(a)


In [30]:
# test key word similarity to review text

key_words = 'chicken cheese burger'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_df.toPandas()


Businesses similar to key words: "chicken cheese burger"


Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,aXPw7yszWON9ZvXjNJ9bNw,0.653982,Passion Grill,"American (Traditional), Halal, Sandwiches, Bur...",2.0,3,43.601685,-79.501768
1,DiCMYxT69I22-1nfsvYAJQ,0.669896,Gourmet Burger Co,"Burgers, Restaurants",3.5,37,43.664683,-79.368279
2,3Cu-af4en3uWCrAkkqfiHQ,0.66901,Epic Burgers and Waffles,"Restaurants, Food, Burgers",2.5,5,43.632351,-79.42128
3,ZogV_sPyn2FSb6nrc3eIng,0.656334,MELTwich Food,"American (Traditional), Restaurants, Burgers",3.5,10,43.626889,-79.500532
4,ycAW6Q5quaCSDX5zwQ3tPg,0.675376,New York Fries,"Restaurants, Specialty Food, Canadian (New), F...",3.5,9,43.776875,-79.256655
5,lN5TlKIjcQIaOICgLFr-XQ,0.649574,Potbelly Sandwich Shop,"Restaurants, Soup, Sandwiches, Salad",3.0,4,43.762509,-79.410532
6,5kwulanxcVcU3v7_8BTehw,0.646047,Cluck Clucks,"Caterers, Restaurants, Event Planning & Servic...",4.0,9,43.647321,-79.402373
7,37joQpD9m5AIcrW1c8OBnQ,0.696527,Urban Smoke Fusion BBQ Food Truck,"Desserts, Food Trucks, Street Vendors, Food, R...",4.0,8,43.718711,-79.470037
8,0KN2L1CNIj4ppjNwIKJ2nw,0.668981,Gotham Grill,"Restaurants, Burgers, Food Trucks, Food, Pouti...",3.0,11,43.653226,-79.383184
9,nP87zXxeS-8got7IBvoAuA,0.650125,McCoy Burger Company,"Poutineries, American (Traditional), Burgers, ...",3.5,38,43.731496,-79.404131
