In [1]:
!export PATH=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin:/u3/cs451/packages/spark/bin:/u3/cs451/packages/hadoop/bin:/u3/cs451/packages/maven/bin:/u3/cs451/packages/scala/bin:$PATH
!export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre

In [2]:
import findspark
findspark.init('/u/cs451/packages/spark')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from operator import add
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.recommendation import ALS, ALSModel

from pyspark.sql import Row,SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [4]:
business_df = spark.read.parquet("Data/yelp_business.parquet")

business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: string (nullable = true)



In [5]:
review_df = spark.read.parquet("Data/yelp_review.parquet")

review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [6]:
user_df = spark.read.parquet("Data/yelp_users.parquet")

user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [7]:
# friends_df = spark.read.parquet("Data/yelp_friends.parquet")
# friends_df = friends_df.select(col("user_id"),trim(col("friends")))
# friends_df = friends_df.withColumnRenamed("trim(friends)","friends")
# friends_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- friends: string (nullable = true)



In [7]:
# Create views
business_df.createOrReplaceTempView("businesse")
user_df.createOrReplaceTempView("user")
review_df.createOrReplaceTempView("review")
#friends_df.createOrReplaceTempView("friends")

In [8]:
# Check distict pairs of user_id and business_id
temp = spark.sql("SELECT DISTINCT user_id, business_id FROM review")
temp.count()

366474

In [9]:
review_df_date = spark.sql("SELECT * from review order by review_date")
review_count = review_df_date.count()
print(review_count)

376593


In [10]:
train_count = int(review_count*0.8)
test_count = review_count - train_count
print("Train Samples: ",train_count)
print("Test Samples: ",test_count)


Train Samples:  301274
Test Samples:  75319


In [11]:
train_df = review_df_date.limit(train_count)
train_df.count()

301274

In [12]:
test_df = review_df_date.orderBy(col("review_date"),ascending=False).limit(test_count)
test_df.count()

75319

In [13]:
train_df.createOrReplaceTempView("review_train")
test_df.createOrReplaceTempView("review_test")

In [15]:
# Remove 
# temp = spark.sql("SELECT * from review_test where (user_id,business_id) not in \
# (SELECT DISTINCT user_id, business_id from review_train) ")

In [14]:
test_df = spark.sql("SELECT review_test.* from review_test left join (SELECT DISTINCT user_id, business_id as train_bid from review_train) as rtrain \
on review_test.user_id = rtrain.user_id and review_test.business_id = rtrain.train_bid where rtrain.user_id IS NULL and rtrain.train_bid IS NULL ")

In [15]:
train_df.write.parquet("Data/review_train.parquet")
test_df.write.parquet("Data/review_test.parquet")

## Content Based Filtering

In [18]:
content_model_path="model/"

review_text = spark.sql("SELECT business_id, review_text FROM review")
review_text_rdd = review_text.rdd
review_by_business_rdd = review_text_rdd.map(tuple).reduceByKey(add)  
review_by_business_df = spark.createDataFrame(review_by_business_rdd)
review_by_business_df = review_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
review_by_business_df.count()

pipeline_mdl = PipelineModel.load(content_model_path + 'pipe_txt')
review_by_business_trf_df = pipeline_mdl.transform(review_by_business_df)

all_business_vecs = review_by_business_trf_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2))

def getSimilarBusinesses(b_ids, sim_bus_limit=10):
    
    schema = StructType([   
                            StructField("business_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_business_id", StringType(), True)
                        ])
    
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id][0]
        #input_vec = reviews_by_business_trf_df.select('word_vec')\
                    #.filter(reviews_by_business_trf_df['business_id'] == b_id)\
                    #.collect()[0][0]

        similar_business_rdd = spark.sparkContext.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_business_vecs)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(sim_bus_limit)
        similar_business_df = similar_business_df.withColumn('input_business_id', lit(b_id))
        
        similar_businesses_df = similar_businesses_df \
                                    .union(similar_business_df)
        
    
    return similar_businesses_df

def getBusinessDetails(in_bus):
    
    a = in_bus.alias("a")
    b = business_df.alias("b")
    
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.business_name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')])

def getContentRecoms(u_id, sim_bus_limit=10):
    
    # select restaurants previously reviewed (3+) by the user
    query = """
    SELECT distinct business_id FROM review
    where stars >= 3.5 
    and user_id = "{}"
    """.format(u_id)
    sqlContext = SQLContext(spark.sparkContext)
    usr_rev_bus = sqlContext.sql(query)
    
    # from these get sample of 5 restaurants
    usr_rev_bus = usr_rev_bus.sample(False, 0.5).limit(5)

    usr_rev_bus_det = getBusinessDetails(usr_rev_bus)
    
    # show the sample details
    # print('\nBusinesses previously reviewed by user:')
    usr_rev_bus_det.select(['business_id', 'business_name', 'categories']).show(truncate = False)

    bus_list = [i.business_id for i in usr_rev_bus.collect()]

    # get restaurants similar to the sample
    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_limit)

    # filter out those have been reviewd before by the user
    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    j = s.join(r, col("s.business_id") == col("r.business_id"), 'left_outer') \
         .where(col("r.business_id").isNull()) \
         .select([col('s.business_id'),col('s.score')])

    a = j.orderBy("score", ascending = False).limit(sim_bus_limit)

    return getBusinessDetails(a)

## Collaborative Filtering

In [19]:
collab_model_path = "ALS/"
sqlContext = SQLContext(spark.sparkContext)
user_newid_df = sqlContext.createDataFrame(user_df.rdd.map(lambda x: x[0]).zipWithIndex(), \
        StructType([StructField("user_id", StringType(), True),StructField("userId", IntegerType(), True)]))

a = user_df.alias("a")
b = user_newid_df.alias("b")
    
user_new_df = a.join(b, col("a.user_id") == col("b.user_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.userId')])

# create a new businessId column (integer)
business_newid_df = sqlContext.createDataFrame(business_df.rdd.map(lambda x: x[0]).zipWithIndex(), \
        StructType([StructField("business_id", StringType(), True),StructField("businessId", IntegerType(), True)]))

# add the new businessId column the business dataframe
a = business_df.alias("a")
b = business_newid_df.alias("b")
    
business_new_df = a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.businessId')])

# # map new userId and businessId in the review dataframe
# review_df = review_df.select('user_id', 'business_id', 'stars')

# # map the userId
# a = review_df.alias("a")
# b = user_newid_df.alias("b")
# review_userId_df = a.join(b, col("a.user_id") == col("b.user_id"), 'inner').select([col('a.'+xx) for xx in a.columns] + [col('b.userId')])

# # map the businessId
# a = review_userId_df.alias("a")
# b = business_df.alias("b")
# review_userId_businessId_df = a.join(b, col("a.business_id") == col("b.business_id"), 'inner').select([col('a.'+xx) for xx in a.columns])

# # Get the rdd from the business_newid_df
# bn_rdd = business_newid_df.rdd.map(lambda row: (row[0], row[1])).collectAsMap()

# # Create Broadcast variable from bn_rdd
# bn_brodcast = spark.sparkContext.broadcast(bn_rdd)

# # Join the Review_user_business_df with the business_df using broadcast
# rub_rdd = review_userId_businessId_df.rdd.map(lambda row: (row[0],row[1],row[2],row[3],bn_brodcast.value[row[1]]))

# # Create Dataframe from the rdd
# review_userId_businessId_df = sqlContext.createDataFrame(rub_rdd,
#                                     StructType([StructField("user_id", StringType(), True),
#                                                 StructField("business_id", StringType(), True),
#                                                 StructField("stars", DoubleType(), True),
#                                                 StructField("userId", IntegerType(), True),
#                                                 StructField("businessId", IntegerType(), True)]))

# # create the rating dataframe required by the ALS model
# rating_df = review_userId_businessId_df.select('userId', 'businessId', review_userId_businessId_df.stars.cast('float').alias('rating'))

# load a new instance of the saved ALS model
alsn_model = ALSModel.load(collab_model_path + 'alsb')

# generate top 10 business recommendations for each user
userRecoms = alsn_model.recommendForAllUsers(10)

# add the column user_id, cache the recommendaton dataframe and show recommedations sample

a = userRecoms.alias("a")
b = user_newid_df.alias("b")
all_userRecoms = a.join(b, col("a.userId") == col("b.userId"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.user_id')])

def getCollabRecom(u_id):
    
    userFlatRec =  sqlContext.createDataFrame(all_userRecoms.filter(col('user_id') == u_id).rdd.flatMap(lambda p: p[1]))

    a = userFlatRec.alias("a")
    b = business_new_df.alias("b")
    
    return a.join(b, col("a.businessId") == col("b.businessId"), 'inner') \
             .select([col('b.business_id'), col('a.rating'), col('b.business_name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')]) \
             .orderBy("rating", ascending = False)
    

## Friends Recomendations

In [20]:
def getFriendRecoms(u_id, sim_bus_limit=10):
    
    query = """
    select business_id, count(*) as 4_5_stars_count 
    from review
    where user_id in
        (select f.friends from friends f
        inner join user u on f.friends = u.user_id
        where f.user_id = "{}") 
    and stars >= 4 
    and business_id not in (select business_id from review where user_id = "{}")
    group by business_id
    order by count(*) desc
    """.format(u_id, u_id)

    friend_recoms_df = sqlContext.sql(query)
    
    # friend_recoms_df = friend_recoms_df.sample(False, 0.5).limit(sim_bus_limit)

    return getBusinessDetails(friend_recoms_df)

In [21]:
u_id = 'Wc5L6iuvSNF5WGBlqIO8nw'

In [22]:
content_recom_df = getContentRecoms(u_id)
content_recom_df.toPandas().head(20)

+----------------------+----------------------------+--------------------------------------------------------------------------------------------------+
|business_id           |business_name               |categories                                                                                        |
+----------------------+----------------------------+--------------------------------------------------------------------------------------------------+
|9jYnZymuaW-XpMIS75YxgQ|The Beaver                  |Bars, American (New), Canadian (New), Cafes, Restaurants, Diners, Nightlife, Gay Bars             |
|J4_q5iMukg-UnnLnT6ZwAA|Northern Belle              |Nightlife, Food, Bars, Restaurants, Coffee & Tea, Cafes, Cocktail Bars                            |
|F_oPMHJrH42R67xp5eKtQA|Yummy Korean Food Restaurant|Korean, Restaurants                                                                               |
|xwmwTpzw9XSwK-UT5Ka3HQ|Bellwoods Brewery           |Canadian (New), Breweries, Fo

Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,uChTCA6MsLAciDRklpO-Fw,0.976159,Makkal Chon,"Restaurants, Greek, Korean",4.0,275,43.744641,-79.296277
1,JtrBa5-T6QLXv3y5_ACTkg,0.975671,Doo Roo Ae,"Restaurants, Korean",4.0,52,43.664074,-79.415611
2,ZCrK07xb6w5Vi1vathV0NQ,0.978606,Bapbo Korean Restaurant,"Korean, Japanese, Restaurants",3.0,124,43.655573,-79.384943
3,hWaptsAiWkxqCX9SFfJDkg,0.978762,BIWON,"Restaurants, Korean",3.0,72,43.652998,-79.396783
4,gd-fV04gMKMz724XM32xQA,0.981712,Bandit Brewery,"Brewpubs, Food, Restaurants, Breakfast & Brunc...",3.5,82,43.652485,-79.449621
5,rO3lZpVSoRMhhd0AEJBjlg,0.98875,Sunrise House,"Restaurants, Korean",4.0,182,43.664068,-79.415668
6,d3olNIBeuH4Eeqc3mkX8LA,0.975217,Seoul Restaurant,"Restaurants, Asian Fusion, Korean",4.0,70,43.664416,-79.414042
7,rhyjGfqYlCJoi8Zeulg6QA,0.987929,Kimchi Korea House,"Restaurants, Korean",3.5,231,43.655256,-79.385475
8,_MA98TVmvVIy-XdI0poc7w,0.982398,Mom's Korean Food,"Korean, Restaurants",3.0,89,43.664706,-79.413836
9,SNkkuchbVtUzCwyENcai_g,0.980332,Danji,"Japanese, Chinese, Korean, Restaurants",3.5,59,43.6653,-79.384899


In [23]:
collab_recom_df = getCollabRecom(u_id)
collab_recom_df.toPandas()

Unnamed: 0,business_id,rating,business_name,categories,stars,review_count,latitude,longitude
0,otsjAjxf0PNQ99xcmuj_LA,5.512009,Sushi Making For the Soul,"Japanese, Education, Restaurants, Local Flavor",4.5,3,43.656233,-79.392319
1,IM6pHgP2ewa6xhnDk6s2_g,5.438185,Mikaku Izakaya,"Japanese, Restaurants",4.5,3,43.793327,-79.419321
2,v_OLzcpFA7vgVp30vxv2uQ,5.395602,Silver Spoon,"Restaurants, American (New), Canadian (New)",5.0,4,43.650883,-79.450832
3,PT6tAoQxtCqsGc7r4nEXLQ,5.320897,Trinity Square Cafe,"Restaurants, Cafes",5.0,6,43.654877,-79.38147
4,fCZU04T_8lUdXX2aBYisEA,5.308662,Freshii,"Breakfast & Brunch, Specialty Food, Health Mar...",4.5,3,43.659574,-79.381027
5,Hn-bPW6z63BjA4XBAFsVgw,5.27386,Sugar Miracles,"Restaurants, Patisserie/Cake Shop, Chocolatier...",5.0,4,43.716805,-79.400696
6,bumAFxitMRHKAxZMijvUYg,5.26764,Cuisine of India,"Caterers, Event Planning & Services, Restauran...",5.0,3,43.782522,-79.474959
7,STukQhp01a__zTsHMBiyow,5.254052,East of York Gourmet Food,"Restaurants, Food, Indian, Cafes, Vegetarian, ...",5.0,23,43.685012,-79.316586
8,LIjlU7K-0SPXPtYFQiXamQ,5.217399,Magic Oven,"Food Stands, Restaurants, Sandwiches, Indian",5.0,3,43.652294,-79.405521
9,sGl_RLXLtSPx5kzMUFz11Q,5.179796,Alimentari Italian Grocery,"Food, Italian, Specialty Food, Restaurants",5.0,5,43.648805,-79.449725


In [24]:
friends_recom_df = getFriendRecoms(u_id)
friends_recom_df.toPandas()

Unnamed: 0,business_id,4_5_stars_count,business_name,categories,stars,review_count,latitude,longitude
0,r_BrIgzYcwo1NAuG9dLbpg,33,Pai Northern Thai Kitchen,"Restaurants, Food, Thai, Ethnic Food, Specialt...",4.5,2121,43.647866,-79.388642
1,RtUvSWO_UZ8V3Wpj0n077w,31,KINKA IZAKAYA ORIGINAL,"Bars, Restaurants, Japanese, Tapas/Small Plate...",4.0,1397,43.660430,-79.378927
2,nT16Y6AsJDwEpUB1JICKzg,28,St Lawrence Market,"Farmers Market, Restaurants, Grocery, Sandwich...",4.5,496,43.648730,-79.371541
3,N93EYZy9R0sdlEvubu94ig,28,Banh Mi Boys,"Sandwiches, Restaurants, Asian Fusion, Food, D...",4.5,1045,43.648827,-79.396970
4,7oEKIG7d1ttPRejppZ3WIA,20,Lady Marmalade,"Italian, Restaurants, Mexican, Breakfast & Brunch",4.0,456,43.660517,-79.343112
5,iGEvDk6hsizigmXhDKs2Vg,20,Seven Lives Tacos Y Mariscos,"Seafood, Restaurants, Mexican",4.5,1152,43.654341,-79.400480
6,aLcFhMe6DDJ430zelCpd2A,20,Khao San Road,"Thai, Restaurants",4.0,1410,43.646411,-79.393480
7,f5O7v_X_jCg2itqacRfxhg,19,Sansotei Ramen,"Restaurants, Ramen, Japanese, Noodles",4.0,794,43.655004,-79.386473
8,h_4dPV9M9aYaBliH1Eoeeg,18,Wvrst,"Nightlife, Pubs, Bars, Restaurants, Canadian (...",4.0,711,43.644186,-79.401015
9,dc3uoAmNo5STqKV6mlD_aA,18,The Drake Hotel,"Hotels, Restaurants, Event Planning & Services...",3.5,354,43.643205,-79.424638


In [25]:
df = content_recom_df.limit(4).union(collab_recom_df.limit(4)).union(friends_recom_df.limit(2))

In [26]:
df.toPandas()

Unnamed: 0,business_id,score,business_name,categories,stars,review_count,latitude,longitude
0,uChTCA6MsLAciDRklpO-Fw,0.976159,Makkal Chon,"Restaurants, Greek, Korean",4.0,275,43.744641,-79.296277
1,JtrBa5-T6QLXv3y5_ACTkg,0.975671,Doo Roo Ae,"Restaurants, Korean",4.0,52,43.664074,-79.415611
2,ZCrK07xb6w5Vi1vathV0NQ,0.978606,Bapbo Korean Restaurant,"Korean, Japanese, Restaurants",3.0,124,43.655573,-79.384943
3,hWaptsAiWkxqCX9SFfJDkg,0.978762,BIWON,"Restaurants, Korean",3.0,72,43.652998,-79.396783
4,otsjAjxf0PNQ99xcmuj_LA,5.512009,Sushi Making For the Soul,"Japanese, Education, Restaurants, Local Flavor",4.5,3,43.656233,-79.392319
5,IM6pHgP2ewa6xhnDk6s2_g,5.438185,Mikaku Izakaya,"Japanese, Restaurants",4.5,3,43.793327,-79.419321
6,v_OLzcpFA7vgVp30vxv2uQ,5.395602,Silver Spoon,"Restaurants, American (New), Canadian (New)",5.0,4,43.650883,-79.450832
7,PT6tAoQxtCqsGc7r4nEXLQ,5.320897,Trinity Square Cafe,"Restaurants, Cafes",5.0,6,43.654877,-79.38147
8,r_BrIgzYcwo1NAuG9dLbpg,33.0,Pai Northern Thai Kitchen,"Restaurants, Food, Thai, Ethnic Food, Specialt...",4.5,2121,43.647866,-79.388642
9,RtUvSWO_UZ8V3Wpj0n077w,31.0,KINKA IZAKAYA ORIGINAL,"Bars, Restaurants, Japanese, Tapas/Small Plate...",4.0,1397,43.66043,-79.378927


In [27]:
# content_recom_df.toPandas()
# collab_recom_df.toPandas()
# friends_recom_df.toPandas()