In [1]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

number_cores = 4
memory_gb = 8
conf = (pyspark.SparkConf().setMaster('local[{}]'.
                                      format(number_cores)).
        set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [2]:
yelp = sc.textFile("../CSC467/yelp_academic_dataset_user.json.gz")

In [4]:
yelp.take(1)

['{"user_id":"ntlvfPzc8eglqvk92iDIAw","name":"Rafael","review_count":553,"yelping_since":"2007-07-06 03:27:11","useful":628,"funny":225,"cool":227,"elite":"","friends":"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vm

The very first step with this dataset is figuring out how to handle json files. A quick google search shows us that the simplest way of handling them is to import json and then use the functions associated with it.

In [5]:
s = yelp.take(1)[0]
import json
json.loads(s)

{'user_id': 'ntlvfPzc8eglqvk92iDIAw',
 'name': 'Rafael',
 'review_count': 553,
 'yelping_since': '2007-07-06 03:27:11',
 'useful': 628,
 'funny': 225,
 'cool': 227,
 'elite': '',
 'friends': 'oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSL

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('demo').master("local").enableHiveSupport().getOrCreate()
df = spark.read.json("../CSC467/yelp_academic_dataset_user.json.gz")

With the data in a manageable format, I want to figure out how many unique users there are. I could just do a count function on the dataset, but instead, to ensure we are being as accurate as possible, I will map out every single user ID. Then I will run a distinct function to only show unique IDs. Lastly, I will run count on the final dataset.

In [7]:
yelp_json = yelp.map(lambda x: json.loads(x))
yelp_json.map(lambda x: x['user_id']).distinct().count()

1968703

As seen here, there are 1,968,703 unique users in this dataset.

My next step is to grab only the information needed for the next questions in the assignment. The important info is: user_id, review_count, and average_stars. I added in name as well to make it more visual.

In [8]:
reviews = yelp_json.map(lambda x: (x['user_id'], x['name'], x['review_count'], x['average_stars']))
reviews.take(5)

[('ntlvfPzc8eglqvk92iDIAw', 'Rafael', 553, 3.57),
 ('FOBRPlBHa3WPHFB5qYDlVg', 'Michelle', 564, 3.84),
 ('zZUnPeh2hEp0WydbAZEOOg', 'Martin', 60, 3.44),
 ('QaELAmRcDc5TfJEylaaP8g', 'John', 206, 3.08),
 ('xvu8G900tezTzbbfqmTKvA', 'Anne', 485, 4.37)]

What I want to do now is find out which users have the highest number of reviews. To accomplish this, I will run a takeOrdered function for the top 25 users. The function will be reading index of 2, which is "review_count". To help us accomplish the next task (finding out the average ratings for the most active users), I also assigned the takeOrdered function to a variable 'hiRev'.

In [9]:
hiRev = reviews.takeOrdered(25, key = lambda x: -x[2])
hiRev

[('8k3aO-mPeyhbR5HUucA5aA', 'Victor', 14455, 3.28),
 ('RtGqdDBvvBCjcu5dUqwfzA', 'Shila', 12772, 3.87),
 ('hWDybu_KvYLSdEFzGrniTw', 'Bruce', 12487, 3.64),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 'Fox', 11112, 3.8),
 ('P5bUL3Engv-2z6kKohB6qQ', 'Kim', 9875, 3.8),
 ('8RcEwGrFIgkt9WQ35E6SnQ', 'George', 7745, 3.49),
 ('nmdkHL2JKFx55T3nq5VziA', 'Nijole', 7626, 3.71),
 ('Xwnf20FKuikiHcSpcEbpKQ', 'Kenneth', 6762, 3.32),
 ('CxDOIDnH8gp9KXzpBHJYXw', 'Jennifer', 6633, 3.33),
 ('HFECrzYDpgbS5EmTBtj2zQ', 'Eric', 5500, 3.93),
 ('kS1MQHYwIfD0462PE61IBw', 'Rob', 5156, 3.82),
 ('WG3w_73scm_JUWJ_3Lgn0Q', 'Jack', 5013, 3.87),
 ('m07sy7eLtOjVdZ8oN9JKag', 'Ed', 4913, 3.66),
 ('Xj0O2l0bp633ebmG468aZw', 'Andrew', 4851, 3.73),
 ('bQCHF5rn5lMI9c5kEwCaNA', 'Vincent', 4845, 3.77),
 ('XYSDrIef7g4Gmp3lNFVO6A', 'Neal', 4828, 3.96),
 ('IucvvxdQXXhjQ4z6Or6Nrw', 'Sunil', 4784, 3.46),
 ('wZPizeBxMAyOSl0M0zuCjg', 'Jess', 4697, 3.63),
 ('U4INQZOPSUaj8hMjLlZ3KA', 'Michael', 4631, 3.9),
 ('bLbSNkLggFnqwNNzzq-Ijw', 'Stefany', 4627, 3.39

In [10]:
#Function to find average rating among top 25 reviewers
def avgRate(x):
    count = 0
    total = 0
    for i in x:
        count += i[3]
        total += 1
    return count/total

avgRate(hiRev)

3.6875999999999998

To find the average rating, I created a function which basically performs a mean function on the list. When run, it shows us that the average rating among the top 25 reviewers, based upon their review count, is around 3.6876. It is worth noting that the average may actually be slightly higher (or lower) if we take out some of the outliers. Additionally, some of these "top" reviewers may be bots as it isn't likely that a person has enough time to visit 14,000 different places and leave reviews on each of them.

The next thing I wanted to do was find two "interesting" user attributes. The first attribute that caught my attention was the compliments received overall. The reason this was interesting to me is that other people found the review so helpful that they wanted to personally thank, or compliment, the reviewer (must have been quite the review). As such, I thought it would be interesting to see the top twenty-five users with the most overall compliments. 

In [11]:
#compliments = yelp_json.map(lambda x: (x['user_id'], x['name'], x['compliment_hot'] + x['compliment_more'] + 
#                                       x['compliment_profile'] + x['compliment_cute'] + x['compliment_list'] +
#                                       x['compliment_note'] + x['compliment_plain'] + x['compliment_cool'] + 
#                                       x['compliment_funny'] + x['compliment_writer'] + x['compliment_photos']))

#Update: added new column, 'friends', to help in later statistical analysis
compliments = yelp_json.map(lambda x: (x['user_id'], x['name'], x['compliment_hot'] + x['compliment_more'] + 
                                       x['compliment_profile'] + x['compliment_cute'] + x['compliment_list'] +
                                       x['compliment_note'] + x['compliment_plain'] + x['compliment_cool'] + 
                                       x['compliment_funny'] + x['compliment_writer'] + x['compliment_photos'],len((x['friends']).split())))
compliments.takeOrdered(25, key = lambda x: -x[2])

[('Tqm7Wu7IBJ1td3Ab5ZpUhw', 'Brian', 277196, 925),
 ('JjXuiru1_ONzDkYVrHN0aw', 'Richard', 263889, 7108),
 ('ax7SnXOTIpatbsmqHLqVow', 'Rohlin', 152814, 1794),
 ('eKUGKQRE-Ywi5dY55_zChg', 'Cherylynn', 147824, 5730),
 ('--2vR0DIsmQ6WfcSzKWigw', 'Harald', 127430, 3883),
 ('h4oOQdnfjpEHbygEJDsFbg', 'John', 126417, 1089),
 ('NOUfyJW-BAo_-Cbfo8edww', 'Lolia', 118563, 3001),
 ('JRAy4P4op3PCISZaMRA9_w', 'Carissa', 87765, 584),
 ('UXbCcmkYGl3DH_Py5UOtbQ', 'Toni', 77158, 297),
 ('Ggx8iUdJ7lsQSqXRuclXtg', 'Nadine', 76798, 1443),
 ('Z88N6qly3Dp5C06XbZ9IMQ', 'Janice', 69240, 1540),
 ('W7DHyQlY_kXls2iXt-_2Ag', 'Maggie', 67814, 4545),
 ('AbMjnKOwg736fcIu8apuyQ', 'Michael', 65123, 5842),
 ('59GSwMflQFllOAWvw0F5mw', 'Dana', 64447, 1400),
 ('_aSs5dSAabuwnXUGBPzASw', 'Genevieve', 64165, 2160),
 ('0juzbrgcLHqobdK-OZAyiw', 'Daniel', 63526, 1709),
 ('HH7iiWvBqV-20lA7JlSRWQ', 'Lyla', 62991, 820),
 ('w-w-k-QXosIKQ8HQVwU6IQ', 'Anthony', 60223, 5987),
 ('13f_vtUZEmlzweL91bmVng', 'Michelle', 59988, 1939),
 ('OapL

As seen above, the user with the most compliments received almost 280,000 compliments! One can only imagine how amazing their reviews must be. The second user attribute I found quite interesting was the number of friends each user has. I didn't even know you could have friends on Yelp to begin with! I wanted to run an analysis on whether or not the the users with the highest number of friends coincides with the users with the most number of compliments. My hypothesis was that if a user has a lot of friends, they should also receive a lot of compliments, many from their own friends.

In [12]:

#Function to count the amount of friends each user has
#def countFriends (x) :
#    count = 0
#    array = []
#    for i in x:
#        for j in i[2]:
#            count += 1
#        array.append((i[0],i[1],count))
#        count = 0
#    return array
#


#More effective way to count friends for each user
friends = yelp_json.map(lambda x: (x['user_id'], x['name'], len((x['friends']).split())))
 
friends.take(5)


[('ntlvfPzc8eglqvk92iDIAw', 'Rafael', 45),
 ('FOBRPlBHa3WPHFB5qYDlVg', 'Michelle', 213),
 ('zZUnPeh2hEp0WydbAZEOOg', 'Martin', 35),
 ('QaELAmRcDc5TfJEylaaP8g', 'John', 173),
 ('xvu8G900tezTzbbfqmTKvA', 'Anne', 895)]

In [13]:
#totalFriends = friends.map(lambda x: countFriends(x))
#totalFriends.takeOrdered(25, key = lambda x: -x[2])

friends.takeOrdered(25, key = lambda x: -x[2])

[('qVc8ODYU5SZjKXVBgXdI7w', 'Walker', 14995),
 ('iLjMdZi0Tm7DQxX1C1_2dg', 'Ruggy', 12320),
 ('ZIOCmdFaMIF56FR-nWr_2A', 'Randy', 10431),
 ('mV4lknblF-zOKSF8nlGqDA', 'Scott', 9685),
 ('Oi1qbcz2m2SnwUeztGYcnQ', 'Steven', 8964),
 ('F_5_UNX-wrAFCXuAkBZRDw', 'Rodney', 8716),
 ('IU86PZPgTDCFwJEuAg2j7g', 'Danny', 8672),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 'Abby', 8590),
 ('hizGc5W1tBHPghM5YKCAtg', 'Katie', 8287),
 ('c-Dja5bexzEWBufNsHfRrQ', 'Vince', 8207),
 ('fgwI3rYHOv1ipfVfCSx7pg', 'Emi', 7989),
 ('wEE-YMx5pmSuagLtNxMPKA', 'Stephanie', 7935),
 ('MeDuKsZcnI3IU2g7OlV-hQ', 'Frank', 7780),
 ('MX_sGTpLIQ0EOcvK73nRyA', 'Nam', 7703),
 ('UsXqCXRZwSCSw0AT7y1uBg', 'Candice', 7413),
 ('Ve0LUwcrzxL7w0RYgY4Aaw', 'Katy', 7401),
 ('5MCBLBxr10NLUKZ4AboAMg', 'Colleen', 7169),
 ('JjXuiru1_ONzDkYVrHN0aw', 'Richard', 7108),
 ('VHdY6oG2JPVNjihWhOooAQ', 'Jessica', 7088),
 ('xsT4KZTu_KnOVavtuXn4RA', 'Rodney', 7010),
 ('IDVFG1pNSHIHoVuoLuZpcQ', 'Andy', 6955),
 ('YttDgOC9AlM4HcAlDsbB2A', 'Phil', 6834),
 ('3zxy3LVBV3ttxoYbY4

__OLD INFO:__ When comparing the top twenty-five most-friended users against the top twenty-five most-complimented users, the top twenty-five for each category is surprisingly different! While users with many compliments may have a lot of friends, the amount of friends doesn't necessarily coincide with the amount of compliments received. One possible reason for this is that a user may have many friends, but they don't post many reviews. If this is the case, then it would make sense as to why they don't have many compliments. To visualize this, I turned the top 25 of each category into individual sets and then I compared those sets. As seen below, there was only one user in both lists (Richard, user ID: 'JjXuiru1_ONzDkYVrHN0aw'); therefore, it is safe to assume that they do not coincide.

In [14]:
#Old Info
c = compliments.takeOrdered(25, key = lambda x: -x[2])
f = friends.takeOrdered(25, key = lambda x: -x[2])

In [15]:
#Old Info
#Function to compare list 'c' with list 'f'.

def compareCF(x, y):
    array = []
    for i in x:
        for j in y:
            if i[0] == j[0]:
                array.append(i[0:2])
    return array

compareCF(c, f)


[('JjXuiru1_ONzDkYVrHN0aw', 'Richard')]

__NEW INFO:__ After playing around with Jupyter notebook for a bit, I found a rather interesting way to run statistical analysis on both columns going beyond just the top 25 for each category. My new analysis contained a comparison between Compliments and Friends, and Compliments and Fans. 

In [16]:
#Dataframe for Compliments and Friends

df = compliments.toDF()
compFriends = df.selectExpr("_3 as Compliments", "_4 as Friends")
compFriends.select("Compliments", "Friends").summary().show()

+-------+------------------+-----------------+
|summary|       Compliments|          Friends|
+-------+------------------+-----------------+
|  count|           1968703|          1968703|
|   mean|14.733649514426503|52.04515409383742|
| stddev| 503.8868828574259|145.2836742064494|
|    min|                 0|                1|
|    25%|                 0|                1|
|    50%|                 0|                2|
|    75%|                 1|               47|
|    max|            277196|            14995|
+-------+------------------+-----------------+



In [17]:
from pyspark.ml.stat import Correlation
compFriends.stat.corr('Compliments','Friends')

0.2698637220656327

In [18]:
complimentsAndFans = yelp_json.map(lambda x: (x['user_id'], x['name'], x['compliment_hot'] + x['compliment_more'] + 
                                       x['compliment_profile'] + x['compliment_cute'] + x['compliment_list'] +
                                       x['compliment_note'] + x['compliment_plain'] + x['compliment_cool'] + 
                                       x['compliment_funny'] + x['compliment_writer'] + x['compliment_photos'],x['fans']))

In [19]:
#Dataframe for Compliments and Fans

df = complimentsAndFans.toDF()
data = df.selectExpr("_3 as Compliments", "_4 as Fans")
data.select("Compliments", "Fans").summary().show()

+-------+------------------+------------------+
|summary|       Compliments|              Fans|
+-------+------------------+------------------+
|  count|           1968703|           1968703|
|   mean|14.733649514426503|1.4588239058913406|
| stddev| 503.8868828574259|16.675211902790608|
|    min|                 0|                 0|
|    25%|                 0|                 0|
|    50%|                 0|                 0|
|    75%|                 1|                 0|
|    max|            277196|             11568|
+-------+------------------+------------------+



In [20]:
from pyspark.ml.stat import Correlation
data.stat.corr('Compliments','Fans')

0.4814993480267218

This new analysis is a bit confusing at first glance. There is a rather big jump between the seventy-fifth percentile to the max on both dataframes. What we really want to focus on is the correlation percentage for (Compliments, Friends) and then (Compliments, Fans). Compliments to Friends has a 0.2699 correlation (weak association) while Compliments to Fans has a 0.4815 (moderate association) correlation. Based on this information, we can assume that fans have a bigger impact on total compliments received as opposed to the number of friends. This is expected as fans can be anyone, the Yelp user doesn't need to fan someone back to have them follow. Friends on the other hand require more work on the part of both users.

__END NEW INFO__

Now to find the most influential users, we have a few different factors we could look at. The first one is obviously the number of reviews, but I believe that isn't necessarily a telling factor as it may be artifically inflated. The second one is the amount of compliments receieved. The issue with this one is that some reviews may have been left at highly visited places. As such, certain places may receive much more attention and reactions on yelp as opposed to more obscure places where people don't frequent yelp for as often. In the end, I think the best factor for deciding on whether or not a user is influential would be the number of fans they have. A fan essentially means that these people trust the reviewers opinion so much so that they follow all reviews the person posts. Now, to find the number of fans, we can just run a simple map function on the overall dataset, taking an ordered function afterwards. I have included the user_ids, but also the names so it can be a little more readable. 

__NEW INFO:__ Upon going back to my work, I decided it might also be interesting if we also show if fans correlates with Elite years in any way.

In [21]:
fansOverall = yelp_json.map(lambda x: (x['user_id'], x['name'], x['fans'], len(x['elite'].split(","))))
fansOverall.takeOrdered(10, key = lambda x: -x[2])

[('37cpUoM8hlkSQfReIEBd-Q', 'Mike', 11568, 7),
 ('hizGc5W1tBHPghM5YKCAtg', 'Katie', 3315, 10),
 ('eKUGKQRE-Ywi5dY55_zChg', 'Cherylynn', 2916, 9),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 'Fox', 2718, 5),
 ('j14WgRoU_-2ZE1aw1dXrJg', 'Daniel', 2634, 10),
 ('iLjMdZi0Tm7DQxX1C1_2dg', 'Ruggy', 2516, 11),
 ('JjXuiru1_ONzDkYVrHN0aw', 'Richard', 2316, 8),
 ('ITa3vh5ERI90G_WP4SmGUQ', 'Peter', 2280, 11),
 ('UsXqCXRZwSCSw0AT7y1uBg', 'Candice', 2263, 10),
 ('VHdY6oG2JPVNjihWhOooAQ', 'Jessica', 2140, 13)]

As we can see, Mike has the highest number of fans, 11,568. One thing I found really interesting was user Fox has almost five times the amount of reviews as user Cherlynn, but Fox has fewer fans overall. The ratio of fans to number of reviews might indicate that the quality of a user's review may be more beneficial to others as opposed to the quantity, or amount, of reviews they post.

In [22]:
df = fansOverall.toDF()
data = df.selectExpr("_3 as Fans", "_4 as Elite")
data.select("Fans", "Elite").summary().show()

+-------+------------------+-----------------+
|summary|              Fans|            Elite|
+-------+------------------+-----------------+
|  count|           1968703|          1968703|
|   mean|1.4588239058913406|1.079550851499693|
| stddev|16.675211902790608|  0.5769988528825|
|    min|                 0|                1|
|    25%|                 0|                1|
|    50%|                 0|                1|
|    75%|                 0|                1|
|    max|             11568|               13|
+-------+------------------+-----------------+



In [23]:
from pyspark.ml.stat import Correlation
data.stat.corr('Fans','Elite')

0.4070426817740818

It appears as if there is a moderate correlation between the number of fans a user has and the amount of years they received Elite status. This is to be expected as having an Elite status most likely makes you more appealing to the average user and they are more likely to trust your opinion. Granted, Elite status is not given by the community; rather, it is given by Yelp so being Elite is rather subjective.