In [1]:
!ls -al datasets

total 5396028
drwxrwxr-x 2 ichwan ichwan       4096 May  7 23:04 .
drwxrwxr-x 6 ichwan ichwan       4096 May  7 23:41 ..
-rw-rw-r-- 1 ichwan ichwan      99875 Feb  6 23:57 Dataset_Challenge_Dataset_Agreement.pdf
-rw-rw-r-- 1 ichwan ichwan   41377121 Feb  7 00:02 yelp_business_attributes.csv
-rw-rw-r-- 1 ichwan ichwan   31760674 Feb  6 23:57 yelp_business.csv
-rw-rw-r-- 1 ichwan ichwan   13866351 Feb  6 23:59 yelp_business_hours.csv
-rw-rw-r-- 1 ichwan ichwan  135964892 Feb  6 23:57 yelp_checkin.csv
-rw-rw-r-- 1 ichwan ichwan 3791120545 Feb  7 00:02 yelp_review.csv
-rw-rw-r-- 1 ichwan ichwan      45957 May  7 23:04 yelp_review_subset.csv
-rw-rw-r-- 1 ichwan ichwan  148085910 Feb  7 00:02 yelp_tip.csv
-rw-rw-r-- 1 ichwan ichwan 1363176944 Feb  6 23:59 yelp_user.csv


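For testing purposes, I use _only_ the first 514 lines of `yelp_review.csv`. Because review text contains embedded newlines, 514 physical lines correspond to fewer than 514 review records. The subset was created with the following command in a terminal or Git Bash: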

    head -n 514 yelp_review.csv > yelp_review_subset.csv

In [2]:
# Location of the review subset (the first 514 lines of yelp_review.csv) used for testing
REVIEW_PATH = "./datasets/yelp_review_subset.csv"

In [3]:
# Create an RDD from the yelp review dataset
def yield_record(filename):
    """Lazily yield every data row of a CSV file as a list of strings.

    The first row is treated as the header and skipped. Quoted fields may
    span several physical lines (review text contains embedded newlines);
    ``csv.reader`` reassembles them into a single field.

    Args:
        filename: Path of the CSV file to read.

    Yields:
        list: The field values of one record, in column order.
    """
    import csv
    import sys

    # The csv module needs a binary handle on Python 2, but a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        fi = open(filename, 'rb')
    else:
        # assumes the dataset is UTF-8 encoded -- TODO confirm
        fi = open(filename, 'r', newline='', encoding='utf-8')

    with fi:
        reader = csv.reader(fi)
        # Skip the header row. The default value keeps an empty file from
        # raising StopIteration inside the generator (a RuntimeError on
        # Python 3.7+); `reader.next()` only exists on Python 2.
        next(reader, None)
        for row in reader:
            yield row
            
# Parse the review file, hand the rows to Spark as a cached RDD, and peek at the first record
review_records = yield_record(REVIEW_PATH)
review_rdd = sc.parallelize(review_records).cache()
review_rdd.take(1)

[['vkVSCC7xljjrAI4UGfnKEQ',
  'bv2nCi5Qv5vroFiqKGopiw',
  'AEx2SYEUJmTxVVB18LlCwA',
  '5',
  '2016-05-28',
  "Super simple place but amazing nonetheless. It's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. \n\nStaff was very helpful and friendly.",
  '0',
  '0',
  '0']]

In [4]:
# Create the schema necessary for the creation of a DataFrame
from pyspark.sql.types import *
# Every column is declared as a string at first, since the parsed CSV rows
# contain text only; the three ID columns are the only non-nullable ones.
schema = (
    StructType()
    .add("review_id", StringType(), False)
    .add("user_id", StringType(), False)
    .add("business_id", StringType(), False)
    .add("stars", StringType(), True)
    .add("date", StringType(), True)
    .add("text", StringType(), True)
    .add("useful", StringType(), True)
    .add("funny", StringType(), True)
    .add("cool", StringType(), True)
)

In [5]:
# Build the DataFrame from the parsed rows, applying the all-string schema
review_df = spark.createDataFrame(data=review_rdd, schema=schema)
review_df.printSchema()

root
 |-- review_id: string (nullable = false)
 |-- user_id: string (nullable = false)
 |-- business_id: string (nullable = false)
 |-- stars: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- cool: string (nullable = true)



In [6]:
# Convert the string columns to their proper types: ratings and vote counts
# become integers, `date` becomes a date; None means "keep as string".
target_types = [
    ('review_id', None),
    ('user_id', None),
    ('business_id', None),
    ('stars', IntegerType()),
    ('date', DateType()),
    ('text', None),
    ('useful', IntegerType()),
    ('funny', IntegerType()),
    ('cool', IntegerType()),
]

typed_columns = []
for column_name, target_type in target_types:
    column = review_df[column_name]
    if target_type is not None:
        column = column.cast(target_type)
    typed_columns.append(column)

review_df = review_df.select(*typed_columns).cache()

review_df.take(5)

[Row(review_id=u'vkVSCC7xljjrAI4UGfnKEQ', user_id=u'bv2nCi5Qv5vroFiqKGopiw', business_id=u'AEx2SYEUJmTxVVB18LlCwA', stars=5, date=datetime.date(2016, 5, 28), text=u"Super simple place but amazing nonetheless. It's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. \n\nStaff was very helpful and friendly.", useful=0, funny=0, cool=0),
 Row(review_id=u'n6QzIUObkYshz4dz2QRJTw', user_id=u'bv2nCi5Qv5vroFiqKGopiw', business_id=u'VR6GpWIda3SfvPC-lg9H3w', stars=5, date=datetime.date(2016, 5, 28), text=u"Small unassuming place that changes their menu every so often. Cool decor and vibe inside their 30 seat restaurant. Call for a reservation. \n\nWe had their beef tartar and pork belly to start and a salmon dish and lamb meal for mains. Everything was incredible! I could go on at length about how all the listed ingredients really make their dishes amazing but honestly you just need to go. \n\nA bit outside of downtown mon

In [7]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns
numeric_summary = review_df.describe('stars', 'useful', 'funny', 'cool')
numeric_summary.show()

+-------+------------------+------------------+-------------------+-------------------+
|summary|             stars|            useful|              funny|               cool|
+-------+------------------+------------------+-------------------+-------------------+
|  count|                77|                77|                 77|                 77|
|   mean|3.3376623376623376|1.2727272727272727|0.19480519480519481|0.35064935064935066|
| stddev|1.1877709066100248|2.2747837585857162| 0.7078313997690232| 0.7392427449447984|
|    min|                 1|                 0|                  0|                  0|
|    max|                 5|                11|                  5|                  5|
+-------+------------------+------------------+-------------------+-------------------+



In [8]:
# Number of reviews rated below three stars
review_df.where(review_df['stars'] < 3).count()

16