# Two ways of loading a JSON file and working with it

## Using SQLContext

In [1]:
from pyspark.sql import SQLContext

In [2]:
# Path to the Yelp review dump (the record displayed below has type=u'review').
file_name = 'yelp_academic_dataset_review.json'
# NOTE(review): `sqlContext` is never created in this notebook -- presumably it is
# the instance pre-defined by the PySpark shell/kernel; confirm before running the
# notebook in a plain Python kernel.
reviews = sqlContext.read.json(file_name)
# Display the first record to inspect which fields were loaded.
reviews.first()

Row(business_id=u'vcNAWiLM4dR7D2nwwJ7nCA', date=u'2007-05-17', review_id=u'15SdjuK7DmYqUAj6rjGowg', stars=5, text=u"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.", type=u'review', user_id=u'Xqd0DzHaiyRqVH3WRG7hzg', votes=Row(cool=1, funny=0, useful=2))

In [7]:
# Extract the star rating of every review as an RDD of ints.
#
# Go through `reviews.rdd` explicitly: calling `.map()` directly on a DataFrame
# only works on Spark 1.x (the method was removed from PySpark DataFrames in 2.0),
# whereas `.rdd.map()` works on both and yields the same RDD.
#
# Read the field by name instead of by position (`x[3]`): the positional index
# silently depends on the column order of the inferred schema, while `row.stars`
# keeps working if the set of fields changes.
stars = reviews.rdd.map(lambda row: row.stars)
# Peek at the first ten ratings.
stars.take(10)

[5, 2, 4, 4, 4, 1, 5, 5, 1, 5]

## Using json.loads

In [3]:
import json

In [4]:
# Path to the Yelp business dump.
business_file = 'yelp_academic_dataset_business.json'
# NOTE(review): `sc` is never created in this notebook -- presumably it is the
# SparkContext pre-defined by the PySpark shell/kernel; confirm.
# Load the file as plain text; no JSON parsing happens at this step.
rdd_text = sc.textFile(business_file)
# Display the first raw element (still an unparsed JSON string).
rdd_text.first()

u'{"business_id": "vcNAWiLM4dR7D2nwwJ7nCA", "full_address": "4840 E Indian School Rd\\nSte 101\\nPhoenix, AZ 85018", "hours": {"Tuesday": {"close": "17:00", "open": "08:00"}, "Friday": {"close": "17:00", "open": "08:00"}, "Monday": {"close": "17:00", "open": "08:00"}, "Wednesday": {"close": "17:00", "open": "08:00"}, "Thursday": {"close": "17:00", "open": "08:00"}}, "open": true, "categories": ["Doctors", "Health & Medical"], "city": "Phoenix", "review_count": 9, "name": "Eric Goldberg, MD", "neighborhoods": [], "longitude": -111.98375799999999, "state": "AZ", "stars": 3.5, "latitude": 33.499313000000001, "attributes": {"By Appointment Only": true}, "type": "business"}'

In [5]:
# Decode every raw text element into a Python dict. `json.loads` already takes a
# single string argument, so it can be handed to `map` directly.
rdd_json = rdd_text.map(json.loads)
# Display the first decoded record.
rdd_json.first()

{u'attributes': {u'By Appointment Only': True},
 u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
 u'categories': [u'Doctors', u'Health & Medical'],
 u'city': u'Phoenix',
 u'full_address': u'4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 u'hours': {u'Friday': {u'close': u'17:00', u'open': u'08:00'},
  u'Monday': {u'close': u'17:00', u'open': u'08:00'},
  u'Thursday': {u'close': u'17:00', u'open': u'08:00'},
  u'Tuesday': {u'close': u'17:00', u'open': u'08:00'},
  u'Wednesday': {u'close': u'17:00', u'open': u'08:00'}},
 u'latitude': 33.499313,
 u'longitude': -111.983758,
 u'name': u'Eric Goldberg, MD',
 u'neighborhoods': [],
 u'open': True,
 u'review_count': 9,
 u'stars': 3.5,
 u'state': u'AZ',
 u'type': u'business'}

In [6]:
# Each business record carries a list under 'categories'; flatMap concatenates
# those lists into a single RDD of category names (duplicates are kept).
categories = rdd_json.flatMap(lambda business: business['categories'])
# Peek at the first ten category names.
categories.take(10)

[u'Doctors',
 u'Health & Medical',
 u'Nightlife',
 u'Active Life',
 u'Mini Golf',
 u'Golf',
 u'Shopping',
 u'Home Services',
 u'Internet Service Providers',
 u'Mobile Phones']