# Loading Yelp Data

In [None]:
sc.version

## Parsing JSON
There are several packages that provide JSON parsing API. Here, we use the  [Play Framework](https://www.playframework.com/) API https://www.playframework.com/documentation/2.1.1/ScalaJson


## DateTime processing
Dealing with dates and times can become quite complex. We use the http://www.joda.org/joda-time/ package.

In [1]:
val DATADIR = "/user/pmolnar/yelp/data"
import play.api.libs.json._
import org.joda.time._
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType,DateType};

In [30]:
def parse_review(s:String) = {
    val elem = Json.parse(s)
    val dt = DateTime.parse((elem \ "date").as[String])
    Row(
         (elem \ "business_id").as[String]
        ,(elem \ "user_id").as[String]
        ,(elem \ "review_id").as[String]
        ,dt
        ,(elem \ "text").as[String]
        ,(elem \ "stars").as[Float]
        ,(elem \ "votes" \ "funny").as[Int]
        ,(elem \ "votes" \ "useful").as[Int]
        ,(elem \ "votes" \ "cool").as[Int]
    )
}

val schema = StructType(Seq(
     StructField("business_id", StringType, false)
     ,StructField("user_id", StringType, false)
     ,StructField("review_id", StringType, false)
     ,StructField("date", DateType, false)
     ,StructField("text", StringType, false)
     ,StructField("stars", FloatType, false)
     ,StructField("votes_funny", IntegerType, false)
     ,StructField("votes_useful", IntegerType, false)
     ,StructField("votes_cool", IntegerType, false)
     )
)

In [26]:
val df = sqlContext.read.json(DATADIR+"/review/review_aa.json.gz")

In [27]:
df.printSchema

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- votes: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- funny: long (nullable = true)
 |    |-- useful: long (nullable = true)



In [31]:
val review_rdd = sc.textFile(DATADIR+"/review/review_aa.json.gz")

In [32]:
val review_df = sqlContext.createDataFrame(review_rdd.map(parse_review), schema)

In [33]:
review_df.printSchema

root
 |-- business_id: string (nullable = false)
 |-- user_id: string (nullable = false)
 |-- review_id: string (nullable = false)
 |-- date: date (nullable = false)
 |-- text: string (nullable = false)
 |-- stars: float (nullable = false)
 |-- votes_funny: integer (nullable = false)
 |-- votes_useful: integer (nullable = false)
 |-- votes_cool: integer (nullable = false)



In [41]:
review_df.count()

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 4, localhost): java.lang.ClassCastException: org.joda.time.DateTime cannot be cast to java.sql.Date
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$DateConverter$.toCatalystImpl(CatalystTypeConverters.scala:305)
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:102)
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:260)
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$StructConverter.toCatalystImpl(CatalystTypeConverters.scala:250)
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:102)
	at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConve

In [15]:
for (k <- ('a' to 'k')) {
    val fn = "/review/review_a"+k+".json.gz"
    val review_rdd = sc.textFile(DATADIR+fn)
    val n = review_rdd.map(parse_review).map(t => (t._6, t._5.length)).count
    println(fn+" -> "+n)
}

/review/review_aa.json.gz -> 268506
/review/review_ab.json.gz -> 268506
/review/review_ac.json.gz -> 101571
/review/review_ad.json.gz -> 268506
/review/review_ae.json.gz -> 268506
/review/review_af.json.gz -> 268506
/review/review_ag.json.gz -> 268506
/review/review_ah.json.gz -> 268506
/review/review_ai.json.gz -> 268506
/review/review_aj.json.gz -> 268506
/review/review_ak.json.gz -> 6


In [5]:
for (val review_rdd = sc.textFile(DATADIR+"/review/review_aa.json.gz")
val n = review_rdd.map(parse_review).map(t => (t._6, t._5.length)).count
println(n)

268506


In [10]:
review_rdd.take(1)

Array({"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "PUFPaY9KxDAcGqfsorJp3Q", "review_id": "Ya85v4eqdd6k9Od8HbQjyA", "stars": 4, "date": "2012-08-01", "text": "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.", "type": "review", "business_id": "5UmKMjUEUNdYWqANhGckJw"})

In [20]:
review_rdd.take(1)(0)

{"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "PUFPaY9KxDAcGqfsorJp3Q", "review_id": "Ya85v4eqdd6k9Od8HbQjyA", "stars": 4, "date": "2012-08-01", "text": "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.", "type": "review", "business_id": "5UmKMjUEUNdYWqANhGckJw"}

In [3]:
import play.api.libs.json._


In [64]:
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType,DateType};

In [77]:
val schema = StructType(List(
     StructField("business_id", StringType, false)
    ,StructField("user_id", StringType, false)
    ,StructField("review_id", StringType, false)
    ,StructField("date", DateType, false)
    ,StructField("text", StringType, false)
    ,StructField("stars", FloatType, false)
    ,StructField("votes_funny", IntegerType, false)
    ,StructField("votes_useful", IntegerType, false)
    ,StructField("votes_cool", IntegerType, false)
    ))

In [71]:
val review_tup_rdd = review_rdd.map(parse_review)

In [74]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

In [80]:
val review_df = sqlContext.createDataFrame(review_rdd.map(parse_review), schema)

Name: Compile Error
Message: <console>:49: error: overloaded method value createDataFrame with alternatives:
  (data: java.util.List[_],beanClass: Class[_])org.apache.spark.sql.DataFrame <and>
  (rdd: org.apache.spark.api.java.JavaRDD[_],beanClass: Class[_])org.apache.spark.sql.DataFrame <and>
  (rdd: org.apache.spark.rdd.RDD[_],beanClass: Class[_])org.apache.spark.sql.DataFrame <and>
  (rows: java.util.List[org.apache.spark.sql.Row],schema: org.apache.spark.sql.types.StructType)org.apache.spark.sql.DataFrame <and>
  (rowRDD: org.apache.spark.api.java.JavaRDD[org.apache.spark.sql.Row],schema: org.apache.spark.sql.types.StructType)org.apache.spark.sql.DataFrame <and>
  (rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row],schema: org.apache.spark.sql.types.StructType)org.apache.spark.sql.DataFrame
 cannot be applied to (org.apache.spark.rdd.RDD[(String, String, String, org.joda.time.DateTime, String, Float)], org.apache.spark.sql.types.StructType)
         val review_df = sqlConte

In [53]:
val t = review_rdd.map(parse_review).sample(false, 0.1).first()

In [54]:
val r = Json.parse(review_rdd.take(1)(0))

In [55]:
(r \ "votes")

{"funny":0,"useful":0,"cool":0}

In [42]:
review_rdd.map(parse_review).map(t => (t._6, t._5.length)).dependencies.length

1

In [40]:
for (s <- dep1) println(s)

org.apache.spark.OneToOneDependency@38fc3be1


In [25]:
(elem \ "text").as[String].split(' ').map(x => x.toLowerCase())

Array(mr, hoagie, is, an, institution., walking, in,, it, does, seem, like, a, throwback, to, 30, years, ago,, old, fashioned, menu, board,, booths, out, of, the, 70s,, and, a, large, selection, of, food., their, speciality, is, the, italian, hoagie,, and, it, is, voted, the, best, in, the, area, year, after, year., i, usually, order, the, burger,, while, the, patties, are, obviously, cooked, from, frozen,, all, of, the, other, ingredients, are, very, fresh., overall,, its, a, good, alternative, to, subway,, which, is, down, the, road.)

In [32]:
"  abs23432  ".toString.

abs23432

In [34]:
(elem \ "stars").as[Double]

4.0

In [54]:
def foo(x: Int): Int = {
    return x*x
}

In [56]:
foo(234)

54756

In [52]:
import scala.Int

In [61]:
def boo(x: List[Any]) {
    for(a <- x) {
        println(a)
    }
}

In [65]:
boo(List(234, "dsf", 234.0))

234
dsf
234.0


In [48]:
val row = review_rdd.take(1)(0)


In [73]:
elem.asInstanceOf[Map[String, Any]]("votes").asInstanceOf[Any]

Name: java.lang.ClassCastException
Message: scala.Some cannot be cast to scala.collection.immutable.Map
StackTrace: $line386.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
$line386.$read$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:40)
$line386.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:42)
$line386.$read$$iwC$$iwC$$iwC.<init>(<console>:44)
$line386.$read$$iwC$$iwC.<init>(<console>:46)
$line386.$read$$iwC.<init>(<console>:48)
$line386.$read.<init>(<console>:50)
$line386.$read$.<init>(<console>:54)
$line386.$read$.<clinit>(<console>)
$line386.$eval$.<init>(<console>:7)
$line386.$eval$.<clinit>(<console>)
$line386.$eval.$print(<console>)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:497)
org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
o

In [62]:
(val k in elem)

Name: Compile Error
Message: <console>:1: error: illegal start of simple expression
       (val k in elem)
        ^
StackTrace: 

In [39]:
elem.toString

{"votes":{"funny":0,"useful":0,"cool":0},"user_id":"PUFPaY9KxDAcGqfsorJp3Q","review_id":"Ya85v4eqdd6k9Od8HbQjyA","stars":4,"date":"2012-08-01","text":"Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.","type":"review","business_id":"5UmKMjUEUNdYWqANhGckJw"}

In [41]:
elem

{"votes":{"funny":0,"useful":0,"cool":0},"user_id":"PUFPaY9KxDAcGqfsorJp3Q","review_id":"Ya85v4eqdd6k9Od8HbQjyA","stars":4,"date":"2012-08-01","text":"Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.","type":"review","business_id":"5UmKMjUEUNdYWqANhGckJw"}

In [33]:
import org.joda.time._

In [31]:
org.joda.time.DateTime.parse("2012-08-01") //.withZone(org.joda.time.DateTimeZone.forOffsetHours(-5))

2012-08-01T00:00:00.000-04:00

In [27]:
import org.joda.time.DateTimeZone._

In [29]:
org.joda.time.DateTimeZone.forOffsetHours(-5)

-05:00

In [32]:
parse("2012-08-01")

2012-08-01T00:00:00.000-04:00

In [3]:
val user_df = sqlContext.read.json("/user/pmolnar/yelp/data/user")

In [4]:
user_df.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliments: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- cute: long (nullable = true)
 |    |-- funny: long (nullable = true)
 |    |-- hot: long (nullable = true)
 |    |-- list: long (nullable = true)
 |    |-- more: long (nullable = true)
 |    |-- note: long (nullable = true)
 |    |-- photos: long (nullable = true)
 |    |-- plain: long (nullable = true)
 |    |-- profile: long (nullable = true)
 |    |-- writer: long (nullable = true)
 |-- elite: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- fans: long (nullable = true)
 |-- friends: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- votes: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- funny: long (nullable = true)

In [15]:
val u = user_df.take(1)(0)