# Loading Yelp Data

In [1]:
// Print the running Spark version and a monitoring link built from this application's id.
println("Version "+sc.version)
println("Monitor this application at http://arc.insight.gsu.edu:8088/proxy/"+sc.applicationId)

Version 1.6.1
Monitor this application at http://arc.insight.gsu.edu:8088/proxy/application_1487525026281_0038


## Parsing JSON
Several packages provide a JSON parsing API. Here, we use the [Play Framework](https://www.playframework.com/) JSON API, documented at https://www.playframework.com/documentation/2.1.1/ScalaJson


## DateTime processing
Dealing with dates and times can become quite complex. We use the [Joda-Time](http://www.joda.org/joda-time/) package.

In [2]:
val DATADIR = "/user/pmolnar/yelp/data"
import play.api.libs.json._
import org.joda.time._
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType,DateType};
import java.sql.Date

In [3]:
/**
 * Converts one JSON-encoded review record into a Spark SQL Row.
 *
 * The Row holds, in order: business_id, user_id, review_id, date (as java.sql.Date),
 * text, stars (Float), and the funny, useful and cool vote counts (Int).
 *
 * @param s the JSON text of a single review
 * @return a Row with the nine values listed above
 */
def parse_review(s:String) = {
    val review = Json.parse(s)

    // Small accessors so every field is read the same way.
    def text(field: String): String = (review \ field).as[String]
    def votes(kind: String): Int = (review \ "votes" \ kind).as[Int]

    // The date is converted before any other field is read.
    val reviewDate = Date.valueOf(text("date"))

    Row(
        text("business_id"),
        text("user_id"),
        text("review_id"),
        reviewDate,
        text("text"),
        (review \ "stars").as[Float],
        votes("funny"),
        votes("useful"),
        votes("cool")
    )
}

// Schema for the Rows produced by parse_review: nine non-nullable columns, in Row order.
val schema = StructType(Seq(
    StructField("business_id",  StringType,  nullable = false),
    StructField("user_id",      StringType,  nullable = false),
    StructField("review_id",    StringType,  nullable = false),
    StructField("date",         DateType,    nullable = false),
    StructField("text",         StringType,  nullable = false),
    StructField("stars",        FloatType,   nullable = false),
    StructField("votes_funny",  IntegerType, nullable = false),
    StructField("votes_useful", IntegerType, nullable = false),
    StructField("votes_cool",   IntegerType, nullable = false)
))

In [4]:
// Load one review file through the JSON reader; no schema is supplied here.
val df = sqlContext.read.json(DATADIR+"/review/review_aa.json.gz")

In [27]:
df.printSchema

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- votes: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- funny: long (nullable = true)
 |    |-- useful: long (nullable = true)



In [4]:
val review_rdd = sc.textFile(DATADIR+"/review/review_aa.json.gz")

In [5]:
// Parse every input line into a Row and pair the result with the explicit schema.
val review_df = sqlContext.createDataFrame(review_rdd.map(parse_review), schema)

In [25]:
review_df.printSchema

root
 |-- business_id: string (nullable = false)
 |-- user_id: string (nullable = false)
 |-- review_id: string (nullable = false)
 |-- date: date (nullable = false)
 |-- text: string (nullable = false)
 |-- stars: float (nullable = false)
 |-- votes_funny: integer (nullable = false)
 |-- votes_useful: integer (nullable = false)
 |-- votes_cool: integer (nullable = false)



In [6]:
review_df.count()

268506

In [15]:
// Count the parsed reviews in each split file, review_aa through review_ak.
for (k <- 'a' to 'k') {
    val fn = "/review/review_a"+k+".json.gz"
    val review_rdd = sc.textFile(DATADIR+fn)
    // parse_review yields a Row, which has no tuple accessors (_5, _6);
    // stars (column 5) and text (column 4) are read through the Row getters.
    val n = review_rdd.map(parse_review).map(t => (t.getFloat(5), t.getString(4).length)).count
    println(fn+" -> "+n)
}

/review/review_aa.json.gz -> 268506
/review/review_ab.json.gz -> 268506
/review/review_ac.json.gz -> 101571
/review/review_ad.json.gz -> 268506
/review/review_ae.json.gz -> 268506
/review/review_af.json.gz -> 268506
/review/review_ag.json.gz -> 268506
/review/review_ah.json.gz -> 268506
/review/review_ai.json.gz -> 268506
/review/review_aj.json.gz -> 268506
/review/review_ak.json.gz -> 6


In [5]:
// Count the parsed reviews of a single file.
val review_rdd = sc.textFile(DATADIR+"/review/review_aa.json.gz")
// parse_review yields a Row: read stars (column 5) and text (column 4) via the Row getters.
val n = review_rdd.map(parse_review).map(t => (t.getFloat(5), t.getString(4).length)).count
println(n)

268506


In [10]:
review_rdd.take(1)

Array({"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "PUFPaY9KxDAcGqfsorJp3Q", "review_id": "Ya85v4eqdd6k9Od8HbQjyA", "stars": 4, "date": "2012-08-01", "text": "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.", "type": "review", "business_id": "5UmKMjUEUNdYWqANhGckJw"})

In [20]:
review_rdd.take(1)(0)

{"votes": {"funny": 0, "useful": 0, "cool": 0}, "user_id": "PUFPaY9KxDAcGqfsorJp3Q", "review_id": "Ya85v4eqdd6k9Od8HbQjyA", "stars": 4, "date": "2012-08-01", "text": "Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.", "type": "review", "business_id": "5UmKMjUEUNdYWqANhGckJw"}

In [7]:
import play.api.libs.json._


In [8]:
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType,DateType};

In [9]:
// Column layout of the review DataFrame; every column is declared non-nullable.
val schema = StructType(List(
    StructField("business_id",  StringType,  nullable = false),
    StructField("user_id",      StringType,  nullable = false),
    StructField("review_id",    StringType,  nullable = false),
    StructField("date",         DateType,    nullable = false),
    StructField("text",         StringType,  nullable = false),
    StructField("stars",        FloatType,   nullable = false),
    StructField("votes_funny",  IntegerType, nullable = false),
    StructField("votes_useful", IntegerType, nullable = false),
    StructField("votes_cool",   IntegerType, nullable = false)
))

In [10]:
val review_tup_rdd = review_rdd.map(parse_review)

In [11]:
// Reuse the SQLContext already associated with this SparkContext when there is one,
// instead of unconditionally constructing a second, independent context.
val sqlContext = org.apache.spark.sql.SQLContext.getOrCreate(sc)

In [12]:
val review_df = sqlContext.createDataFrame(review_rdd.map(parse_review), schema)

In [13]:
review_df.printSchema

root
 |-- business_id: string (nullable = false)
 |-- user_id: string (nullable = false)
 |-- review_id: string (nullable = false)
 |-- date: date (nullable = false)
 |-- text: string (nullable = false)
 |-- stars: float (nullable = false)
 |-- votes_funny: integer (nullable = false)
 |-- votes_useful: integer (nullable = false)
 |-- votes_cool: integer (nullable = false)



In [14]:
review_df.show

+--------------------+--------------------+--------------------+----------+--------------------+-----+-----------+------------+----------+
|         business_id|             user_id|           review_id|      date|                text|stars|votes_funny|votes_useful|votes_cool|
+--------------------+--------------------+--------------------+----------+--------------------+-----+-----------+------------+----------+
|5UmKMjUEUNdYWqANh...|PUFPaY9KxDAcGqfso...|Ya85v4eqdd6k9Od8H...|2012-08-01|Mr Hoagie is an i...|  4.0|          0|           0|         0|
|5UmKMjUEUNdYWqANh...|Iu6AxdBYGR4A0wspR...|KPvLNJ21_4wbYNctr...|2014-02-13|Excellent food. S...|  5.0|          0|           0|         0|
|5UmKMjUEUNdYWqANh...|auESFwWvW42h6alXg...|fFSoGV46Yxuwbr3fH...|2015-10-31|Yes this place is...|  5.0|          1|           1|         0|
|5UmKMjUEUNdYWqANh...|qiczib2fO_1VBG8Io...|pVMIt0a_QsKtuDfWV...|2015-12-26|PROS: Italian hoa...|  3.0|          0|           0|         0|
|5UmKMjUEUNdYWqANh...|qEE5E

In [15]:
val t = review_rdd.map(parse_review).sample(false, 0.01).first()

In [17]:
t

[mVHrayjG3uZ_RLHkLj-AMg,5OumCBQ_MyQsltSdbMyunA,vtqE5Sy9qk74qiS6ybleoQ,2016-02-22,This is my fourth time but my first review. It never disappoints! Always home cooking, old school like grandma's house. Don't expect ambiance, but if you're hungry and want nice wait staff, (usually family) squeaky clean, great value and generous portions-way more than I can ever finish, Emil's is your place.

Tonight I had a cup of the chicken dumpling soup that was sooooo good! Very obviously home made. I had a side of onion rings that were perfectly cooked, which doesn't sound fancy, but I still couldn't finish. My husband had the chicken parm. Thursday is Italian night. It came with a huge salad, very fresh, an entire loaf of Italian bread, and a generous portion of nic...

In [54]:
val r = Json.parse(review_rdd.take(1)(0))

In [55]:
(r \ "votes")

{"funny":0,"useful":0,"cool":0}

In [42]:
// Number of dependencies of the twice-mapped RDD. parse_review yields Rows, so stars
// (column 5) and text (column 4) are read with Row getters rather than _6 / _5.
review_rdd.map(parse_review).map(t => (t.getFloat(5), t.getString(4).length)).dependencies.length

1

In [40]:
// Print each element of dep1 on its own line.
// NOTE(review): dep1 is not defined in any cell shown here — presumably it holds the
// `dependencies` of an RDD like the one in the previous cell; confirm before re-running.
for (s <- dep1) println(s)

org.apache.spark.OneToOneDependency@38fc3be1


In [25]:
(elem \ "text").as[String].split(' ').map(x => x.toLowerCase())

Array(mr, hoagie, is, an, institution., walking, in,, it, does, seem, like, a, throwback, to, 30, years, ago,, old, fashioned, menu, board,, booths, out, of, the, 70s,, and, a, large, selection, of, food., their, speciality, is, the, italian, hoagie,, and, it, is, voted, the, best, in, the, area, year, after, year., i, usually, order, the, burger,, while, the, patties, are, obviously, cooked, from, frozen,, all, of, the, other, ingredients, are, very, fresh., overall,, its, a, good, alternative, to, subway,, which, is, down, the, road.)

In [32]:
// Strip the leading and trailing whitespace from the literal.
"  abs23432  ".trim

abs23432

In [34]:
(elem \ "stars").as[Double]

4.0

In [54]:
/**
 * Squares an integer.
 *
 * @param x the value to square
 * @return x multiplied by itself, using plain Int arithmetic
 */
def foo(x: Int): Int = x * x

In [56]:
foo(234)

54756

In [52]:
import scala.Int

In [61]:
/**
 * Prints every element of the list on its own line.
 *
 * @param x the values to print; elements may be of any type
 */
def boo(x: List[Any]): Unit = {
    x.foreach(println)
}

In [65]:
boo(List(234, "dsf", 234.0))

234
dsf
234.0


In [48]:
val row = review_rdd.take(1)(0)


In [73]:
// Attempt to treat elem as a Map and look up "votes" by key.
// This fails with the ClassCastException shown below: elem is a Some, not a Map.
elem.asInstanceOf[Map[String, Any]]("votes").asInstanceOf[Any]

Name: java.lang.ClassCastException
Message: scala.Some cannot be cast to scala.collection.immutable.Map
StackTrace: $line386.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
$line386.$read$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:40)
$line386.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:42)
$line386.$read$$iwC$$iwC$$iwC.<init>(<console>:44)
$line386.$read$$iwC$$iwC.<init>(<console>:46)
$line386.$read$$iwC.<init>(<console>:48)
$line386.$read.<init>(<console>:50)
$line386.$read$.<init>(<console>:54)
$line386.$read$.<clinit>(<console>)
$line386.$eval$.<init>(<console>:7)
$line386.$eval$.<clinit>(<console>)
$line386.$eval.$print(<console>)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:497)
org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
o

In [62]:
(val k in elem)

Name: Compile Error
Message: <console>:1: error: illegal start of simple expression
       (val k in elem)
        ^
StackTrace: 

In [39]:
elem.toString

{"votes":{"funny":0,"useful":0,"cool":0},"user_id":"PUFPaY9KxDAcGqfsorJp3Q","review_id":"Ya85v4eqdd6k9Od8HbQjyA","stars":4,"date":"2012-08-01","text":"Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.","type":"review","business_id":"5UmKMjUEUNdYWqANhGckJw"}

In [41]:
elem

{"votes":{"funny":0,"useful":0,"cool":0},"user_id":"PUFPaY9KxDAcGqfsorJp3Q","review_id":"Ya85v4eqdd6k9Od8HbQjyA","stars":4,"date":"2012-08-01","text":"Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.","type":"review","business_id":"5UmKMjUEUNdYWqANhGckJw"}

In [33]:
import org.joda.time._

In [31]:
org.joda.time.DateTime.parse("2012-08-01") //.withZone(org.joda.time.DateTimeZone.forOffsetHours(-5))

2012-08-01T00:00:00.000-04:00

In [27]:
import org.joda.time.DateTimeZone._

In [29]:
org.joda.time.DateTimeZone.forOffsetHours(-5)

-05:00

In [32]:
// Qualify the call: none of the imports shown in this notebook bring a bare `parse` into scope.
org.joda.time.DateTime.parse("2012-08-01")

2012-08-01T00:00:00.000-04:00

In [3]:
val user_df = sqlContext.read.json("/user/pmolnar/yelp/data/user")

In [4]:
user_df.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliments: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- cute: long (nullable = true)
 |    |-- funny: long (nullable = true)
 |    |-- hot: long (nullable = true)
 |    |-- list: long (nullable = true)
 |    |-- more: long (nullable = true)
 |    |-- note: long (nullable = true)
 |    |-- photos: long (nullable = true)
 |    |-- plain: long (nullable = true)
 |    |-- profile: long (nullable = true)
 |    |-- writer: long (nullable = true)
 |-- elite: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- fans: long (nullable = true)
 |-- friends: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- votes: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- funny: long (nullable = true)

In [15]:
val u = user_df.take(1)(0)

# Create Ratings Graph

In [7]:
import org.apache.spark.graphx._

In [6]:
val review_df = sqlContext.read.json("/user/pmolnar/yelp/data/review")
review_df.printSchema

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- votes: struct (nullable = true)
 |    |-- cool: long (nullable = true)
 |    |-- funny: long (nullable = true)
 |    |-- useful: long (nullable = true)



In [19]:
// Collapse the reviews to one row per (user_id, business_id) pair, keeping the highest
// star value for that pair; the aggregate column is named "max(stars)".
val rating_df = review_df.groupBy("user_id", "business_id").max("stars")
rating_df.show()


+--------------------+--------------------+----------+
|             user_id|         business_id|max(stars)|
+--------------------+--------------------+----------+
|CFh9jAl5_DUSgNBA5...|QoDa50dc7g62xciFy...|         1|
|PRTVZdLWYmhozxkpl...|6o3RK6rTcN3nw-j-r...|         3|
|IbvOxKSps_K5wa3a2...|1u2NauOy8xDj7aFQE...|         4|
|LwEfyIPU8woMqZsKq...|sbW8qHJgzEIH42B0S...|         2|
|bEIXIfk1Zys1r_nmG...|McikHxxEqZ2X0joaR...|         3|
|f3z4VItrhUYgvIIEX...|tv8cS4aaA1VDaInYg...|         3|
|gtJJBaQwTwE45NnNF...|EoAY1JSVeJriBzId8...|         4|
|RFBIkR3W31JgZVZPg...|EoAY1JSVeJriBzId8...|         2|
|rONxiM8Hafy5kba3Z...|GlYPVYSOVT7zn00Kk...|         5|
|UeYvqmyUICiVnUbN4...|4ykgzzzGEWjMD5lwk...|         5|
|cn4YL6a5F-AJA9WTq...|imM-x_nAxVtYGVFJJ...|         5|
|xWrZoK4WCF5F9So2U...|T49ZvBa6mD4xHSmjH...|         4|
|Sjb5e5-gKoLXueFDM...|kKmvSJ6Z8UVPdrnL7...|         3|
|HmckzZAXriuGNeSOf...|VkFDer80GOrelWALd...|         1|
|5gi8yR20fvnim2dg3...|Ts4xsKPU7FNPPZRj-...|         2|
|eUIlLKXcP

In [None]:

val business_df = sqlContext.read.json("/user/pmolnar/yelp/data/business")

In [None]:
// Keep only the businesses whose state is NV.
val business_NV_df = business_df.filter("state='NV'")
// Join on the shared business_id column (the join-type overload takes the column names as
// a Seq). Only the key is taken from the business side, so "stars" below can only refer to
// the review's own column.
val rating_df = review_df
    .join(business_NV_df.select("business_id"), Seq("business_id"), "inner")
    .groupBy("user_id", "business_id")
    .max("stars")
//val rating_df = review_df.join(business_NV_df,  business_NV_df("business_id")==review_df("business_id"), 

In [20]:
import org.apache.spark.ml.feature.StringIndexer

In [24]:
// Indexer from the user_id column to a user_idx column, configured with handleInvalid = "skip",
// and the model obtained by fitting it on rating_df.
val uidIndexer = new StringIndexer().setInputCol("user_id").setOutputCol("user_idx").setHandleInvalid("skip")
val uidIndexerMdl = uidIndexer.fit(rating_df)

In [25]:
// Indexer from the business_id column to a business_idx column, configured with
// handleInvalid = "skip", and the model obtained by fitting it on rating_df.
val bidIndexer = new StringIndexer().setInputCol("business_id").setOutputCol("business_idx").setHandleInvalid("skip")
val bidIndexerMdl = bidIndexer.fit(rating_df)

In [26]:
val rating_idx_df = bidIndexerMdl.transform(uidIndexerMdl.transform(rating_df))

In [27]:
rating_idx_df.printSchema

root
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- max(stars): long (nullable = true)
 |-- user_idx: double (nullable = true)
 |-- business_idx: double (nullable = true)



In [None]:
import org.apache.spark.sql.functions

In [31]:
val r = rating_idx_df.first()

In [41]:
// Build a single edge from the sample row r, reading the columns by position.
Edge(
    srcId = r(3).asInstanceOf[Number].longValue,   // user_idx
    dstId = r(4).asInstanceOf[Number].longValue,   // business_idx
    attr  = r(2).asInstanceOf[Number].doubleValue  // max(stars)
)

Edge(197766,38809,5.0)

In [42]:
// Turn every indexed rating row into an edge: user index -> business index, weighted by
// the star rating. Columns are read by position (2 = max(stars), 3 = user_idx, 4 = business_idx).
// NOTE(review): user_idx and business_idx come from two independent StringIndexers, so the
// same numeric id can presumably denote both a user and a business vertex — confirm whether
// the ids should be made disjoint before the graph is built.
val edges = rating_idx_df.rdd.map { row =>
    val userVertex = row(3).asInstanceOf[Number].longValue
    val businessVertex = row(4).asInstanceOf[Number].longValue
    val stars = row(2).asInstanceOf[Number].doubleValue
    Edge(userVertex, businessVertex, stars)
}

In [44]:
edges.take(5)

Array(Edge(31491,57201,1.0), Edge(81718,63477,3.0), Edge(2895,47065,4.0), Edge(8306,5112,2.0), Edge(217485,3247,3.0))

In [1]:
// SVD++ settings, spelled out by parameter name.
val conf = new lib.SVDPlusPlus.Conf(
    rank = 2, maxIters = 10,
    minVal = 0, maxVal = 5,
    gamma1 = 0.007, gamma2 = 0.007,
    gamma6 = 0.005, gamma7 = 0.015
)

Parameter | Example |Description
----------|---------|-----------
rank     |  2 | Number of latent variables.
maxIters | 10 |Number of iterations to execute; the more iterations, the closer the machine learning model is able to converge to its ideal solution, and the more accurate its predictions will be.
minVal   | 0 | Minimum rating (zero stars). 
maxVal   | 5 | Maximum rating (five stars).
gamma1   | 0.007 | How quickly biases can change from one iteration to the next. γ1 from the Koren paper, which recommends 0.007.
gamma2   | 0.007 | How quickly latent variable vectors can change. γ2 from the Koren paper, which recommends 0.007.
gamma6   | 0.005 | Dampener on the biases, to keep them small. λ6 from the Koren paper, meaning lambda6 would have been a more appropriate variable name. Koren recommends 0.005.
gamma7   | 0.015 | The degree to which the different latent variable vectors are permitted to interact. λ7 from the Koren paper, meaning lambda7 would have been a more appropriate variable name. Koren recommends 0.015.





[`Conf(rank: Int, maxIters: Int, minVal: Double, maxVal: Double, gamma1: Double, gamma2: Double, gamma6: Double, gamma7: Double)`](https://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.graphx.lib.SVDPlusPlus$$Conf)

In [None]:
// Run SVD++ over the rating edges with the settings in conf; the result pair is
// destructured into g and mean.
// NOTE(review): presumably g is the trained graph and mean the average rating — confirm
// against the GraphX SVDPlusPlus API documentation.
val (g,mean) = lib.SVDPlusPlus.run(edges, conf)

## Prediction
<img src="files/svdplusplus-equation.png">
