In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._
// Build (or reuse) a SparkSession from the configuration of the SparkContext
// `sc` that the notebook kernel already provides.
val sparkSession = {
  val kernelConf = sc.getConf
  SparkSession.builder().config(kernelConf).getOrCreate()
}
import sparkSession.implicits._

In [2]:
// Stop-word lists: every line of every matching local file becomes one row
// (single column `value`) of the `stop_words` temp view.
sparkSession.read.textFile("file:///usr/lib/yelp_data/stopwords/stopwords_*.txt").createOrReplaceTempView("stop_words")

// Expose the Cassandra tables of the `yelp` keyspace under temp views that
// carry the same name as the underlying table.
Seq("review", "business").foreach { tableName =>
  val cassandraTable = sparkSession.read.cassandraFormat(keyspace = "yelp", table = tableName).load()
  cassandraTable.createOrReplaceTempView(tableName)
}

In [3]:
// For every (business, word) pair, compute the average and the sum of the star
// ratings of the reviews in which the word occurs, then print the 50 best pairs.
//
// Steps performed by the query:
//   1. Inner query: lower-case each review text, split it on single spaces,
//      explode it to one row per token, strip every character outside a-z from
//      the token, and aggregate stars per (business_id, name, cleaned word).
//      A word repeated inside one review contributes once per occurrence.
//   2. Outer query: left-join against `stop_words` and keep only rows WITHOUT a
//      match (`sw.value is null`), drop empty tokens, and keep pairs whose
//      summed stars exceed 200.
//   3. Order by average stars, then by summed stars, both descending.
//
// The broadcast hint names the alias `sw` rather than the view name
// `stop_words`: Spark matches an aliased relation in a hint by its alias only,
// so `MAPJOIN(stop_words)` would be silently ignored for `stop_words sw`.
sparkSession.sql("""
select /*+ MAPJOIN(sw) */
    business_id,
    business_name,
    word,
    avg,
    total
from(
    select 
        b.business_id business_id,
        b.name business_name,
        regexp_replace(lw.w,"[^a-z]","") word,
        avg(r.stars) avg,
        sum(r.stars) total
    from review r
         inner join business b on b.business_id = r.business_id
     lateral view explode(split(lower(r.text)," ")) lw as w
    group by b.business_id,b.name,word
) T
left join stop_words sw on sw.value=T.word
where sw.value is null and T.word <> '' and total>200 
order by T.avg desc,T.total desc
"""
).show(50,false) // 50 rows, truncate = false so long business names are printed in full

+----------------------+------------------------------------+---------+------------------+-----+
|business_id           |business_name                       |word     |avg               |total|
+----------------------+------------------------------------+---------+------------------+-----+
|EU3DE9JD5kFcgchB_P4LNw|Best Mattress                       |kevin    |4.901234567901234 |397  |
|piGQNN6ECbSC0agHhvoVeg|Luxy Nail Salon                     |amazing  |4.793103448275862 |278  |
|piGQNN6ECbSC0agHhvoVeg|Luxy Nail Salon                     |love     |4.733333333333333 |355  |
|EU3DE9JD5kFcgchB_P4LNw|Best Mattress                       |bed      |4.673076923076923 |243  |
|gMUAn6xcuE-TbY1seFw_Ww|Presto Calzone Bakery               |fresh    |4.65625           |298  |
|Eq3qA7F5uZBUbcYXROzntA|Shinano Sushi Bar & Japanese Cuisine|japanese |4.648148148148148 |251  |
|piGQNN6ECbSC0agHhvoVeg|Luxy Nail Salon                     |aiden    |4.596153846153846 |239  |
|EU3DE9JD5kFcgchB_P4LNw|Best M