In [3]:
// Load movies.csv from HDFS into a DataFrame: the first line is treated as a
// header row and the column types are inferred from the data.
// The separator option key is "delimiter" (alias "sep"). The earlier spelling
// "delimitter" is not a recognised CSV option, so the setting never took effect.
val movieDf = spark.read.format("csv")
                .option("header", true)
                .option("inferSchema", true)
                .option("delimiter", ",")
                .load("hdfs://localhost:9000/ml-latest-small/movies.csv")
// Print the resulting schema and preview the first two rows.
movieDf.printSchema()
movieDf.show(2)


root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



movieDf: org.apache.spark.sql.DataFrame = [movieId: int, title: string ... 1 more field]


In [5]:
// create schema

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
// Explicit schema used when reading ratings.csv, assembled one field at a time.
// Every column is declared nullable.
val RatingSchema = new StructType()
                        .add("userId", IntegerType, nullable = true)
                        .add("movieId", IntegerType, nullable = true)
                        .add("rating", DoubleType, nullable = true)
                        .add("timestamp", LongType, nullable = true)
                                   

import org.apache.spark.sql.types.{StringType, StructType, DoubleType, IntegerType, LongType, StructField}
RatingSchema: org.apache.spark.sql.types.StructType = StructType(StructField(userId,IntegerType,true), StructField(movieId,IntegerType,true), StructField(rating,DoubleType,true), StructField(timestamp,LongType,true))


In [7]:
// Load ratings.csv from HDFS using the explicit RatingSchema instead of type
// inference; the first line of the file is treated as a header row.
// The separator option key is "delimiter" (alias "sep"). The earlier spelling
// "delimitter" is not a recognised CSV option, so the setting never took effect.
val ratingDf = spark.read.format("csv")
                .option("header", true)
                .option("delimiter", ",")
                .schema(RatingSchema)
                .load("hdfs://localhost:9000/ml-latest-small/ratings.csv")
// Print the applied schema and preview the first two rows.
ratingDf.printSchema()
ratingDf.show(2)


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



ratingDf: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


In [8]:
import org.apache.spark.sql.functions.{col}

import org.apache.spark.sql.functions.col


In [9]:
// Keep only rows whose rating is greater than 1.0, looking the column up on the
// DataFrame itself, then count the rows that remain.
val df = ratingDf.where(ratingDf.col("rating").gt(1.0))
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res4: Long = 96655


In [10]:
// Keep only rows whose rating is greater than 1.0, naming the column with the
// $"..." string interpolator, then count the rows that remain.
val df = ratingDf.where($"rating".gt(1.0))
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res5: Long = 96655


In [12]:
// Keep only rows whose rating is greater than 1.0, naming the column with a
// Scala Symbol, then count the rows that remain.
// Symbol("rating") is the same value as the 'rating literal, but the literal
// syntax is deprecated since Scala 2.13, so the explicit form is used here.
val df = ratingDf.filter(Symbol("rating") > 1.0)
df.count()

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
res6: Long = 96655


In [13]:
import org.apache.spark.sql.functions.{count,avg}

import org.apache.spark.sql.functions.{count, avg}


In [17]:
// Number of ratings per movie: group the ratings by movieId and count the
// userId values in each group, exposing the count as "total_ratings".
val popularDf = {
  val ratingsPerMovie = count(col("userId")).as("total_ratings")
  ratingDf.groupBy(col("movieId")).agg(ratingsPerMovie)
}

popularDf: org.apache.spark.sql.DataFrame = [movieId: int, total_ratings: bigint]


In [18]:
// Print the column names and types of the aggregated DataFrame as a tree.
popularDf.schema.printTreeString()

root
 |-- movieId: integer (nullable = true)
 |-- total_ratings: long (nullable = false)



In [19]:
// Preview the first two aggregated rows, with long cell values truncated.
popularDf.show(numRows = 2, truncate = true)

+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|   1580|          165|
|   2366|           25|
+-------+-------------+
only showing top 2 rows

