In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

In [2]:
# Obtain the SparkSession used by every cell below, registered under the
# application name "PopularMovies".
spark = (
    SparkSession.builder
    .appName("PopularMovies")
    .getOrCreate()
)

In [3]:
# Column names and types applied when reading u.data: three integer columns
# followed by a long timestamp, all declared nullable.
schema = StructType(
    [
        StructField("userID", IntegerType(), nullable=True),
        StructField("movieID", IntegerType(), nullable=True),
        StructField("rating", IntegerType(), nullable=True),
        StructField("timestamp", LongType(), nullable=True),
    ]
)

In [5]:
# Read the tab-separated ratings file into a DataFrame using the explicit
# schema, then print the column layout as a quick sanity check.
# NOTE(review): the path is an absolute, machine-specific location; adjust it
# to wherever the ml-100k data set lives when running elsewhere.
moviesDF = (
    spark.read
    .option("sep", "\t")
    .schema(schema)
    .csv("file:///c:/SparkCourse/ml-100k/u.data")
)
moviesDF.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: long (nullable = true)



In [6]:
# Popularity ranking: count the rating rows for each movieID, then order the
# result by that count, largest first.
topMovieIDs = (
    moviesDF
    .groupBy("movieID")
    .count()
    .orderBy(func.desc("count"))
)

In [7]:
# Display the first ten rows of the ranking (movieID alongside its count)
topMovieIDs.show(n=10)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
+-------+-----+
only showing top 10 rows



In [8]:
# Stop the SparkSession created at the top of the notebook now that the
# results have been displayed.
spark.stop()