In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
import codecs

In [2]:
def loadMovieNames(path="resources/ml-100k/u.item"):
    """Build a movie-ID -> title lookup from a pipe-delimited item file.

    Each record is one line of ``|``-separated fields; the first field is the
    integer movie ID and the second is the title. Any further fields are
    ignored.

    Args:
        path: Location of the item file. Defaults to the ``u.item`` file under
            ``resources/ml-100k`` so existing zero-argument calls keep working.

    Returns:
        dict mapping ``int`` movie ID to its title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a record's first field is not an integer.
    """
    movieNames = {}
    # Built-in open() splits only on real newlines; the file is Latin-1 encoded.
    with open(path, "r", encoding="ISO-8859-1", errors="ignore") as f:
        for line in f:
            # Drop the line terminator so a two-field record keeps a clean title.
            fields = line.rstrip("\r\n").split("|")
            if len(fields) < 2:
                # Blank or truncated line: no ID/title pair to record.
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [3]:
# Reuse the active session if one exists, otherwise start one for this app.
spark = (
    SparkSession.builder
    .appName("PopularMovies")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/03 18:02:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Broadcast the ID -> title dictionary through the SparkContext so the
# lookup function can read it via nameDict.value.
nameDict = spark.sparkContext.broadcast(
    loadMovieNames()
)

In [5]:
# Explicit schema for the ratings data: four nullable numeric columns.
schema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in (
        ("userID", IntegerType()),
        ("movieID", IntegerType()),
        ("rating", IntegerType()),
        ("timestamp", LongType()),
    )
])

In [6]:
# Read the tab-separated u.data file into a DataFrame with the explicit schema.
moviesDF = (
    spark.read
    .schema(schema)
    .option("sep", "\t")
    .csv("resources/ml-100k/u.data")
)

In [7]:
# One row per movieID with the number of input rows that reference it.
movieCounts = (
    moviesDF
    .groupBy("movieID")
    .count()
)

In [8]:
# Peek at the first few aggregated rows.
movieCounts.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-------+-----+
|movieID|count|
+-------+-----+
|    496|  231|
|    471|  221|
|    463|   71|
|    148|  128|
|   1342|    2|
+-------+-----+
only showing top 5 rows



                                                                                

In [9]:
# User-defined function that looks up a movie name in the broadcast dictionary.
def lookupName(movieID):
    """Return the title stored for ``movieID`` in the broadcast ``nameDict``.

    Args:
        movieID: Key to look up in ``nameDict.value``.

    Returns:
        The matching title, or ``None`` when the key is absent (including a
        ``None`` key). Returning ``None`` rather than raising ``KeyError``
        keeps one unmatched ID from failing the whole job when this function
        runs as a UDF.
    """
    return nameDict.value.get(movieID)

In [13]:
# Wrap the plain Python lookup so it can be applied to DataFrame columns.
lookupNameUDF = func.udf(f=lookupName)

In [14]:
# Attach a movieTitle column by running the lookup UDF over each movieID.
moviesWithNames = movieCounts.withColumn(
    "movieTitle",
    lookupNameUDF(func.col("movieID")),
)

In [15]:
# Order the movies from the highest count to the lowest.
sortedMoviesWithNames = moviesWithNames.sort(func.col("count").desc())

In [16]:
# NOTE(review): this cell repeats the descending-count sort from the preceding
# cell; it rebinds sortedMoviesWithNames to an equivalent DataFrame and could
# be deleted.
sortedMoviesWithNames = moviesWithNames.orderBy("count", ascending=False)

In [17]:
# Print the ten highest-count movies without truncating long titles.
sortedMoviesWithNames.show(n=10, truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+-------+-----+-----------------------------+
|movieID|count|movieTitle                   |
+-------+-----+-----------------------------+
|50     |583  |Star Wars (1977)             |
|258    |509  |Contact (1997)               |
|100    |508  |Fargo (1996)                 |
|181    |507  |Return of the Jedi (1983)    |
|294    |485  |Liar Liar (1997)             |
|286    |481  |English Patient, The (1996)  |
|288    |478  |Scream (1996)                |
|1      |452  |Toy Story (1995)             |
|300    |431  |Air Force One (1997)         |
|121    |429  |Independence Day (ID4) (1996)|
+-------+-----+-----------------------------+
only showing top 10 rows



                                                                                

In [19]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()