# Read movie rating data, rank the movies by number of ratings, and show the most popular ones

## Importing modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import *

### Starting SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already
# exists: application name "popularMovies", master "local[*]" (local mode).
spark = (
    SparkSession.builder
    .appName("popularMovies")
    .master("local[*]")
    .getOrCreate()
)

### Defining the movie schemas and reading the data

In [52]:
# Schema of the ratings file: four tab-separated columns, all nullable.
moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True),
])

# Schema of the titles file. Only the movie ID and title are declared here;
# presumably the file carries further columns that this notebook does not need.
moviesNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True),
])

# Ratings dataset: which user rated which movie, with what rating, and when.
movies = (
    spark.read
    .option("sep", "\t")
    .schema(moviesSchema)
    .csv("data/ml-100k/u.data")
)

# Movie ID -> title lookup dataset.
# The titles file is not UTF-8: the stock ml-100k distribution ships it as
# ISO-8859-1, so without an explicit encoding any accented title is decoded
# incorrectly. NOTE(review): confirm your copy of u.item uses this encoding.
movieNames = (
    spark.read
    .option("sep", "|")
    .option("encoding", "ISO-8859-1")
    .schema(moviesNamesSchema)
    .csv("data/ml-100k/u.item")
)

### Grouping movies by their ID and joining the datasets

In [58]:
# Count how many rating rows each movie has; the result has the columns
# "movieID" and "count".
groupedMovies = movies.groupBy("movieID").count()

# Attach the titles. Joining on the column *name* (rather than on an equality
# expression between the two frames) keeps a single "movieID" column in the
# result, so later references to "movieID" are not ambiguous.
bigMovies = groupedMovies.join(movieNames, on="movieID", how="inner")

## Showing results

In [59]:
# Print title and rating count, most-rated first; show() prints only the
# first 20 rows by default.
(
    bigMovies
    .select("movieTitle", "count")
    .orderBy("count", ascending=False)
    .show()
)

+--------------------+-----+
|          movieTitle|count|
+--------------------+-----+
|    Star Wars (1977)|  583|
|      Contact (1997)|  509|
|        Fargo (1996)|  508|
|Return of the Jed...|  507|
|    Liar Liar (1997)|  485|
|English Patient, ...|  481|
|       Scream (1996)|  478|
|    Toy Story (1995)|  452|
|Air Force One (1997)|  431|
|Independence Day ...|  429|
|Raiders of the Lo...|  420|
|Godfather, The (1...|  413|
| Pulp Fiction (1994)|  394|
|Twelve Monkeys (1...|  392|
|Silence of the La...|  390|
|Jerry Maguire (1996)|  384|
|    Rock, The (1996)|  378|
|Empire Strikes Ba...|  367|
|Star Trek: First ...|  365|
|Back to the Futur...|  350|
+--------------------+-----+
only showing top 20 rows

