# Reading movie rating data, ranking the most popular movies by average rating, and showing them

## Importing modules

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import desc, avg, mean, count
from pyspark.sql.functions import round as _round

### Starting SparkSession

In [68]:
# Reuse the active session if one exists, otherwise start one with a local master
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ratingMovies")
    .getOrCreate()
)

### Defining the movie schemas and reading the data

In [69]:
#Schema of the ratings file: one row per (user, movie) rating with its timestamp
moviesSchema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("movieID", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("timestamp", LongType(), True)
)

#Schema of the movie names file: only the ID and title columns are declared
moviesNamesSchema = (
    StructType()
    .add("movieID", IntegerType(), True)
    .add("movieTitle", StringType(), True)
)

#Loading the tab-separated ratings file (user, movie, rating, timestamp) with the schema above
movies = spark.read.csv("data/ml-100k/u.data", schema=moviesSchema, sep="\t")

#Reading files and creating a dataset with movies ID and title.
#The MovieLens 100k u.item file is ISO-8859-1 (Latin-1) encoded, so the charset is set
#explicitly: with Spark's default UTF-8 decoding, titles that contain accented
#characters come out garbled.
movieNames = (
    spark.read
    .option("sep", "|")
    .option("encoding", "ISO-8859-1")
    .schema(moviesNamesSchema)
    .csv("data/ml-100k/u.item")
)

### Grouping movies by their ID and joining the datasets

In [72]:
#A movie needs MORE than this many ratings to be kept in the ranking
MIN_RATINGS = 100

#Grouping movies by their ID once, computing both the number of ratings and the
#average rating (rounded to 2 decimals) in a single aggregation instead of two
#separate groupBy passes joined back together.
#Note: avg and mean are interchangeable here; in PySpark mean is an alias of avg.
movieStats = movies.groupBy("movieID").agg(
    count("*").alias("count"),
    _round(avg("rating"), 2).alias("rating"),
)

#Per-movie vote counts and per-movie average ratings, kept under their original names
movieCount = movieStats.select("movieID", "count")
avgMovies = movieStats.select("movieID", "rating")

#Removing movies with too few votes
knownMovies = movieCount.filter(movieCount["count"] > MIN_RATINGS)

#Ratings of the popular movies only; the same filter on the combined stats replaces the join
groupedMovies = (
    movieStats
    .filter(movieStats["count"] > MIN_RATINGS)
    .select("movieID", "rating", "count")
)

#Attaching the titles so each rating is shown next to a movie name;
#the left join keeps every rated movie even when no title row matches
titledMovies = groupedMovies.join(movieNames, on="movieID", how="left")
bigMovies = titledMovies.select("movieTitle", "rating")


## Showing results

In [73]:
#Ranking from the highest to the lowest average rating and printing the top rows
rankedMovies = bigMovies.orderBy("rating", ascending=False)
rankedMovies.show()

+--------------------+------+
|          movieTitle|rating|
+--------------------+------+
|Close Shave, A (1...|  4.49|
|Schindler's List ...|  4.47|
|Wrong Trousers, T...|  4.47|
|   Casablanca (1942)|  4.46|
|Shawshank Redempt...|  4.45|
|Usual Suspects, T...|  4.39|
|  Rear Window (1954)|  4.39|
|    Star Wars (1977)|  4.36|
| 12 Angry Men (1957)|  4.34|
|Silence of the La...|  4.29|
|One Flew Over the...|  4.29|
| Citizen Kane (1941)|  4.29|
|To Kill a Mocking...|  4.29|
|North by Northwes...|  4.28|
|Godfather, The (1...|  4.28|
|Secrets & Lies (1...|  4.27|
|Good Will Hunting...|  4.26|
|Manchurian Candid...|  4.26|
|Raiders of the Lo...|  4.25|
|Dr. Strangelove o...|  4.25|
+--------------------+------+
only showing top 20 rows



In [74]:
# Stop the SparkSession created at the top of the notebook now that the results have been shown
spark.stop()