In [2]:
# Uncomment this line to run in Colab
#!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.1 -s 4.0.1 -g

--2022-07-25 22:59:19--  https://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-07-25 22:59:20--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2022-07-25 22:59:20 (39.7 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.1 and Spark NLP 4.0.1
setup Colab for PySpark 3.2.1 and Spark NLP

In [95]:
from pathlib import Path
import requests
import zipfile
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, desc, min, avg, max, split, explode, regexp_extract

In [4]:
data_dir = Path('./data')
data_dir.mkdir(exist_ok=True)

ml_data_dir = data_dir/'ml-latest-small'

In [5]:
if not ml_data_dir.exists():
    zip_output_path = data_dir/"ml-latest-small.zip"
    data_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
    response = requests.get(data_url)
    open(zip_output_path, "wb").write(response.content)
    with zipfile.ZipFile(zip_output_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

In [7]:
# Uncomment this code block to run in colab

# import findspark
# findspark.init()

In [16]:
spark = SparkSession \
            .builder \
            .appName("movielens-nb") \
            .master("spark://spark-master:7077") \
            .config("spark.executor.memory", "512m") \
            .getOrCreate() \

In [None]:
# Use this to run in Colab

# spark = SparkSession \
#             .builder \
#             .appName("movielens-nb") \
#             .master("local") \
#             .config("spark.executor.memory", "512m") \
#             .getOrCreate() \

In [66]:
movies_dataset_path = ml_data_dir/"movies.csv"
ratings_dataset_path = ml_data_dir/"ratings.csv"
tags_dataset_path = ml_data_dir/"tags.csv"

assert movies_dataset_path.exists()
assert ratings_dataset_path.exists()
assert tags_dataset_path.exists()

In [67]:
movies = spark.read.csv(str(movies_dataset_path), header=True)
ratings = spark.read.csv(str(ratings_dataset_path), header=True)
tags = spark.read.csv(str(tags_dataset_path), header=True)

## How many movies of genre `drama` are there?

In [31]:
movies.where(movies.genres.contains('Drama')).count()

4361

## How many unique movies are rated, how many are not rated?

In [44]:
print(f"Total number of movies in the dataset: {movies.count()}")

Total number of movies in the dataset: 9742


In [45]:
movie_ratings = movies.join(ratings, on="movieId", how="outer")

In [46]:
num_unique_rated_movies = movie_ratings \
                          .where(movie_ratings.rating.isNotNull()) \
                          .select("movieId") \
                          .distinct() \
                          .count()

print(f"Number of unique rated movies: {num_unique_rated_movies}")

Number of unique rated movies: 9724


In [47]:
num_unique_unrated_movies = movie_ratings \
                          .where(movie_ratings.rating.isNull()) \
                          .select("movieId") \
                          .distinct() \
                          .count()

print(f"Number of unique unrated movies: {num_unique_unrated_movies}")

Number of unique unrated movies: 18


## Who gave the most ratings, how many rates did the person make?

In [53]:
ratings \
  .groupBy("userId") \
  .agg(count("movieId").alias("numRatings")) \
  .sort(desc("numRatings")) \
  .show(1)

+------+----------+
|userId|numRatings|
+------+----------+
|   414|      2698|
+------+----------+
only showing top 1 row



## Compute min, average, max rating per movie.

In [59]:
ratings \
  .groupBy("movieId") \
  .agg(min("rating").alias("minRating"),
       max("rating").alias("maxRating"),
       avg("rating").alias("avgRating")) \
  .show(20)

+-------+---------+---------+------------------+
|movieId|minRating|maxRating|         avgRating|
+-------+---------+---------+------------------+
|      1|      0.5|      5.0|3.9209302325581397|
|     10|      0.5|      5.0| 3.496212121212121|
|    100|      1.0|      4.0|2.7857142857142856|
| 100044|      4.0|      4.0|               4.0|
| 100068|      3.5|      3.5|               3.5|
| 100083|      2.0|      5.0|               3.5|
| 100106|      3.5|      3.5|               3.5|
| 100159|      4.5|      4.5|               4.5|
| 100163|      0.5|      4.5|               2.9|
| 100194|      4.5|      4.5|               4.5|
| 100226|      1.5|      1.5|               1.5|
| 100277|      4.0|      4.0|               4.0|
|   1003|      2.0|      3.5|               2.5|
| 100302|      3.0|      3.0|               3.0|
| 100304|      3.0|      3.0|               3.0|
| 100306|      3.5|      3.5|               3.5|
| 100326|      2.5|      2.5|               2.5|
| 100383|      3.5| 

## Output data-set containing users that have rated a movie but not tagged it.

In [72]:
# This join gives us the rows in ratings such that its `userId, movieId`
# combo does not exist in the `tags` table
rate_without_tagging = ratings.join(
    tags,
    how='left_anti',
    on=['userId', 'movieId']
).select('userId').distinct()

In [73]:
print(f'{rate_without_tagging.count()} users rated a movie without tagging it.')

610 users rated a movie without tagging it.


## Output data-set containing users that have rated AND tagged a movie.

In [74]:
rate_and_tag_users = ratings.join(
    tags,
    how='inner',
    on=['userId', 'movieId']
).select('userId').distinct()

In [76]:
print(f'{rate_and_tag_users.count()} users rated a movie and also tagged it.')

54 users rated a movie and also tagged it.


In [84]:
print(f"Total number of users that left ratings: {ratings.select('userId').distinct().count()}")

Total number of users that left ratings: 610


## Output data-set showing the number of movies per genre, per year

In [108]:
movies \
  .withColumn("genre_array", split(movies.genres, "\|")) \
  .withColumn('genre', explode(col('genre_array'))) \
  .withColumn('year', regexp_extract(movies.title, '\((\d+)\)', 1)) \
  .groupBy('genre', 'year') \
  .agg(count("movieId").alias("numMovies")) \
  .orderBy(['genre', 'year'], ascending=False) \
  .show(10) 

+-------+----+---------+
|  genre|year|numMovies|
+-------+----+---------+
|Western|2017|        2|
|Western|2016|        1|
|Western|2015|        4|
|Western|2014|        3|
|Western|2013|        1|
|Western|2012|        1|
|Western|2011|        2|
|Western|2010|        5|
|Western|2009|        2|
|Western|2008|        5|
+-------+----+---------+
only showing top 10 rows

