# Recommending Movies

In this track you will be introduced to the MovieLens dataset. You will walk through how to assess it's use for ALS, build out a full cross-validated ALS model on it, and learn how to evaluate it's performance. This will be the foundation for all subsequent ALS models you build using Pyspark.

## Preparing the environment

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import time

from typing import List, Tuple
from pprint import pprint
from environment import SEED
from sklearn.decomposition import NMF
from pyspark.sql import SparkSession, Row, DataFrame as SparkDataframe, functions as F
from pyspark.sql.types import (StructType, StructField,
                               DoubleType, IntegerType, StringType, TimestampType)
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

### Connect to Spark

In [2]:
spark = (SparkSession.builder
                     .master('local[*]') \
                     .appName('spark_application') \
                     .config("spark.sql.repl.eagerEval.enabled", True)  # eval DataFrame in notebooks
                     .config("spark.driver.memory", "10g")
                     .config("spark.driver.maxResultSize", "10G")
                     .getOrCreate())

sc = spark.sparkContext
sc.setSystemProperty('spark.executor.memory', '10G')
sc.setCheckpointDir("ml-checkpoint/")

print(f'Spark version: {spark.version}')

Spark version: 3.5.1


In [3]:
# Review current configuration
print(f"Current driver memory: {spark.conf.get('spark.driver.memory')}")
print(f"Current number of partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")

Current driver memory: 10g
Current number of partitions: 200


## Loading data
Source of datasets: [Kaggle](https://www.kaggle.com/c/msdchallenge/data)

Token to validate access can be generated following the instructions from [here](https://github.com/Kaggle/kaggle-api).
```
pip install kaggle
kaggle competitions download -c msdchallenge
```

### Songs

In [4]:
# Reading the file
schema_songs = StructType([
    StructField("songCode", StringType()),
    StructField("songId", IntegerType())
])
songs_data = spark.read.csv('data-sources/songs/songs.txt', header=False, sep=' ', schema=schema_songs)

# Reviewing the result
songs_data.createOrReplaceTempView("songs")
print(f'Dataframe shape: ({songs_data.count()}, {len(songs_data.columns)})')
songs_data.printSchema()
songs_data.limit(2)

Dataframe shape: (386213, 2)
root
 |-- songCode: string (nullable = true)
 |-- songId: integer (nullable = true)



songCode,songId
SOAAADD12AB018A9DD,1
SOAAADE12A6D4F80CC,2


### Disks (Song Details)

In [5]:
# Reading the file
schema_disks = StructType([
    StructField("soundtrack", StringType()),
    StructField("songCode", StringType()),
    StructField("singer", StringType()),
    StructField("songName", StringType())
])
disks_data = spark.read.csv('data-sources/songs/disks.txt', header=False, sep='<SEP>', schema=schema_disks)

# Cleaning and mutating/adding some colums
disks_data = disks_data.withColumn("songName",    # Removing special characters
                                   F.regexp_replace(F.col("songName"), "[^A-Za-z0-9\s]", ""))
disks_data = (disks_data.groupby('songCode').agg(F.last('singer').alias('singer'),   # Making unique records
                                                 F.last('songName').alias('songName')))
disks_data = disks_data.join(songs_data, 'songCode', 'left')  # Adding the Id to songs

# Reviewing the result
disks_data.createOrReplaceTempView("disks")
print(f'Dataframe shape: ({disks_data.count()}, {len(disks_data.columns)})')
disks_data.printSchema()
disks_data.limit(10)

Dataframe shape: (999056, 4)
root
 |-- songCode: string (nullable = true)
 |-- singer: string (nullable = true)
 |-- songName: string (nullable = true)
 |-- songId: integer (nullable = true)



songCode,singer,songName,songId
SOAAAKE12A67AD7460,Les Compagnons De...,Le Galrien,
SOAABAM12AB0180E94,NIGHT FLIGHT,Little Things,
SOAABGK12AB0185F7F,Les Trompettes Du...,La retraite de pi...,
SOAABLP12AB017D3FF,Ojm,The Sleeper,
SOAABSB12A8C143E55,UK Subs,Organised Crime,37.0
SOAACGG12A58A7A034,Matt Costa,Ballad Of Miss Kate,48.0
SOAACJX12AB018B34A,Norrisman,Welcome To Say,
SOAACUH12A8AE48B92,Infinite Mass,I Dont Care,
SOAACVP12A8C134567,Missing Persons,Rock And Roll Sus...,61.0
SOAADAS12A58A784EC,Fort Knox Five fe...,How to Start a Band,70.0


### Users

In [6]:
# Reading the file
schema_users = StructType([
    StructField("userCode", StringType())
])
users_data = spark.read.csv('data-sources/songs/users.txt', header=False, sep=' ', schema=schema_users)

# Mutating columns
users_data = users_data.coalesce(1).withColumn('userId', F.monotonically_increasing_id()).persist()

# Reviewing the result
users_data.createOrReplaceTempView("users")
print(f'Dataframe shape: ({users_data.count()}, {len(users_data.columns)})')
users_data.printSchema()
users_data.limit(2)

Dataframe shape: (110000, 2)
root
 |-- userCode: string (nullable = true)
 |-- userId: long (nullable = false)



userCode,userId
fd50c4007b68a3737...,0
d7083f5e1d50c2642...,1


### Ratings

In [7]:
# Reading the file
schema_ratings = StructType([
    StructField("userCode", StringType()),
    StructField("songCode", StringType()),
    StructField("numPlays", IntegerType())
])
ratings_data = spark.read.csv('data-sources/songs/ratings.txt', 
                            header=False, sep='\t', schema=schema_ratings)

# Adding Id's to Users and Songs
ratings_data = (ratings_data.join(songs_data, 'songCode', 'left')
                            .join(users_data, 'userCode', 'left')
                            .dropna())

# Taking just part of the data
ratings_data = (ratings_data.filter(F.col('userId').isin(ratings_data.groupby('userId').count()
                                                                     .sort('count', ascending=False)
                                                                     .limit(321).select('userId')
                                                                     .toPandas().userId.to_list()))
                            .filter(F.col('songId').isin(ratings_data.groupby('songId').count()
                                                                     .sort('count', ascending=False)
                                                                     .limit(729).select('songId')
                                                                     .toPandas().songId.to_list())))

# Reviewing the result
ratings_data.createOrReplaceTempView("ratings")
print(f'Dataframe shape: ({ratings_data.count()}, {len(ratings_data.columns)})')
ratings_data.printSchema()
ratings_data.limit(2)

Dataframe shape: (2390, 5)
root
 |-- userCode: string (nullable = true)
 |-- songCode: string (nullable = true)
 |-- numPlays: integer (nullable = true)
 |-- songId: integer (nullable = true)
 |-- userId: long (nullable = true)



userCode,songCode,numPlays,songId,userId
f5ecd768453281573...,SOEPZQS12A8C1436C7,16,74321,110
f5ecd768453281573...,SOFRQTD12A81C233C0,14,91177,110


### Movie Binary Ratings

In [8]:
# Reading the file
schema_bin_movies = StructType([
    StructField("userId", IntegerType()),
    StructField("movieId", IntegerType()),
    StructField("rating", DoubleType()),
    StructField("timestamp", IntegerType())
])
binary_movies_data = spark.read.csv('data-sources/movies/ratings.csv', header=True, schema=schema_bin_movies)

# Cleaning and mutating some columns
binary_movies_data = binary_movies_data.withColumn('timestamp', F.to_timestamp(F.from_unixtime('timestamp')))
date_range = binary_movies_data.select('timestamp').agg(F.min('timestamp'), F.max('timestamp')).collect()[0]
print(f"Date range: {date_range[0]} - {date_range[1]}")

# Taking just part of the data
binary_movies_data = (binary_movies_data.where('timestamp >= "2023-09-12"')
                            .withColumn('rating', F.lit(1)))
# binary_movies_data = binary_movies_data.sample(withReplacement=False, fraction=.05, seed=SEED)
binary_movies_data = (binary_movies_data.filter(F.col('userId')
                                                 .isin(binary_movies_data.groupby('userId').count()
                                                 .sort('count', ascending=False)
                                                 .limit(321).select('userId')
                                                 .toPandas().userId.to_list()))
                                        .filter(F.col('movieId')
                                                 .isin(binary_movies_data.groupby('movieId').count()
                                                 .sort('count', ascending=False)
                                                 .limit(729).select('movieId')
                                                 .toPandas().movieId.to_list())))

# Reviewing the result
binary_movies_data.createOrReplaceTempView("ratings")
print(f'Dataframe shape: ({binary_movies_data.count()}, {len(binary_movies_data.columns)})')
binary_movies_data.printSchema()
binary_movies_data.limit(2)

Date range: 1995-01-09 05:46:44 - 2023-10-12 20:29:07
Dataframe shape: (36471, 4)
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = false)
 |-- timestamp: timestamp (nullable = true)



userId,movieId,rating,timestamp
178,223,1,2023-10-04 18:28:39
178,260,1,2023-10-04 18:36:52


### Tables catalogue

In [9]:
spark.catalog.listTables()

[Table(name='disks', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='ratings', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='songs', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='users', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

## Util functions

### add_zeros_to_ratings

In [10]:
def add_zeros_to_ratings(df, userCol="userId", productCol="songId"):
    # Extracts distinct users
    users = df.select(userCol).distinct()
    print(f'{users.count()} unique users.')
    
    # Extracts distinct products
    products = df.select(productCol).distinct()
    print(f'{products.count()} unique products.')
    
    # Joins users and products, fills blanks with 0
    cross_join = (users.crossJoin(products)
                       .join(df, [userCol, productCol], "left")
                       .fillna(0))
    print(f'{products.count()*users.count()} total expected records.')
    
    return cross_join

# Content

## Introduction to the Million Songs Dataset

### Preparing ratings dataset

In [11]:
start = time.time()
df = ratings_data.drop(*['userCode', 'songCode'])
df_zeros = add_zeros_to_ratings(df)
df_zeros.sort('numPlays', ascending=False).show(3, truncate=False)
print(f'''
Total rows = {df_zeros.count()}

Consumed time: {(time.time() - start)/60} min
''')

296 unique users.
643 unique products.
190328 total expected records.
+------+------+--------+
|userId|songId|numPlays|
+------+------+--------+
|94259 |25150 |97      |
|64180 |319911|90      |
|4770  |301674|72      |
+------+------+--------+
only showing top 3 rows


Total rows = 190328

Consumed time: 0.13290206591288248 min



### MovieLens sparsity

To measure how sparse is the data.
$$ Sparcity = 1 - \frac{Number Of Ratings In Matrix}{Number Of Users \times Number Of Songs} $$

In [12]:
# Calculating MovieLens sparsity
number_of_ratings = df.count()
number_of_users = df.select('userId').distinct().count()
number_of_movies = df.select('songId').distinct().count()

sparsity = 1 - (number_of_ratings / (number_of_users * number_of_movies))
print('Million Songs Dataset Sparsity:', sparsity)

Million Songs Dataset Sparsity: 0.9874427304442857


## Ex. 1 - Grouped summary statistics

In this exercise, we are going to combine the `.groupBy()` and `.filter()` methods that you've used previously to calculate the `min()` and `avg()` number of users that have rated each song, and the `min()` and `avg()` number of songs that each user has rated.

Because our data now includes 0's for items not yet consumed, we'll need to `.filter()` them out when doing grouped summary statistics like this. The msd dataset is provided for you here. The `col()`, `min()`, and `avg()` functions from `pyspark.sql.functions` have been imported for you.

**Instructions:**

1. As an example, the `.filter()`, `.groupBy()` and `.count()` methods are applied to the msd dataset along with `.select()` and `min()` to return the smallest number of ratings that any song in the dataset has received. Use this as a model to calculate the `avg()` number of implicit ratings the songs in msd have received.
2. Using the same model, find the `min()` and `avg()` number of implicit ratings that userIds have provided in the msd dataset.

In [13]:
# Min num implicit ratings for a song
print("Minimum implicit ratings for a song: ")
df_zeros.filter(F.col("numPlays") > 0).groupBy("songId").count().select(F.min("count")).show()

# Avg num implicit ratings per songs
print("Average implicit ratings per song: ")
df_zeros.filter(F.col("numPlays") > 0).groupBy("songId").count().select(F.avg("count")).show()

# Min num implicit ratings from a user
print("Minimum implicit ratings from a user: ")
df_zeros.filter(F.col("numPlays") > 0).groupBy("userId").count().select(F.min("count")).show()

# Avg num implicit ratings for users
print("Average implicit ratings per user: ")
df_zeros.filter(F.col("numPlays") > 0).groupBy("userId").count().select(F.avg("count")).show()

Minimum implicit ratings for a song: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per song: 
+------------------+
|        avg(count)|
+------------------+
|3.7169517884914463|
+------------------+

Minimum implicit ratings from a user: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per user: 
+-----------------+
|       avg(count)|
+-----------------+
|8.074324324324325|
+-----------------+



In [14]:
# Min num implicit ratings for a song
print("Minimum implicit ratings for a song: ")
df.groupBy("songId").count().select(F.min("count")).show()

# Avg num implicit ratings per songs
print("Average implicit ratings per song: ")
df.groupBy("songId").count().select(F.avg("count")).show()

# Min num implicit ratings from a user
print("Minimum implicit ratings from a user: ")
df.groupBy("userId").count().select(F.min("count")).show()

# Avg num implicit ratings for users
print("Average implicit ratings per user: ")
df.groupBy("userId").count().select(F.avg("count")).show()

Minimum implicit ratings for a song: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per song: 
+------------------+
|        avg(count)|
+------------------+
|3.7169517884914463|
+------------------+

Minimum implicit ratings from a user: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Average implicit ratings per user: 
+-----------------+
|       avg(count)|
+-----------------+
|8.074324324324325|
+-----------------+



## Ex. 2 - Add zeros

Many recommendation engines use implicit ratings. In many cases these datasets don't include behavior counts for items that a user has never purchased. In these cases, you'll need to add them and include zeros. The dataframe Z is provided for you. It contains userId's, productId's and num_purchases which is the number of times a user has purchased a specific product.

**Instructions:**

1. Take a look at the dataframe `Z` using the `.show()` method.
2. Extract the distinct `userId`s and `productId`s from `Z` using the `.distinct()` method. Call the results `users` and `products` respectively.
3. Perform a `.crossJoin()` on the `users` and `products` dataframes. Call the result `cj`.
4. `"left"` join `cj` to the original ratings dataframe `Z` on `["userId", "productId"]`. Call the `.fillna(0)` method on the result to fill in the blanks with zeros. Call the result `Z_expanded`.

In [15]:
# Loading the data
Z = spark.createDataFrame([Row(userId=2112, productId=777, num_purchases=1),
                           Row(userId=7, productId=44, num_purchases=23),
                           Row(userId=1132, productId=227, num_purchases=9),
                           Row(userId=686, productId=1981, num_purchases=2),
                           Row(userId=42, productId=2390, num_purchases=5),
                           Row(userId=13, productId=1662, num_purchases=21),
                           Row(userId=2112, productId=1492, num_purchases=8),
                           Row(userId=22, productId=1811, num_purchases=96)])

# View the data
Z.show(3)

+------+---------+-------------+
|userId|productId|num_purchases|
+------+---------+-------------+
|  2112|      777|            1|
|     7|       44|           23|
|  1132|      227|            9|
+------+---------+-------------+
only showing top 3 rows



In [16]:
Z_expanded = add_zeros_to_ratings(Z, userCol="userId", productCol="productId")
print(f'Total rows = {Z_expanded.count()}')
Z_expanded.sort('num_purchases', ascending=False).show(3, truncate=False)

7 unique users.
8 unique products.
56 total expected records.
Total rows = 56
+------+---------+-------------+
|userId|productId|num_purchases|
+------+---------+-------------+
|22    |1811     |96           |
|7     |44       |23           |
|13    |1662     |21           |
+------+---------+-------------+
only showing top 3 rows



# Implicit Ratings using user behavior counts

## Building the model

In [17]:
# Load the data
df = ratings_data.select('userId', 'songId', 'numPlays').repartition(5)
df.show(2)

# Split into train and test set.
df_train, df_test = df.randomSplit([0.8, 0.2], seed=SEED)
print(f"Training set: {df_train.count()}, Testing set: {df_test.count()}")

# Build ALS model
als_model = ALS(userCol="userId", itemCol="songId", ratingCol="numPlays",
                rank=10, maxIter=10, regParam=.1, alpha=10,
                nonnegative=True, coldStartStrategy="drop", implicitPrefs=True, seed=SEED).fit(df_train)

# Make some predictions
prediction = als_model.transform(df_test)
prediction.show(15, truncate=False)
pred_range = prediction.agg(F.min('prediction').alias('min'), 
                            F.max('prediction').alias('max')).collect()[0].asDict()
threshold = .5
print(f"Min predidction: {pred_range['min']}, Max prediction: {pred_range['max']}, Threshold: {threshold}")

# Setting a threshold to interpret the prediction
prediction = prediction.withColumn('Recommended', 
                                   F.when(F.col('prediction')>threshold, 'Yes').otherwise('No'))
prediction.show(15, truncate=False)

+------+------+--------+
|userId|songId|numPlays|
+------+------+--------+
|  3702|170306|       1|
|  4748|123630|       3|
+------+------+--------+
only showing top 2 rows

Training set: 1912, Testing set: 478
+------+------+--------+------------+
|userId|songId|numPlays|prediction  |
+------+------+--------+------------+
|110   |189050|18      |0.022847397 |
|1115  |112304|2       |0.0         |
|1198  |55106 |2       |0.012004802 |
|3040  |289658|2       |0.27410442  |
|3702  |349271|1       |0.0069155516|
|4608  |54368 |1       |0.0         |
|4770  |52176 |2       |1.4001423   |
|5401  |217471|8       |0.60500014  |
|6401  |91773 |1       |0.004497187 |
|6401  |329834|1       |0.22024302  |
|7494  |51296 |1       |0.0         |
|9641  |353640|1       |0.031095814 |
|14687 |25323 |16      |1.0028197   |
|14687 |353700|1       |0.58127636  |
|19497 |207916|1       |0.12981367  |
+------+------+--------+------------+
only showing top 15 rows

Min predidction: 0.0, Max prediction: 1.

## With ROEM (Rank Ordering Error Metric)

In [18]:
def ROEM_cv(ratings_df: SparkDataframe, 
            userCol: str="userId", itemCol: str="songId", ratingCol: str="numPlays",
            ranks: List=[10, 50, 100, 150, 200],
            maxIters: List=[10, 25, 50, 100, 200, 400],
            regParams: List=[.05, .1, .15],
            alphas: List=[10, 40, 80, 100],
            seed=0) -> Tuple[ALSModel, SparkDataframe]:
    '''
    This function is an Alternate cross validation approach for ALS models with implicit ratings 
    utilizing an expected percent ranking metric for model performance evaluation.
    The ratings df should contain all possible combinations between users and items, with rating = 0
    for those cases where there is no registered rating.
    Source: https://github.com/jamenlong/ALS_expected_percent_rank_cv/blob/master/ROEM_cv.py
    By [jamenlong](https://github.com/jamenlong)
    '''
    print(f'''
    Total models to create: {len(ranks) * len(maxIters) * len(regParams) * len(alphas)}
    ''')
    ratings_df = ratings_df.orderBy(F.rand())  # Shuffling to ensure randomness

    # Building train and validation test sets
    train, validate = ratings_df.randomSplit([0.8, 0.2], seed=seed)

    # Building 5 folds within the training set.
    test1, test2, test3, test4, test5 = train.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed=seed)
    train1 = test2.union(test3).union(test4).union(test5)
    train2 = test3.union(test4).union(test5).union(test1)
    train3 = test4.union(test5).union(test1).union(test2)
    train4 = test5.union(test1).union(test2).union(test3)
    train5 = test1.union(test2).union(test3).union(test4)

    # Creating variables that will be replaced by the best model's hyperparameters for subsequent printing
    best_validation_roem = 9999999999999
    best_rank, best_maxIter, best_regParam, best_alpha, best_model, best_predictions = 0, 0, 0, 0, 0, 0

    # Looping through each combindation of hyperparameters to ensure all combinations are tested.
    for r in ranks:
        for mi in maxIters:
            for rp in regParams:
                for a in alphas:
                    # Create ALS model
                    als = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                              rank=r, maxIter=mi, regParam=rp, alpha=a,
                              coldStartStrategy="drop", nonnegative=True, implicitPrefs=True)

                    # Fit model to each fold in the training set
                    model1 = als.fit(train1)
                    model2 = als.fit(train2)
                    model3 = als.fit(train3)
                    model4 = als.fit(train4)
                    model5 = als.fit(train5)

                    # Generating model's predictions for each fold in the test set
                    predictions1 = model1.transform(test1)
                    predictions2 = model2.transform(test2)
                    predictions3 = model3.transform(test3)
                    predictions4 = model4.transform(test4)
                    predictions5 = model5.transform(test5)

                    # Expected percentile rank error metric function
                    def ROEM(predictions, userCol=userCol, itemCol=itemCol, ratingCol=ratingCol):
                        # Creates table that can be queried
                        predictions.createOrReplaceTempView("predictions_temp")
                        
                        # Sum of total number of plays of all songs
                        denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]
                        
                        # Calculating rankings of songs predictions by user
                        spark.sql(f'''
                            SELECT {userCol},
                                   {ratingCol},
                                   PERCENT_RANK() OVER (PARTITION BY {userCol} ORDER BY prediction DESC) AS rank
                            FROM predictions_temp
                        ''').createOrReplaceTempView("rankings_temp")
                        
                        # Multiplies the rank of each song by the number of plays and adds the products together
                        numerator = spark.sql(f'''
                            SELECT SUM({ratingCol} * rank) FROM rankings_temp
                        ''').collect()[0][0]
                        performance = numerator/denominator
                        return performance

                    # Calculating expected percentile rank error metric for the model on each 
                    # fold's prediction set
                    roem1 = ROEM(predictions1)
                    roem2 = ROEM(predictions2)
                    roem3 = ROEM(predictions3)
                    roem4 = ROEM(predictions4)
                    roem5 = ROEM(predictions5)
                    
                    #Validating the model's performance on the validation set
                    validation_model = als.fit(train)
                    validation_predictions = validation_model.transform(validate)
                    validation_roem = ROEM(validation_predictions)
                    
                    # Printing the model's performance on each fold
                    print(f'''
                    Model Parameters:
                                 Rank: {r}
                              MaxIter: {mi}
                             RegParam: {rp}
                                Alpha: {a}
                    Train Rank Errors: [{roem1:.3f}, {roem2:.3f}, {roem3:.3f}, {roem4:.3f}, {roem5:.3f}]
                       Val Rank Error: {validation_roem}
                    ''')

                    # Filling in final hyperparameters with those of the best-performing model
                    if validation_roem < best_validation_roem:
                        best_validation_roem = validation_roem
                        best_rank = r
                        best_maxIter = mi
                        best_regParam = rp
                        best_alpha = a
                        best_model = validation_model
                        best_predictions = validation_predictions

    # Printing best model's expected percentile rank and hyperparameters
    print(f'''
    Best Model
    --------------------------------------
    Rank Error: {best_validation_roem}
          Rank: {best_rank}
       MaxIter: {best_maxIter}
      RegParam: {best_regParam}
         Alpha: {best_alpha}
    ''')
    return best_model, best_predictions

In [19]:
start = time.time()

df = ratings_data.select('userId', 'songId', 'numPlays').repartition(5)
df_zeros = add_zeros_to_ratings(df)

best_model, best_predictions = ROEM_cv(ratings_df=df_zeros,
                                       userCol='userId',
                                       ranks=[10, 20],
                                       maxIters=[10],
                                       regParams=[.1,],
                                       alphas=[5, 10],
                                       seed=SEED)
print(f'''
Best model: {best_model}
''')
best_predictions.sort('numPlays', ascending=False).show(5, truncate=False)
best_predictions.sort('numPlays').show(5, truncate=False)
print(f'''
Consumed time: {(time.time() - start)/60} min.
''')

296 unique users.
643 unique products.
190328 total expected records.

    Total models to create: 4
    

                    Model Parameters:
                                 Rank: 10
                              MaxIter: 10
                             RegParam: 0.1
                                Alpha: 5
                    Train Rank Errors: [0.225, 0.199, 0.226, 0.138, 0.155]
                       Val Rank Error: 0.10898280437892374
                    

                    Model Parameters:
                                 Rank: 10
                              MaxIter: 10
                             RegParam: 0.1
                                Alpha: 10
                    Train Rank Errors: [0.140, 0.171, 0.147, 0.152, 0.186]
                       Val Rank Error: 0.07936771917908436
                    

                    Model Parameters:
                                 Rank: 20
                              MaxIter: 10
                             RegParam: 0.1
   

# Binary Ratings using binary user behavior

## With ROEM (Rank Ordering Error Metric)

In [20]:
# Reviewing the data
binary_movies_data.show(3)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|   178|    223|     1|2023-10-04 18:28:39|
|   178|    260|     1|2023-10-04 18:36:52|
|   178|    318|     1|2023-10-04 18:56:06|
+------+-------+------+-------------------+
only showing top 3 rows



In [21]:
start = time.time()

df = binary_movies_data.select('userId', 'movieId', 'rating').repartition(5)
df_zeros = add_zeros_to_ratings(df, userCol='userId', productCol='movieId')

best_model, best_predictions = ROEM_cv(ratings_df=df_zeros,
                                       userCol='userId',
                                       itemCol='movieId',
                                       ratingCol='rating',
                                       ranks=[10],
                                       maxIters=[5],
                                       regParams=[.1,],
                                       alphas=[5, 10],
                                       seed=SEED)
print(f'''
Best model: {best_model}
''')
best_predictions.show(5, truncate=False)
print(f'''
Consumed time: {(time.time() - start)/60} min.
''')

320 unique users.
728 unique products.
232960 total expected records.

    Total models to create: 2
    

                    Model Parameters:
                                 Rank: 10
                              MaxIter: 5
                             RegParam: 0.1
                                Alpha: 5
                    Train Rank Errors: [0.270, 0.268, 0.268, 0.268, 0.270]
                       Val Rank Error: 0.2598837376016439
                    

                    Model Parameters:
                                 Rank: 10
                              MaxIter: 5
                             RegParam: 0.1
                                Alpha: 10
                    Train Rank Errors: [0.266, 0.261, 0.275, 0.263, 0.278]
                       Val Rank Error: 0.25293775499993015
                    

    Best Model
    --------------------------------------
    Rank Error: 0.25293775499993015
          Rank: 10
       MaxIter: 5
      RegParam: 0.1
         Alpha: 10
 

## Close session

In [22]:
spark.stop()