In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.ml.recommendation import ALS

from pyspark.sql.functions import lit
from pyspark.sql.functions import isnan, when, count, col

from pyspark.sql import SparkSession

# Reuse the active SparkSession if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Load the movie and user tables. Both files are '::'-delimited and have no
# header row, so the column names are supplied here; each table is indexed by
# its id column. The multi-character separator is parsed by the python engine.
movie_columns = ['movie_id', 'movie_title', 'genre']
user_columns = ['user_id', 'sex', 'age', 'occupation', 'zip_code']

movies = pd.read_csv(
    'data/movies.dat',
    sep='::',
    names=movie_columns,
    index_col='movie_id',
    engine='python',
)
users = pd.read_csv(
    'data/users.dat',
    sep='::',
    names=user_columns,
    index_col='user_id',
    engine='python',
)

In [3]:
# Load the ratings and turn the epoch-second `timestamp` column into datetimes.
ratings = pd.read_csv('data/training.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s')
)

In [4]:
# Create a TRAINING and VALIDATION set, ordered by date: the training set is
# the earliest 80% of the ratings and the validation set is the latest 20%.
ratings = ratings.sort_values(by='timestamp').reset_index(drop=True)

# Split by integer position with a single cut point. Slicing `.loc` with the
# float labels `len(ratings)*.8-1` and `len(ratings)*.8` can lose a row
# whenever 80% of the row count is not a whole number (e.g. with 11 rows the
# row labelled 8 lands in neither set).
split_at = int(len(ratings) * 0.8)
training = ratings.iloc[:split_at].copy()
validation = ratings.iloc[split_at:].copy()

In [5]:
# Convert the pandas training split into a Spark DataFrame (the input to ALS).
training_df = spark.createDataFrame(training)

In [6]:
# Convert the pandas validation split into a Spark DataFrame for scoring.
validation_df = spark.createDataFrame(validation)

### Create an untrained ALS model.

In [7]:
# Untrained ALS estimator over the (user, movie, rating) columns.
als_model = ALS(
    userCol='user',
    itemCol='movie',
    ratingCol='rating',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    # Spelled out on purpose (this is Spark's default): users or movies that
    # were not in the training data get a NaN prediction instead of being
    # dropped, which is what the NaN investigation below looks at.
    coldStartStrategy='nan',
    # Fixed seed so the factor initialisation is repeatable between runs.
    seed=42)

In [9]:
# Train the ALS model on the training split. We'll call the trained model
# `recommender`.
recommender = als_model.fit(training_df)

In [10]:
# Get the model's predictions on the training set. `transform` returns the
# input rows with a `prediction` column added; no error is computed here.
yhat_train = recommender.transform(training_df)

In [13]:
# Summary statistics (count, mean, stddev, min, max) of the training predictions.
yhat_train.describe().show()

+-------+------------------+------------------+------------------+------------------+
|summary|              user|             movie|            rating|        prediction|
+-------+------------------+------------------+------------------+------------------+
|  count|            640000|            640000|            640000|            640000|
|   mean|   3932.6427484375|   1844.2703734375|      3.6065828125| 3.449572548599867|
| stddev|1233.3827540229956|1082.5269074473076|1.1133907292659646|0.6634801750013999|
|    min|              1572|                 1|                 1|        0.42859814|
|    max|              6040|              3952|                 5|          5.477794|
+-------+------------------+------------------+------------------+------------------+



In [14]:
# Get the model's predictions on the validation set. `transform` returns the
# input rows with a `prediction` column added; no error is computed here.
yhat_validation = recommender.transform(validation_df)

In [16]:
# Preview the first rows of the validation predictions.
yhat_validation.show()

+----+-----+------+-------------------+----------+
|user|movie|rating|          timestamp|prediction|
+----+-----+------+-------------------+----------+
| 673|  148|     5|2000-11-30 21:47:04|       NaN|
|1242|  148|     3|2000-11-22 16:19:36|       NaN|
|1069|  148|     2|2000-11-23 02:05:35|       NaN|
|1605|  148|     2|2000-11-22 21:57:01| 2.2204018|
|1150|  148|     2|2000-11-22 06:38:26|       NaN|
| 660|  463|     3|2000-12-01 17:03:09|       NaN|
|1069|  463|     2|2000-11-23 02:02:18|       NaN|
|1146|  463|     2|2000-11-23 00:33:30|       NaN|
| 746|  463|     1|2000-11-29 04:05:54|       NaN|
|1980|  463|     2|2000-11-26 17:52:43| 2.3692527|
|1395|  471|     5|2000-11-23 18:51:07|       NaN|
|1303|  471|     4|2000-11-21 06:30:07|       NaN|
|1199|  471|     3|2000-11-22 19:09:41|       NaN|
|1404|  471|     3|2000-11-21 00:03:22|       NaN|
|1441|  471|     5|2000-11-20 22:23:56|       NaN|
|1496|  471|     2|2000-11-20 20:52:58|       NaN|
|1156|  471|     3|2000-11-22 0

In [15]:
# Summary statistics (count, mean, stddev, min, max) of the validation predictions.
yhat_validation.describe().show()

+-------+-----------------+------------------+------------------+----------+
|summary|             user|             movie|            rating|prediction|
+-------+-----------------+------------------+------------------+----------+
|  count|           160000|            160000|            160000|    160000|
|   mean|    1284.91819375|      1869.2047875|         3.5260625|       NaN|
| stddev|516.1356466463282|1103.7634605229894|1.1456347363305557|       NaN|
|    min|              636|                 1|                 1| 0.4639782|
|    max|             5996|              3952|                 5|       NaN|
+-------+-----------------+------------------+------------------+----------+



Looks like we have NaNs in the validation predictions. Let's investigate that.

In [17]:
# Pull both Spark prediction frames into pandas for closer inspection.
# NOTE(review): `yhat_validatin_df` is misspelled ("validatin"); later cells
# refer to it by this exact name, so a rename has to be done everywhere at once.
yhat_training_df = yhat_train.toPandas()
yhat_validatin_df = yhat_validation.toPandas()

In [18]:
# Display the training predictions (pandas frame).
yhat_training_df

Unnamed: 0,user,movie,rating,timestamp,prediction
0,4227,148,2,2000-08-07 14:48:44,2.116618
1,3184,148,4,2000-09-11 21:49:13,3.317677
2,4784,148,3,2000-09-26 20:36:10,2.891599
3,2383,148,2,2000-11-16 23:34:14,2.320556
4,3539,148,3,2000-08-22 08:20:08,2.761600
...,...,...,...,...,...
639995,1753,3910,4,2000-11-20 06:32:59,3.691639
639996,2825,3910,3,2000-11-02 09:10:54,3.840279
639997,5242,3910,1,2000-10-16 22:18:49,2.315363
639998,2507,3910,2,2000-11-13 01:04:25,3.993294


In [19]:
# Display the validation predictions (pandas frame).
yhat_validatin_df

Unnamed: 0,user,movie,rating,timestamp,prediction
0,673,148,5,2000-11-30 21:47:04,
1,1242,148,3,2000-11-22 16:19:36,
2,1069,148,2,2000-11-23 02:05:35,
3,1605,148,2,2000-11-22 21:57:01,2.220402
4,1150,148,2,2000-11-22 06:38:26,
...,...,...,...,...,...
159995,1067,3910,5,2000-11-23 02:02:44,
159996,670,3910,4,2000-11-30 23:57:50,
159997,745,3910,1,2000-11-29 02:58:12,
159998,1138,3910,5,2000-11-22 07:06:19,


In [20]:
# Number of validation rows whose prediction is missing (NaN).
missing_prediction_mask = yhat_validatin_df['prediction'].isna()
count_nans_validation = missing_prediction_mask.sum()

In [23]:
# Fraction of validation rows whose prediction is NaN.
count_nans_validation / len(yhat_validatin_df)

0.792575

ROUGHLY 80 PERCENT (79.3%) OF OUR VALIDATION PREDICTIONS ARE NaN!!!

Let's look at the ones that it did predict to get an idea of how it's doing.

In [24]:
# Keep only the validation rows that received a prediction. `dropna` is
# restricted to the 'prediction' column so that a missing value in any other
# column cannot silently remove rows from the evaluation.
yhat_validation_df_no_nans = yhat_validatin_df.dropna(subset=['prediction']).copy()

In [26]:
# Per-row squared difference between the actual rating and the prediction.
residual = yhat_validation_df_no_nans['rating'] - yhat_validation_df_no_nans['prediction']
yhat_validation_df_no_nans['squared_error'] = residual ** 2

In [27]:
# Display the non-NaN validation predictions with their squared errors.
yhat_validation_df_no_nans

Unnamed: 0,user,movie,rating,timestamp,prediction,squared_error
3,1605,148,2,2000-11-22 21:57:01,2.220402,0.048577
9,1980,463,2,2000-11-26 17:52:43,2.369253,0.136348
22,4653,471,3,2000-11-29 21:42:53,3.124579,0.015520
25,2414,471,4,2000-11-22 18:43:49,3.575513,0.180189
40,2185,471,4,2000-11-27 22:19:38,3.615629,0.147741
...,...,...,...,...,...,...
159965,1843,3910,3,2000-11-21 02:03:07,2.420907,0.335349
159978,3821,3910,2,2000-11-20 21:41:45,2.963575,0.928476
159981,1791,3910,3,2000-11-21 00:50:41,3.428285,0.183428
159984,4790,3910,5,2000-11-26 05:35:15,3.503697,2.238922


In [28]:
# Calculate the RMSE of the validation predictions, using only the rows that
# actually received a prediction (the NaN rows were dropped above).
# `Series.mean()` is vectorised; the builtin `sum` iterates element by element.
rmse = np.sqrt(yhat_validation_df_no_nans['squared_error'].mean())
print(f'Root Mean Squared Error of Predictions: {rmse:.2f}')

Root Mean Squared Error of Predictions: 0.92


In [29]:
# Sample standard deviation of the actual ratings and of the predictions, as a
# yardstick for the RMSE. `Series.std()` gives the same statistic that
# `describe()` reports, without summarising the whole frame twice.
std_rating = yhat_validation_df_no_nans['rating'].std()
std_pred = yhat_validation_df_no_nans['prediction'].std()

print(f'Standard Deviation of Rating: {std_rating:.2f}')
print(f'Standard Deviation of Prediction: {std_pred:.2f}')

Standard Deviation of Rating: 1.12
Standard Deviation of Prediction: 0.69


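Not terrible... On the rows it could predict, the RMSE (0.92) is lower than the standard deviation of the actual ratings (1.12), so the model does better than always predicting the mean rating.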

### Why are there so many NaNs?

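NaNs occur when the trained model hasn't seen a user OR a movie yet: it has no learned factors for them, so it cannot produce a prediction (the cold-start problem). Because the validation set is the most recent 20% of the ratings, it most likely contains many users who never appear in the training set.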

In [89]:
# Write the validation prediction frame to CSV, leaving out the index column.
predictions_path = 'data/yhat_validation_df_ALS.csv'
yhat_validatin_df.to_csv(predictions_path, index=False)