In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyspark.ml.recommendation import ALS

from pyspark.sql.functions import lit
from pyspark.sql.functions import isnan, when, count, col

from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise create one with
# default settings. Every Spark DataFrame below is created through it.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Read the training ratings and discard the 'timestamp' column in one chained
# expression, so `training` is bound once to its final shape.
training = pd.read_csv('data/training.csv').drop(columns='timestamp')

In [3]:
# Convert the pandas training frame to a Spark DataFrame; this is the input
# the ALS estimator is fitted on below.
training_df = spark.createDataFrame(training)

In [4]:
# Load the requests set: the rows the fitted model is asked to score.
requests = pd.read_csv('data/requests.csv')

In [5]:
# Convert the pandas requests frame to a Spark DataFrame so the fitted model
# can transform it.
requests_df = spark.createDataFrame(requests)

In [6]:
# Declare an ALS model.
#  - seed is pinned so the random factor initialisation, and therefore the
#    predictions produced below, are repeatable from run to run.
#  - coldStartStrategy='nan' is the library default, spelled out here because
#    the NaN handling later in the notebook depends on it: requests the model
#    cannot score come back as NaN rather than being dropped.
als_model = ALS(
    itemCol='movie',
    userCol='user',
    ratingCol='rating',
    nonnegative=True,
    regParam=0.1,
    rank=10,
    coldStartStrategy='nan',
    seed=42)

In [7]:
# Fit the ALS estimator on the training ratings; `recommender` is the fitted
# model used to score the requests.
recommender = als_model.fit(training_df)

In [8]:
# Score the requests; the result carries an added 'prediction' column.
predictions = recommender.transform(requests_df)

In [9]:
# Collect the Spark predictions into a pandas DataFrame for inspection.
predictions_pandas = predictions.toPandas()

In [11]:
# Summary statistics. A 'prediction' count lower than the user/movie count
# means some requests received no prediction (NaN).
predictions_pandas.describe()

Unnamed: 0,user,movie,prediction
count,200209.0,200209.0,104439.0
mean,1511.751225,1930.586682,3.364062
std,1582.930564,1129.67035,0.663919
min,1.0,1.0,0.549139
25%,331.0,1046.0,2.949743
50%,752.0,1946.0,3.423734
75%,2131.0,2890.0,3.835766
max,6040.0,3952.0,5.541839


In [12]:
# Number of requests for which the model produced no prediction (NaN).
count_nans_prediction = predictions_pandas['prediction'].isna().sum()

In [13]:
# NOTE: despite the name, this is a fraction of all rows (0-1), not a
# percentage; it is multiplied by 100 where it is printed.
percent_nans = count_nans_prediction / predictions_pandas.shape[0]

In [14]:
# Report the NaN share as a percentage: label on one line, value on the next.
print("The percent of NaN's for the predictions dataset is: ", percent_nans*100, sep="\n")

The percent of NaN's for the predictions dataset is: 
47.83501241202943


### As a first guess, we fill the NaN predictions with the mean rating of the training set.

In [17]:
# Baseline for requests the model could not score: the global mean rating of
# the training set.
global_mean_rating = training['rating'].mean()

# Fill only the 'prediction' column. A frame-wide fillna would also write the
# mean rating into a missing 'user' or 'movie' id and silently hide bad input.
predictions_pandas = predictions_pandas.fillna({'prediction': global_mean_rating})

In [18]:
# Re-check the summary: after filling, the 'prediction' count should match
# the user/movie count.
predictions_pandas.describe()

Unnamed: 0,user,movie,prediction
count,200209.0,200209.0,200209.0
mean,1511.751225,1930.586682,3.47009
std,1582.930564,1129.67035,0.492745
min,1.0,1.0,0.549139
25%,331.0,1046.0,3.390189
50%,752.0,1946.0,3.590479
75%,2131.0,2890.0,3.590479
max,6040.0,3952.0,5.541839


#### To fulfill the submission requirement, rename the 'prediction' column to 'rating'

In [21]:
# Rename 'prediction' to 'rating' by rebinding the name to the renamed frame.
column_renames = {'prediction': 'rating'}
predictions_pandas = predictions_pandas.rename(columns=column_renames)

In [22]:
# Inspect the frame after the rename (columns: user, movie, rating).
predictions_pandas

Unnamed: 0,user,movie,rating
0,53,148,3.590479
1,4169,148,3.118960
2,5333,148,2.497073
3,4387,148,2.336798
4,840,148,2.620513
...,...,...,...
200204,3371,3910,3.788533
200205,1851,3910,3.258385
200206,5198,3910,3.634098
200207,1584,3910,3.147473


#### Save this as a CSV in the submissions folder as "als_mean.csv"

In [24]:
# Write the submission file. to_csv does not create missing directories, so
# make sure the output folder exists first (a no-op when it already does);
# this keeps Restart & Run All working on a fresh checkout.
os.makedirs('submissions', exist_ok=True)
predictions_pandas.to_csv('submissions/als_mean.csv', index=False)

In [25]:
# Display the frame that was just written to submissions/als_mean.csv.
predictions_pandas

Unnamed: 0,user,movie,rating
0,53,148,3.590479
1,4169,148,3.118960
2,5333,148,2.497073
3,4387,148,2.336798
4,840,148,2.620513
...,...,...,...
200204,3371,3910,3.788533
200205,1851,3910,3.258385
200206,5198,3910,3.634098
200207,1584,3910,3.147473


In [30]:
# Peek at one training row to confirm the columns (user, movie, rating).
training.head(1)

Unnamed: 0,user,movie,rating
0,6040,858,4


In [26]:
# Collect the raw model output again: predictions_pandas has since been
# NaN-filled and renamed, so a fresh, unmodified copy is taken here.
# NOTE(review): this likely re-runs the Spark transform; confirm whether
# caching `predictions` or copying the first collect would be preferable.
preds = predictions.toPandas()

In [29]:
# Distinct user ids and movie ids present in the scored requests.
users, movies = (preds[column].unique() for column in ('user', 'movie'))

In [31]:
# Requested users that never appear in the training set.
# A single vectorised membership test replaces one full scan of `training`
# per user; the order of `users` is preserved in the result.
is_known_user = np.isin(users, training['user'].unique())
unseen_users = list(users[~is_known_user])

In [33]:
# How many requested users have no rows in the training set.
len(unseen_users)

641

In [37]:
# Spot-check one arbitrary entry of the unseen-user list.
unseen_users[25]

402

In [34]:
# Requested movies that never appear in the training set.
# A single vectorised membership test replaces one full scan of `training`
# per movie; the order of `movies` is preserved in the result.
is_known_movie = np.isin(movies, training['movie'].unique())
unseen_movies = list(movies[~is_known_movie])

In [35]:
# How many requested movies have no rows in the training set.
len(unseen_movies)

44