In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import isnan, when, count, col

# Reuse the active SparkSession if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

In [2]:
# Read the training ratings and discard the 'timestamp' column,
# which the ALS model below does not use (it reads user, movie and rating).
training = pd.read_csv('data/training.csv').drop(columns='timestamp')

In [3]:
# Convert the pandas training frame to a Spark DataFrame so it can be passed to ALS.fit
training_df = spark.createDataFrame(training)

In [4]:
# Load the requests set: the rows the fitted model is asked to score
requests = pd.read_csv('data/requests.csv')

In [5]:
# Convert the pandas requests frame to a Spark DataFrame so the fitted model can transform it
requests_df = spark.createDataFrame(requests)

In [6]:
# Declare an ALS model.
# - seed is pinned so the randomly initialised factors (and therefore the
#   predictions) are repeatable across kernel restarts.
# - coldStartStrategy='nan' is Spark's default; it is stated explicitly because
#   later cells rely on it: requests that cannot be scored come back as NaN
#   and are filled in afterwards instead of being dropped.
als_model = ALS(
    userCol='user',
    itemCol='movie',
    ratingCol='rating',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    coldStartStrategy='nan',
    seed=42,
)

In [7]:
# Fit the ALS estimator on the training ratings; `recommender` is the fitted model
recommender = als_model.fit(training_df)

In [8]:
# Score every row of the requests set; this adds a 'prediction' column
# (the outputs below show that some predictions come back as NaN)
predictions = recommender.transform(requests_df)

In [9]:
# Collect the Spark predictions into a pandas DataFrame for inspection
predictions_pandas = predictions.toPandas()

In [10]:
# Summary statistics; a 'prediction' count below the 'user'/'movie' counts means NaN predictions
predictions_pandas.describe()

Unnamed: 0,user,movie,prediction
count,200209.0,200209.0,104439.0
mean,1511.751225,1930.586682,3.364899
std,1582.930564,1129.67035,0.665702
min,1.0,1.0,0.514699
25%,331.0,1046.0,2.95182
50%,752.0,1946.0,3.425135
75%,2131.0,2890.0,3.837017
max,6040.0,3952.0,5.420891


In [12]:
# Peek at the first 8 rows to see which requests received no prediction
predictions_pandas.head(8)

Unnamed: 0,user,movie,prediction
0,53,148,
1,4169,148,3.038984
2,5333,148,2.486224
3,4387,148,2.134042
4,840,148,2.700452
5,216,148,
6,482,148,
7,752,148,2.715211


In [13]:
# Boolean mask of missing predictions; summing it counts the NaN rows.
missing_prediction_mask = predictions_pandas['prediction'].isnull()
count_nans_prediction = missing_prediction_mask.sum()

In [14]:
# Fraction (0-1) of requested rows whose prediction is missing.
total_requests = len(predictions_pandas)
percent_nans = count_nans_prediction / total_requests

In [15]:
# Report the missing-prediction share as a percentage, label and value on separate lines.
print(
    "The percent of NaN's for the predictions dataset is: ",
    percent_nans*100,
    sep="\n",
)

The percent of NaN's for the predictions dataset is: 
47.83501241202943


### As a first guess, we fill the NaN predictions with the mean rating of the training data set.

In [None]:
# Fall back to the global mean training rating wherever ALS produced no prediction.
# Only the 'prediction' column is filled, so a missing value in 'user' or
# 'movie' can never be silently replaced by a rating.
mean_training_rating = training['rating'].mean()
predictions_pandas = predictions_pandas.fillna({'prediction': mean_training_rating})

In [None]:
# Re-check the summary statistics after filling the missing predictions
predictions_pandas.describe()

#### To fulfill the submission requirement, rename the 'prediction' column to 'rating'

In [None]:
# The required output format expects the predicted value in a column named 'rating'.
predictions_pandas = predictions_pandas.rename(columns={'prediction': 'rating'})

In [None]:
# Display the frame to confirm the column is now called 'rating'
predictions_pandas

#### Save the result as a CSV in the submissions folder as "als_mean.csv"

In [None]:
# Write the submission file. The target directory is created first because
# to_csv does not create missing parent directories and would otherwise fail
# (for example on a fresh checkout where 'submissions/' does not exist yet).
submission_path = Path('submissions') / 'als_mean.csv'
submission_path.parent.mkdir(parents=True, exist_ok=True)
predictions_pandas.to_csv(submission_path, index=False)

In [None]:
# Display the frame that was just written to disk
predictions_pandas

In [None]:
# Show one training row as a reminder of the columns left after 'timestamp' was dropped
training.head(1)

In [None]:
# Collect the Spark predictions again into a separate pandas frame
# (predictions_pandas was NaN-filled and renamed above, so this is the unmodified model output)
preds = predictions.toPandas()

In [None]:
# Distinct user ids and movie ids that appear in the scored requests.
users, movies = preds['user'].unique(), preds['movie'].unique()

In [None]:
# Requested users that never appear in the training ratings.
# One vectorised membership test replaces a full scan of `training` per user;
# the result is the same list, in the same order as `users`.
user_in_training = np.isin(users, training['user'].to_numpy())
unseen_users = list(users[~user_in_training])

In [None]:
# Number of requested users with no rows in the training set
len(unseen_users)

In [None]:
# Spot-check a single unseen user id.
# NOTE(review): index 25 is hard-coded; this raises IndexError if fewer than 26 unseen users were found.
unseen_users[25]

In [None]:
# Requested movies that never appear in the training ratings.
# One vectorised membership test replaces a full scan of `training` per movie;
# the result is the same list, in the same order as `movies`.
movie_in_training = np.isin(movies, training['movie'].to_numpy())
unseen_movies = list(movies[~movie_in_training])

In [None]:
# Number of requested movies with no rows in the training set
len(unseen_movies)

In [None]:
# Load data/regression.csv; how it is used is not visible in this part of the notebook
regression = pd.read_csv('data/regression.csv')

In [None]:
# Display the loaded regression frame
regression