# Isabella Zhai - Collaborative Filtering Model

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import pyspark.sql.functions
from pyspark.sql.functions import rand

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [0]:
# Build (or reuse) the SparkSession used to read from / write to MongoDB.
# NOTE(review): the connection URI embeds a username and password in plain
# text; consider loading it from an environment variable or a secret store.
mongo_uri = 'mongodb+srv://msds697_project:msds697@cluster1.ippkl.mongodb.net/msds697.music'

# Maven coordinates have to be supplied as a single comma-separated value of
# `spark.jars.packages`. Passing a coordinate on its own to `.config(...)`
# registers it as an option *name* instead of as a package to load.
spark_packages = ",".join([
    "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    "org.apache.hadoop:hadoop-aws:3.3.1",
])

spark = (
    SparkSession.builder
    .appName("mongo")
    .config('spark.sql.parquet.binaryAsString', 'true')
    .config("spark.jars.packages", spark_packages)
    .config("spark.mongodb.input.uri", mongo_uri)
    .config("spark.mongodb.output.uri", mongo_uri)
    .config("spark.network.timeout", "3600s")
    .getOrCreate()
)

In [0]:
# Load the collection addressed by the MongoDB URI into a Spark DataFrame.
mongo_reader = spark.read.format("mongo")
mongo_reader = mongo_reader.option(
    'uri',
    'mongodb+srv://msds697_project:msds697@cluster1.ippkl.mongodb.net/msds697.music',
)
df = mongo_reader.load()

In [0]:
# Print the leading rows of the loaded DataFrame as a quick sanity check.
df.show()

In [0]:
# Number of distinct genre IDs present in the data.
df.agg(count_distinct('genre_id')).show()

In [0]:
# Number of distinct genre levels present in the data.
df.agg(count_distinct('genre_level')).show()

# Collaborative Filtering Model

In [0]:
# Keep only the three columns the recommender needs, each cast to integer;
# the rating column is exposed under the name `label`.
int_type = IntegerType()
df_new = df.select(
    df["songid"].cast(int_type),
    df["userid"].cast(int_type),
    df["rating"].cast(int_type).alias('label'),
)


In [0]:
# Confirm the column names and types produced by the casts above.
df_new.printSchema()

In [0]:
# Show how many rows fall under each rating value (`label`).
(
    df_new
    .groupBy('label')
    .count()
    .show()
)

In [0]:
# Split the ratings roughly 80/20 into training and test sets. A fixed seed
# keeps the split repeatable so metrics from different runs are comparable.
train, test = df_new.randomSplit([0.8, 0.2], seed=42)
# Cache both splits so later actions on them can reuse the materialized data.
train = train.cache()
test = test.cache()

In [0]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
#           coldStartStrategy="drop")
# model = als.fit(training)


# Create ALS model

In [0]:
# Configure an explicit-feedback ALS estimator over (userid, songid, label).
# coldStartStrategy="drop" is used so evaluation metrics do not come out as NaN.
als = ALS(
    userCol="userid",
    itemCol="songid",
    ratingCol="label",
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
)

In [0]:
# Fit the ALS estimator on the training split.
model_als = als.fit(train)

# Evaluate The Model

In [0]:
# Apply the fitted model to the held-out test split to obtain predictions.
predictions = model_als.transform(test)

In [0]:
# Score the test-set predictions against the true labels using RMSE.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse!s}")

In [0]:
# Score the same predictions using mean squared error.
evaluator2 = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="mse",
)
mse = evaluator2.evaluate(predictions)
print(f"Mean-square error = {mse!s}")

In [0]:
# Generate the top 5 song recommendations for each user
userRecs = model_als.recommendForAllUsers(5)
# Generate the top 5 user recommendations for each song
# (the items here are songs, per itemCol="songid", despite the `movieRecs` name)
movieRecs = model_als.recommendForAllItems(5)

In [0]:
# Display the first 10 rows of the per-user recommendations.
userRecs.show(10)