In [28]:
import pyspark

# Reuse the SparkContext that is already running if this cell is executed again.
# Calling the SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once", which makes the cell non-idempotent.
# Note: when a context already exists, getOrCreate returns it and the conf is ignored.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('rec'))

In [29]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Load the raw ratings file as an RDD of text lines.
# Each line is "user,product,rating", for example:
#   1,1,5.0
#   1,2,1.0
#   1,3,5.0
data = sc.textFile("./mllib_data/test.data")

# Parse every CSV line into an MLlib Rating(user, product, rating).
# Blank lines are dropped first: ''.split(',') yields [''] and int('') would
# otherwise abort the whole job with a ValueError.
ratings = (
    data.filter(lambda line: line.strip())
        .map(lambda line: line.split(','))
        .map(lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
)

# ratings.take(5)  # uncomment to inspect the first parsed records

# Train a latent-factor model with Alternating Least Squares (ALS).
rank = 10           # number of latent factors per user / product
numIterations = 10  # number of ALS iterations (alternating least-squares solves, not
                    # gradient-descent steps); more iterations usually fit better but run slower
seed = 42           # fixed seed so the random factor initialisation is reproducible

# rank and numIterations are hyper-parameters: there is no universally correct
# value, so they have to be chosen empirically.

model = ALS.train(ratings, rank, numIterations, seed=seed)

# Implicit-feedback variant of ALS: use it when the "ratings" are implicit signals
# (clicks, views, purchases) rather than explicit scores. alpha sets the baseline
# confidence placed in observed preferences; it is NOT the regularization strength
# (regularization is controlled by the separate lambda_ argument).
# model = ALS.trainImplicit(ratings, rank, numIterations,alpha=0.01)

# Score the model against the same ratings it was trained on, so the figure
# printed below is a training error rather than a held-out estimate.

# predictAll takes (user, product) pairs, so strip the rating off each record.
testdata = ratings.map(lambda rating: (rating.user, rating.product))

# Key every prediction by its (user, product) pair so it can be joined below.
predictions = (
    model.predictAll(testdata)
         .map(lambda pred: ((pred.user, pred.product), pred.rating))
)

# Join actual and predicted ratings on the pair key:
# each element is ((user, product), (actual, predicted)).
ratesAndPreds = (
    ratings.map(lambda rating: ((rating.user, rating.product), rating.rating))
           .join(predictions)
)

# Mean of the squared differences between actual and predicted ratings.
MSE = (
    ratesAndPreds.values()
                 .map(lambda actual_and_pred: (actual_and_pred[0] - actual_and_pred[1]) ** 2)
                 .mean()
)
print("Mean Squared Error = " + str(MSE))

# Save and load model
# Write the model to a dedicated directory and read it back from the SAME path.
# The original saved to "./" but loaded from a different directory, so the load
# never saw the model that had just been saved.
# NOTE: Spark is expected to refuse to write into an existing directory, so
# remove MODEL_PATH before re-running this cell.
MODEL_PATH = "target/tmp/myCollaborativeFilter"
model.save(sc, MODEL_PATH)
sameModel = MatrixFactorizationModel.load(sc, MODEL_PATH)

Mean Squared Error = 6.977352756149555e-06


In [27]:
# Shut down the SparkContext held in `sc`; run this last, since every RDD and
# model operation in this notebook needs a live context.
sc.stop()