In [1]:
import pyspark
import math
from pyspark import SparkContext, SparkConf
from pyspark.mllib.recommendation import ALS, Rating, MatrixFactorizationModel


In [2]:
# HDFS locations used by this notebook: the input CSV and the directory Spark
# should write RDD checkpoints to.
# NOTE(review): `sc` is not created in this notebook; presumably the PySpark
# kernel injects the SparkContext — confirm.
CHECKPOINT_DIR = 'hdfs://hadoop-master:9000/users/spark/checkpoint'
text_file = "hdfs://hadoop-master:9000/users/spark/out/fmModelArtBalanced.csv"
sc.setCheckpointDir(CHECKPOINT_DIR)

In [3]:
# Read the input CSV as an RDD of strings, one element per line.
RDD = sc.textFile(text_file)

In [4]:
# Preview the first five raw lines.
RDD.take(5)

['2409499,232513467,203477,george w. bush,428',
 '16879990,660133657,139067,the duckworth lewis method,21',
 '15790027,565863613,93392,el presidente,55',
 '4562044,428504185,126798,david demaria,4',
 '17558632,666373732,149036,rites of undeath,21']

In [5]:
def _split_record(line):
    """Split one raw CSV line into its fields.

    Rows are expected to have five comma-separated fields with the free-text
    artist name in position 3 and the count in position 4. A name that itself
    contains commas would otherwise spill into extra fields and shift the
    count out of position 4, so any surplus middle fields are re-joined into
    the name. Rows with five or fewer fields are returned exactly as a plain
    ``split(",")`` would return them.
    """
    parts = line.split(",")
    if len(parts) <= 5:
        return parts
    # First three ids, the re-assembled name, then the trailing count.
    return parts[:3] + [",".join(parts[3:-1]), parts[-1]]


# Parse every line into a list of string fields.
_RDD = RDD.map(_split_record)

In [6]:
# Preview the first five parsed rows (each a list of string fields).
_RDD.take(5)

[['2409499', '232513467', '203477', 'george w. bush', '428'],
 ['16879990', '660133657', '139067', 'the duckworth lewis method', '21'],
 ['15790027', '565863613', '93392', 'el presidente', '55'],
 ['4562044', '428504185', '126798', 'david demaria', '4'],
 ['17558632', '666373732', '149036', 'rites of undeath', '21']]

In [7]:
# Normalise the ratings column by dividing by its maximum value, and keep only
# the three fields the recommender needs: (column 1, column 2, scaled column 4).
MAX_RATING_VALUE = 699  # hard-coded maximum of the ratings column


def _to_rating_triple(fields):
    """Return (fields[1], fields[2], int(fields[4]) / MAX_RATING_VALUE)."""
    scaled = int(fields[4]) / MAX_RATING_VALUE
    return (fields[1], fields[2], scaled)


modelRDD = _RDD.map(_to_rating_triple)

In [8]:
# Preview the first five normalised triples.
modelRDD.take(5)

[('232513467', '203477', 0.6123032904148784),
 ('660133657', '139067', 0.030042918454935622),
 ('565863613', '93392', 0.07868383404864092),
 ('428504185', '126798', 0.005722460658082976),
 ('666373732', '149036', 0.030042918454935622)]

In [9]:
def _as_triple(row):
    """Re-emit the first three elements of a row as a tuple, unchanged."""
    first, second, third = row[0], row[1], row[2]
    return (first, second, third)


# finalRDD carries the same three fields per row as modelRDD.
finalRDD = modelRDD.map(_as_triple)

In [10]:
# Preview the first five rows of the final rating set.
finalRDD.take(5)

[('232513467', '203477', 0.6123032904148784),
 ('660133657', '139067', 0.030042918454935622),
 ('565863613', '93392', 0.07868383404864092),
 ('428504185', '126798', 0.005722460658082976),
 ('666373732', '149036', 0.030042918454935622)]

In [11]:
# Summary statistics of the normalised rating (third element of each row).
finalRDD.map(lambda x: (x[2])).stats()

(count: 4673605, mean: 0.1969480100646009, stdev: 0.207594175697, max: 1.0, min: 0.0)

In [12]:
# With the data in the shape the algorithm needs (finalRDD), split it into
# training, validation and test sets with weights 6:2:2. The seed is fixed so
# the split is repeatable.
SPLIT_WEIGHTS = [6, 2, 2]
rdd_training, rdd_validation, rdd_test = finalRDD.randomSplit(SPLIT_WEIGHTS, seed=5)


def _drop_rating(row):
    """Keep only the first two elements of a row (the pair to predict for)."""
    return (row[0], row[1])


# Rating-free pairs for the validation and test sets, used as predictAll input.
predict_validation = rdd_validation.map(_drop_rating)
predict_test = rdd_test.map(_drop_rating)



In [13]:
# Mark the training and validation sets for caching; the hyperparameter search
# reads both of them on every run.
rdd_training.cache()
rdd_validation.cache()

PythonRDD[8] at RDD at PythonRDD.scala:48

In [15]:
import pandas as pd
import numpy as np

# Grid search over rank, regularisation and iteration count. Every combination
# is trained on rdd_training and scored by RMSE on rdd_validation; the best
# combination (and its fitted model) is kept in best_params.

# NOTE(review): this only sets an attribute on the ALS class; nothing in this
# notebook passes it to ALS.train — confirm it has any effect.
ALS.checkpointInterval = 2
seed = 5
ranks = [5, 10, 20, 40, 80]
lambdas_ = sorted([0.01, .10, 1., 10.])
iter_array = [2, 5, 10, 15, 20]

# Best configuration seen so far. 'test_mse' keeps its original key name but
# holds the lowest validation RMSE; 'min_error' mirrors it (previously it was
# never updated and always printed as inf).
best_params = {
    'n_factors': ranks[0],
    'lam': lambdas_[0],
    'n_iter': 0,
    'test_mse': float('inf'),
    'min_error': float('inf'),
    'model': None,
}

# Actual validation ratings keyed by the integer id pair. This does not depend
# on the hyperparameters, so it is defined once instead of once per run.
validation_ratings = rdd_validation.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

for rank in ranks:
    print('Rank: {:d}'.format(rank))

    for lam in lambdas_:
        # RMSE for each entry of iter_array at this (rank, lam).
        errors = np.zeros(len(iter_array))
        print('Regularization: {:f}'.format(lam))
        for err, _iter in enumerate(iter_array):
            print('Iterations: {:d}'.format(_iter))
            model = ALS.train(rdd_training, iterations=_iter, rank=rank, lambda_=lam, seed=seed)
            predictions = model.predictAll(predict_validation).map(lambda r: ((r[0], r[1]), r[2]))
            # (id pair) -> (actual rating, predicted rating)
            rates_and_predicts = validation_ratings.join(predictions)

            error = math.sqrt(rates_and_predicts.map(lambda rec: (rec[1][0] - rec[1][1])**2).mean())
            errors[err] = error
            if error < best_params['test_mse']:
                best_params['n_factors'] = rank
                best_params['lam'] = lam
                best_params['n_iter'] = _iter
                best_params['test_mse'] = error
                best_params['min_error'] = error
                best_params['model'] = model

            # Drop this run's references after every run, not only after a
            # new best, so only the best model stays referenced.
            model = None
            predictions = sc.emptyRDD()
            rates_and_predicts = sc.emptyRDD()
print('New optimal hyperparameters')
print(pd.Series(best_params))



Rank: 5
Regularization: 0.010000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 0.100000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 1.000000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 10.000000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Rank: 10
Regularization: 0.010000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 0.100000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 1.000000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 10.000000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Rank: 20
Regularization: 0.010000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterations: 20
Regularization: 0.100000
Iterations: 2
Iterations: 5
Iterations: 10
Iterations: 15
Iterati

In [None]:
# The model settles at an RMSE of about 0.12. The cells below show the same error with much smaller parameters (rank=5, iterations=5), i.e. it converges quickly.

In [14]:
# Now evaluate on the test dataset, using a small model (rank 5, 5 iterations).
model = ALS.train(rdd_training, rank=5, iterations=5, seed=5)

# Predicted rating for every test pair, keyed by the id pair.
predictions = model.predictAll(predict_test).map(lambda r: ((r[0], r[1]), r[2]))

# Actual ratings under the same integer key, joined to the predictions so each
# record is (id pair, (actual, predicted)).
actual_by_pair = rdd_test.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
rates_and_preds = actual_by_pair.join(predictions)

# Square root of the mean squared difference between actual and predicted.
squared_diffs = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2)
error = math.sqrt(squared_diffs.mean())

print('The error for the test data is {:.2f}'.format(error))

The error for the test data is 0.12


In [17]:
# Re-split the full rating set 7:3. This rebinds rdd_training and rdd_test
# from the earlier three-way split. Then fit a model on the training part.
TRAIN_TEST_WEIGHTS = [7, 3]
rdd_training, rdd_test = modelRDD.randomSplit(TRAIN_TEST_WEIGHTS, seed=5)

complete_model = ALS.train(
    rdd_training,
    rank=5,
    iterations=5,
    seed=5,
)

In [18]:
# Score complete_model on the 30% hold-out created by the 7:3 split.
def _pair_only(row):
    """Keep only the first two elements of a row (the pair to predict for)."""
    return (row[0], row[1])


predict_test = rdd_test.map(_pair_only)

# Predicted rating for every hold-out pair, keyed by the id pair.
predictions = complete_model.predictAll(predict_test).map(lambda r: ((r[0], r[1]), r[2]))

# Actual ratings under the same integer key, joined to the predictions so each
# record is (id pair, (actual, predicted)).
actual_by_pair = rdd_test.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
rates_and_preds = actual_by_pair.join(predictions)

# Square root of the mean squared difference between actual and predicted.
squared_diffs = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2)
error = math.sqrt(squared_diffs.mean())

print('For testing data the RMSE is {:.2f}'.format(error))

For testing data the RMSE is 0.12


In [20]:
# Train on the complete dataset (no hold-out) and report how long it took.
from time import time

t0 = time()
new_ratings_model = ALS.train(
    modelRDD,
    rank=5,
    iterations=5,
    seed=5,
)
tt = time() - t0  # elapsed wall-clock seconds

elapsed_message = "New model trained in {:.2f} seconds".format(round(tt,3))
print(elapsed_message)

New model trained in 81.20 seconds


In [21]:
# Two cached, key-sorted lookup tables built from the parsed rows:
# column 2 (artist id) -> column 3 (artist name), and the reverse.
def _id_to_name(fields):
    """Return (fields[2], fields[3]) for one parsed row."""
    return (fields[2], fields[3])


def _name_to_id(fields):
    """Return (fields[3], fields[2]) for one parsed row."""
    return (fields[3], fields[2])


artist_name_lookup_table = _RDD.map(_id_to_name).sortByKey().cache()
artist_id_lookup_table = _RDD.map(_name_to_id).sortByKey().cache()

In [46]:
# Ids stored under this artist name; set() collapses repeated matches.
set(artist_id_lookup_table.lookup('satisf*cktion'))

{'3018'}

In [None]:
# Names stored under this artist id; set() collapses repeated matches.
set(artist_name_lookup_table.lookup('23199'))
{'alain dzukam'}
{'gian - home studio'}

In [47]:
# Let's test the new model with a few hand-written ratings for one user.
# Idea for the final recommender: scale the recommendation numbers to a range
# such as 1 to 10.
# NOTE(review): the original note described raw values (1 = does not like,
# 200 = likes, 500 = loves), but the ratings below are fractions between
# .0001 and .20 — presumably already on the normalised scale; confirm the
# intended mapping.
JOHN_USER_ID = '56'

# (artist id, rating) pairs, in the original order.
_john_artist_ratings = [
    ('3018', .0001),
    ('153162', .0001),   # Not an actual artist
    ('288963', .0001),   # Alain Dzukam. Another movie
    ('14368', .0001),    # David Guetta. This is a movie
    ('204066', .1),      # Rihanna
    ('174344', .0001),   # Kaiser Chiefs
    ('95545', .0001),    # Iron Maiden
    ('58275', .0001),    # Dire Straits
    ('43548', .18),      # Regina Spektor
    ('68390', .19),      # Artist is 'edith piaf'
    ('281461', .20),     # Artist is 'maria callas'
    ('201482', .17),     # Artist is 'bryn terfel'
    ('243041', .17),     # Anna Netrebko
    ('244054', .17),     # The Residents
    ('256484', .18),     # The Legendary Pink Dots
    ('206805', .17),     # Bjork
    ('127801', .0001),   # Madonna
    ('96194', .16),      # The Killers
    ('216449', .0001),   # Taylor Swift
    ('113418', .0001),   # Miley Cyrus
    ('253951', .0001),   # recommended: Madonna feat. Justin Timberlake. Let's see if the model responds
    ('279301', .0001),   # recommended: t-pain
]

# Same (user, artist, rating) triples as before, with the user id factored out.
johnsRecommendations = [
    (JOHN_USER_ID, artist_id, rating)
    for artist_id, rating in _john_artist_ratings
]
                       


In [48]:
# Turn the hand-written ratings into an RDD and preview the first three.
new_user_ratings_RDD = sc.parallelize(johnsRecommendations)
new_user_ratings_RDD.take(3)

[('56', '3018', 0.0001), ('56', '153162', 0.0001), ('56', '288963', 0.0001)]

In [49]:
# Append the new user's ratings to the full normalised rating set.
newRDD = modelRDD.union(new_user_ratings_RDD)

In [50]:
# Preview the first five rows of the combined rating set.
newRDD.take(5)

[('232513467', '203477', 0.6123032904148784),
 ('660133657', '139067', 0.030042918454935622),
 ('565863613', '93392', 0.07868383404864092),
 ('428504185', '126798', 0.005722460658082976),
 ('666373732', '149036', 0.030042918454935622)]

In [51]:
# Retrain (rank 5, 5 iterations, fixed seed) on the combined ratings, which
# now include the new user's rows.
modelFM = ALS.train(newRDD, rank=5, iterations=5, seed=5)

In [52]:
# Artist id (second element) of every rating row. There is one entry per
# rating, so ids repeat.
# NOTE(review): `artists` is not referenced by the later cells visible here.
artists = newRDD.map(lambda x: (x[1]))

In [53]:
# Preview the first five artist ids.
artists.take(5)

['203477', '139067', '93392', '126798', '149036']

In [54]:
# Artist ids the new user has already rated, collected to the driver as a set
# for membership tests inside the filter below.
_rated_artists = set(new_user_ratings_RDD.map(lambda x: (x[1])).collect())

# Candidate ('56', artist id) pairs to score for the new user: every artist
# the user has not rated. Artist ids are de-duplicated first; newRDD holds one
# row per rating, so without distinct() the same pair would be sent to
# predictAll once for every rating that artist has received.
unrated_artists = (
    newRDD.map(lambda x: x[1])
    .distinct()
    .filter(lambda artist_id: artist_id not in _rated_artists)
    .map(lambda artist_id: ('56', artist_id))
)

In [55]:
# Predict a rating for every candidate (user, artist) pair.
new_user_recommendations_RDD = modelFM.predictAll(unrated_artists)

In [56]:
# Highest predicted rating (third element) among the candidates.
new_user_recommendations_RDD.map(lambda x: (x[2])).max()

0.5211543805135976

In [57]:
# The 20 distinct predictions with the highest predicted rating. The key
# negates the rating so that takeOrdered returns them highest-first.
top_items = new_user_recommendations_RDD.distinct().takeOrdered(20, key=lambda x: -x[2])


In [58]:
# Print the name(s) on record for each recommended artist, in the order of
# top_items. The heading reports the actual number of items rather than a
# hard-coded "ten" (top_items is built with takeOrdered(20, ...)).
print('Your top {:d} recommended artists:'.format(len(top_items)))
for x in top_items:
    # The lookup table is keyed by string ids, so convert the predicted id.
    artist = str(x[1])
    print(set(artist_name_lookup_table.lookup(artist)))

Your top ten recommended artists:
{'bippp (v.a.)'}
{'heinz holliger - zehetmair - larcher - holliger - etc.'}
{'dj philip'}
{'active coma'}
{'alberto cortéz y facundo cabral'}
{'jens rachut'}
{'marco antonios solis'}
{'bellhouse'}
{'mike shiflet'}
{'digital mystikz & loefah'}
{'andy vores'}
{'andrew davis'}
{'electric eel-shock'}
{'alexander schatten'}
{'lech jankowski'}
{'manu chao !'}
{'c.a.r.n.e.'}
{'컬투'}
{'die dødelsäcke'}
{'amor-te'}


In [45]:


# `os` is not imported anywhere else in this notebook; import it here so the
# cell runs on a fresh kernel instead of raising NameError.
import os

# Destination for the trained model.
# NOTE(review): this looks like a local filesystem path, while the input data
# lives on HDFS — confirm where Spark resolves it on the cluster.
model_path = os.path.join('/home/spark', 'model', 'lastFMExplicit')

# Save and load model
modelFM.save(sc, model_path)
#same_model = MatrixFactorizationModel.load(sc, model_path)