In [1]:
# import the usual
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import math
import re
import itertools
from scipy.sparse import csr_matrix

%matplotlib inline
pd.set_option('display.max_columns', 500)

In [2]:
import findspark
findspark.init()
#from pyspark.ml.recommendation import ALS
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [3]:
sc = SparkContext()

In [4]:
# sp = SparkSession.builder.appName("s").getOrCreate()

In [83]:
slot1 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot1.tsv")
slot2 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot2.tsv")
slot3 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot3.tsv")
slot4 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot4.tsv")

In [84]:
type(slot1)

pyspark.rdd.RDD

In [85]:
slots = [slot1, slot2, slot3, slot4]

In [86]:
def isNumber(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    return False

def computeRMSE(model,data):
    
    """ Takes ALS models and testing data as input and returns RMSE value """
    
    data_for_predict = data.map(lambda x: (x[0], x[1]))
    
    predictions = model.predictAll(data_for_predict).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
    return error

In [91]:
%%time
modelnameid = 1
for data in slots:
    path = str('F:\Data_Repository\lastfm')
    modelname = path + "\slot" + str(modelnameid) + ".tsv"
    data = data.map(lambda x: x.split('\t'))
    data2 = data.map(lambda x : [x[i] for i in [0,1,2]]) #only 3 columns exist
    data2 = data2.filter(lambda x: isNumber(x[2])) # Remove faulty rows
    data2 = data2.map(lambda x: [x[0], x[1], float(x[2])]) #Change plays into float
    users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
    artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
    data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
    data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))
    plays = data2.map(lambda x: x[2])
    data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
    validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
    test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))
    
    seed = 5
    #iterations = 10
    iterations = 1
    regularization_parameter = 0.1
    ranks = [5,10,15]
    #ranks = [15] #to reduce loop
    errors = [0, 0, 0]
    err = 0
    tolerance = 0.02
    alpha = 0.01

    min_error = float('inf')
    best_rank = -1
    best_iteration = -1

    for rank in ranks:
        e = 0
        #for i in range(5):
        for i in range(1):
            # Split the data
            training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
            validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
            test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

            model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                              lambda_=regularization_parameter,alpha=alpha)
        #     predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
        #     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        #     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            e += computeRMSE(model,validation_RDD)

        error = e/5.5
        errors[err] = error
        err += 1
        print ('For rank %s the RMSE is %s' % (rank, error))
        if error < min_error:
            min_error = error
            best_rank = rank

    print ('The best model was trained with rank %s' % best_rank)

    ####to save model
    print("Saving model to the given path:")
    model.save(sc, modelname)
    modelnameid = modelnameid + 1

For rank 5 the RMSE is 1.1681614305385888
For rank 10 the RMSE is 1.1678819944463574
For rank 15 the RMSE is 1.1671706920636706
The best model was trained with rank 15
Saving model to the given path:
For rank 5 the RMSE is 1.0204170683254816
For rank 10 the RMSE is 1.020022155681369
For rank 15 the RMSE is 1.0196947585522675
The best model was trained with rank 15
Saving model to the given path:
For rank 5 the RMSE is 1.2072978578503877
For rank 10 the RMSE is 1.2066129841598738
For rank 15 the RMSE is 1.2064489655607
The best model was trained with rank 15
Saving model to the given path:
For rank 5 the RMSE is 1.1906307046531146
For rank 10 the RMSE is 1.189835280967131
For rank 15 the RMSE is 1.1895705703178825
The best model was trained with rank 15
Saving model to the given path:
Wall time: 13min 17s


In [82]:
# modelnameid = 1
# for slot in range(0,4):
#     path = str('F:\Data_Repository\lastfm')
#     modelname = path + "\slot" + str(modelnameid) + ".tsv"
#     print(modelname)
#     modelnameid = modelnameid + 1

F:\Data_Repository\lastfm\slot1.tsv
F:\Data_Repository\lastfm\slot2.tsv
F:\Data_Repository\lastfm\slot3.tsv
F:\Data_Repository\lastfm\slot4.tsv


In [6]:
data = data.map(lambda x: x.split('\t'))
# header = data.first()
# print(header)

In [12]:
# data2 = data.map(lambda x : [x[i] for i in [0,1,3]])
data2 = data.map(lambda x : [x[i] for i in [0,1,2]]) #only 3 columns exist

In [13]:
#print ("length of uncleaned data -",data2.count())

In [14]:
data.first()

['user_000001', '15 Step', '2']

In [15]:
data2.first()

['user_000001', '15 Step', '2']

In [16]:
def isNumber(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    return False

In [17]:
data2 = data2.filter(lambda x: isNumber(x[2])) # Remove faulty rows
data2 = data2.map(lambda x: [x[0], x[1], float(x[2])]) #Change plays into float

In [18]:
data2.first()

['user_000001', '15 Step', 2.0]

#### no need to reduce

In [20]:
%%time
#Filter out values with more than 500 plays (for this sake of simplicity)
#print ("length of cleaned data -",data2.count())
#data2 = data2.filter(lambda x : x[2] <= 50)
# print (data2.take(2))
print ("length of slot data -",data2.count())

length of slot data - 247852
Wall time: 1.17 s


In [21]:
%%time
#Convert strings into integers
users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
# int_user = users.map(lambda u: (u[1], u[0]))
# int_artist = artists.map(lambda i: (i[1], i[0]))
# users.collect()
# artists.collect()

Wall time: 4.76 s


In [22]:
%%time
# Substitutes the ObjectIDs in the ratings RDD with the corresponding int values
data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))

Wall time: 172 ms


In [23]:
%%time
# data2.filter(lambda x: x[0] == 12).collect()
# plays = data2.map(lambda x: x[2]).collect() ##seems like data2 loses data after .collect is called
plays = data2.map(lambda x: x[2])
# data2.collect()

Wall time: 0 ns


In [24]:
%%time
data2.first()

Wall time: 8.55 s


(1, 931, 11.0)

In [25]:
%%time
# Use 'Rating' function to get the values in the right format
data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
#data2.count()

Wall time: 0 ns


#### Dataframe converted to rating

In [27]:
data2.first()

Rating(user=1, product=931, rating=11.0)

In [28]:
%%time
# Use randomsplit to split the data into train, validation and testing sets

training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

Wall time: 0 ns


In [29]:
%%time
# Define computeRMSE

def computeRMSE(model,data):
    
    """ Takes ALS models and testing data as input and returns RMSE value """
    
    data_for_predict = data.map(lambda x: (x[0], x[1]))
    
    predictions = model.predictAll(data_for_predict).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
    return error

Wall time: 0 ns


In [45]:
%%time
seed = 5
#iterations = 10
iterations = 1
regularization_parameter = 0.1
ranks = [5,10,15]
#ranks = [15] #to reduce loop
errors = [0, 0, 0]
err = 0
tolerance = 0.02
alpha = 0.01

min_error = float('inf')
best_rank = -1
best_iteration = -1

for rank in ranks:
    e = 0
    #for i in range(5):
    for i in range(1):
        # Split the data
        training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
        validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
        test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))
        
        model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                          lambda_=regularization_parameter,alpha=alpha)
    #     predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    #     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    #     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        e += computeRMSE(model,validation_RDD)
    
    error = e/5
    errors[err] = error
    err += 1
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)
# For rank 5 the RMSE is 6.67535527624558
# For rank 10 the RMSE is 6.461547082828238
# For rank 15 the RMSE is 6.531429770101694
# The best model was trained with rank 10
# Wall time: 8min 21s

# For rank 5 the RMSE is 1.2219171452924378
# For rank 10 the RMSE is 1.5340979416430192
# For rank 15 the RMSE is 1.2701273441785674
# The best model was trained with rank 5
# Wall time: 1min 33s


For rank 5 the RMSE is 1.2849775735924476
For rank 10 the RMSE is 1.284670193890993
For rank 15 the RMSE is 1.2838877612700377
The best model was trained with rank 15
Wall time: 2min 50s


In [50]:
# %%time
# # Final Model

# model = ALS.train(training_RDD, best_rank, seed=seed, iterations= iterations,
#                       lambda_=regularization_parameter)
# predictions = model.predictAll(validation_for_predict_RDD)

#### Model validation

In [53]:
%%time
# Final Model

model = ALS.train(data2, best_rank, seed=seed, iterations= iterations,
                      lambda_ = regularization_parameter)
#predictions = model.predictAll(validation_for_predict_RDD)
predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))


Wall time: 14.9 s


In [54]:
%%time
#     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
#     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
computeRMSE(model,validation_RDD)

Wall time: 47.6 s


4.093980964206101

In [55]:
rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

In [56]:
rates_and_preds.first()

((819, 41), (1.0, 1.3774459478762293))

In [57]:
## validation rdd
validation_RDD.first()

Rating(user=23, product=931, rating=4.0)

In [58]:
## validation having just user item
validation_for_predict_RDD.first()

(23, 931)

In [None]:
# validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))


In [None]:
predictions = model.predictAll(validation_RDD).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

In [37]:
# %%time
# print (validation_RDD.count())
# print (predictions.count()   )
# #computeRMSE(model,data2)

49731
49731
Wall time: 35.9 s


In [44]:
%%time
#computeRMSE(model,data2)

Wall time: 0 ns


#### Recommend Products for top-n

In [93]:
def recommendProducts(self, user, num):
    """
    Recommends the top "num" number of products for a given user and
    returns a list of Rating objects sorted by the predicted rating in
    descending order. """
    pass

In [95]:
%time
n = 2
recos = model.recommendProducts(0, n)

Wall time: 0 ns


In [98]:
recos

[Rating(user=0, product=2679, rating=0.04412376419356484),
 Rating(user=0, product=841, rating=0.04395402252539503)]

In [96]:
%%time
scores = []
for i in recos:
     scores.append(float(i[2]))

Wall time: 0 ns


In [97]:
scores

[0.04412376419356484, 0.04395402252539503]

In [100]:
## what doe this do

In [109]:
products_for_users = model.recommendProductsForUsers(1).collect()
len(products_for_users)
#2

970

In [110]:
products_for_users

[(720, (Rating(user=720, product=56, rating=0.05288128790728301),)),
 (0, (Rating(user=0, product=2679, rating=0.04412376419356484),)),
 (288, (Rating(user=288, product=2073, rating=0.03620816616128979),)),
 (432, (Rating(user=432, product=2760, rating=0.045512647890368726),)),
 (576, (Rating(user=576, product=841, rating=0.053661067810359495),)),
 (864, (Rating(user=864, product=841, rating=0.03974478849547542),)),
 (144, (Rating(user=144, product=2679, rating=0.05562710316594872),)),
 (577, (Rating(user=577, product=2081, rating=0.032915308015108224),)),
 (721, (Rating(user=721, product=2679, rating=0.050831324226276425),)),
 (289, (Rating(user=289, product=2679, rating=0.0316088042651284),)),
 (865, (Rating(user=865, product=2073, rating=0.0486133897324968),)),
 (1, (Rating(user=1, product=2073, rating=0.04546503447434856),)),
 (433, (Rating(user=433, product=2831, rating=0.023200820557635095),)),
 (145, (Rating(user=145, product=2760, rating=0.050434731946664674),)),
 (578, (Rating

In [49]:
predictions.filter(lambda x : x[0] == 0).count()

131

In [42]:
%%time
predictions.filter(lambda x : x[0] == 0).collect()

Wall time: 4.69 s


[Rating(user=0, product=204, rating=0.1302139087676721),
 Rating(user=0, product=288, rating=0.08125482420744135),
 Rating(user=0, product=2508, rating=0.5641149002283767),
 Rating(user=0, product=2856, rating=0.35142433988278265),
 Rating(user=0, product=2820, rating=1.6709564288988954),
 Rating(user=0, product=2808, rating=1.830578520171482),
 Rating(user=0, product=768, rating=1.2665242119013902),
 Rating(user=0, product=2844, rating=0.861640002186876),
 Rating(user=0, product=2472, rating=0.40332568305465166),
 Rating(user=0, product=2652, rating=1.096171466058772),
 Rating(user=0, product=792, rating=-0.6297734465634264),
 Rating(user=0, product=2124, rating=1.1416842288348323),
 Rating(user=0, product=132, rating=0.6341896913551128),
 Rating(user=0, product=240, rating=2.258801205518232),
 Rating(user=0, product=2893, rating=-0.01438448255242264),
 Rating(user=0, product=2089, rating=0.5896484975427381),
 Rating(user=0, product=793, rating=-0.21629310312896077),
 Rating(user=0, p

In [29]:
# Save and load model
model.save(sc, r"F:\Data_Repository\lastfm\als_plays_v3_slot1.tsv")

In [None]:
sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")