In [1]:
# import the usual
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import math
import re
import itertools
from scipy.sparse import csr_matrix

%matplotlib inline
pd.set_option('display.max_columns', 500)

In [2]:
import findspark
findspark.init()
#from pyspark.ml.recommendation import ALS
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [3]:
sc = SparkContext()

In [4]:
# sp = SparkSession.builder.appName("s").getOrCreate()

In [5]:
slot1 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot1.tsv")
slot2 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot2.tsv")
slot3 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot3.tsv")
slot4 = sc.textFile(r"F:\Data_Repository\lastfm\df_slot4.tsv")

In [6]:
type(slot1)

pyspark.rdd.RDD

In [7]:
slots = [slot1, slot2, slot3, slot4]

In [8]:
def isNumber(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    return False

def computeRMSE(model,data):
    
    """ Takes ALS models and testing data as input and returns RMSE value """
    
    data_for_predict = data.map(lambda x: (x[0], x[1]))
    
    predictions = model.predictAll(data_for_predict).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
    return error

In [45]:
%%time
modelnameid = 1
for data in slots:
    print("")
    print (" ## Slot : ", modelnameid)
    path = str('F:\Data_Repository\lastfm')
    modelname = path + "\slot" + str(modelnameid) + ".tsv"
    data = data.map(lambda x: x.split('\t'))
    data2 = data.map(lambda x : [x[i] for i in [0,1,2]]) #only 3 columns exist
    data2 = data2.filter(lambda x: isNumber(x[2])) # Remove faulty rows
    data2 = data2.map(lambda x: [x[0], x[1], float(x[2])]) #Change plays into float
    users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
    artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
    data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
    data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))
    plays = data2.map(lambda x: x[2])
    data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
    validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
    test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))
    
    seed = 5
    #iterations = 10
    iterations = 1
    regularization_parameter = 0.1
    ranks = [5,10,15]
    #ranks = [15] #to reduce loop
    errors = [0, 0, 0]
    err = 0
    tolerance = 0.02
    alpha = 0.01

    min_error = float('inf')
    best_rank = -1
    best_iteration = -1

    for rank in ranks:
        e = 0
        #for i in range(5):
        for i in range(1):
            # Split the data
            training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed = 2)
            validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
            test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

            model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                              lambda_=regularization_parameter,alpha=alpha)
        #     predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
        #     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        #     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            e += computeRMSE(model,validation_RDD)
        
        
        ##to get pred
        #spark = SparkSession(sc) #to convert into df
        #valid_df = validation_RDD.toDF()
        #valid_df_pandas = valid_df.toPandas()
        #
        ##output = pd.DataFrame(columns = ['userid', 'songid','pred_score','actual_score'])
        #for index, row in valid_df_pandas[:40].iterrows():
        #    #if row['user'] == 43 or row['user'] == 439:
        #    #    continue
        #    pred_rating = model.predict( row['user'], row['product'] ) 
        #    output = output.append( pd.Series([
        #        int(row['user']), int(row['product']),
        #        pred_rating, float(row['rating'])
        #                        ], index= output.columns), ignore_index=True)  

        
        error = e/5.5
        errors[err] = error
        err += 1
        print ('For rank %s the RMSE is %s' % (rank, error))
        if error < min_error:
            min_error = error
            best_rank = rank

    print ('The best model was trained with rank %s' % best_rank)

    ####to save model
    #print("Saving model to the given path")
    #model.save(sc, modelname)
    modelnameid = modelnameid + 1


 ## Slot :  1
For rank 5 the RMSE is 1.1681614305385888
For rank 10 the RMSE is 1.1678819944463574
For rank 15 the RMSE is 1.1671706920636706
The best model was trained with rank 15

 ## Slot :  2


KeyboardInterrupt: 

In [None]:
#testing monday

In [50]:
%%time
data = slot2
modelname = path + "\slot" + str(modelnameid) + ".tsv"
data = data.map(lambda x: x.split('\t'))
data2 = data.map(lambda x : [x[i] for i in [0,1,2]]) #only 3 columns exist
data2 = data2.filter(lambda x: isNumber(x[2])) # Remove faulty rows
data2 = data2.map(lambda x: [x[0], x[1], float(x[2])]) #Change plays into float
users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))
plays = data2.map(lambda x: x[2])
data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
training_RDD, validation_RDD = data2.randomSplit([9, 1], seed = 2)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
#test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

seed = 5
#iterations = 10
iterations = 1
regularization_parameter = 0.1
#ranks = [5,10,15]
ranks = [15] #to reduce loop
errors = [0, 0, 0]
err = 0
tolerance = 0.02
alpha = 0.01
min_error = float('inf')
best_rank = -1
best_iteration = -1

output = pd.DataFrame(columns = ['userid', 'songid','score','count'])

for rank in ranks:
    e = 0
    #for i in range(5):
    for i in range(1):
        # Split the data
        training_RDD, validation_RDD = data2.randomSplit([9, 1], seed = 2)
        validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
        #test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

        model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                          lambda_=regularization_parameter,alpha=alpha)
        e += computeRMSE(model,validation_RDD)
        
    spark = SparkSession(sc) #to convert into df
    valid_df = validation_RDD.toDF()
    valid_df_pandas = valid_df.toPandas()
    
    #output = pd.DataFrame(columns = ['userid', 'songid','pred_score','actual_score'])
    for index, row in valid_df_pandas[:300].iterrows():
        #if row['user'] == 43 or row['user'] == 439:
        #    continue
        pred_rating = model.predict( row['user'], row['product'] ) 
        output = output.append( pd.Series([
            int(row['user']), int(row['product']),
            pred_rating, float(row['rating'])
                            ], index= output.columns), ignore_index=True)  
        
    

    error = e/5.5
    errors[err] = error
    err += 1
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)


For rank 15 the RMSE is 0.8759567998416279
The best model was trained with rank 15
Wall time: 49.4 s


In [51]:
output.head()

Unnamed: 0,userid,songid,score,count
0,1.0,816.0,0.009079,8.0
1,345.0,816.0,0.006169,1.0
2,440.0,816.0,0.014888,1.0
3,668.0,816.0,0.017668,2.0
4,669.0,816.0,0.002638,1.0


In [52]:
def calculate_precision(actual, prediction):
    mean = actual['count'].mean()
    relevant = set(np.array(actual[actual['count']>mean/1.3]['songid'].astype('int')))
    recommended = set(np.array(prediction.nlargest(30, ['score'])['songid'].astype('int')))
    precision = len(recommended & relevant)/len(recommended)
    return precision

In [54]:
def calculate_recall(actual, prediction):
    mean = actual['count'].mean()
    relevant = set(np.array(actual[actual['count']>mean/1.3]['songid'].astype('int')))
    recommended = set(np.array(prediction.nlargest(30, ['score'])['songid'].astype('int')))
    if len(relevant) == 0:
        recall = 0
    else:
        recall = len(recommended & relevant)/len(relevant)
    return recall

In [55]:
def evaluation(actual, predictions):
    precision_sum = 0
    recall_sum = 0
    actual_grouped_by_user = actual.groupby('userid') 
    predictions_grouped_by_user = predictions.groupby('userid')
    number_of_users = len(actual_grouped_by_user)
    actual_key_list = actual_grouped_by_user.groups.keys()
    actual_val_list = []
    for item in actual_key_list:
        actual_val_list.append(actual_grouped_by_user.get_group(item))
        
    predictions_key_list = predictions_grouped_by_user.groups.keys()
    predictions_val_list = []
    for item in predictions_key_list:
        predictions_val_list.append(predictions_grouped_by_user.get_group(item))    
    for actual_group, pred_group in zip(actual_val_list, predictions_val_list):
        precision_sum += calculate_precision(actual_group, pred_group)
        recall_sum += calculate_recall(actual_group, pred_group)
    precision = precision_sum/number_of_users
    recall = recall_sum/number_of_users
    return precision, recall

In [56]:
evaluation( output, output)

(0.9045045045045047, 1.0)