In [None]:
cd <your path>

# Beginning

## Load modules

In [None]:
! pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 26kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 38.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=5663394ce13b9f2efc776998c3c4f00158e4fe1b17c2290fa84539645450b350
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
from pyspark import SparkContext, SparkConf
from pyspark import StorageLevel

import pyspark.sql.functions as func
from pyspark.sql import Window
from pyspark.sql import SparkSession


from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.feature import StringIndexer
# from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics

from operator import itemgetter
import time
import sys
import getpass
from pathlib import Path

import itertools

## Define the Parameters

In [None]:
# Directory holding the input parquet/text files (relative to the working directory).
PATH = './data'

# WHOLE_DATA=True trains on the full training set; otherwise the pre-built
# sub-sample whose file name is derived from FRACTION is used.
WHOLE_DATA = True
FRACTION = 1.00
# FRACTION is embedded in the output names with '.' replaced by '_' (1.0 -> '1_0').
MODEL_PATH = 'saved_tuned_best_model_{}'.format(str(FRACTION).replace('.','_'))
STATS_PATH = Path('./saved_tuned_model_stats')

# TOP: number of tracks recommended per user; PREC_AT: k used for precision@k.
TOP=500
PREC_AT = 500


# Memory given to both the driver and the executors.
# NOTE(review): getOrCreate() returns an already-running session unchanged, so
# restart the kernel after editing MAX_MEMORY for the new value to take effect.
MAX_MEMORY = "4g"
spark = SparkSession \
    .builder \
    .appName('quq') \
    .config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

# Reuse the context owned by the session instead of building a throwaway
# SparkConf and going through a second get-or-create call.
sc = spark.sparkContext

# Load the data

In [None]:
# The validation user index comes from the same parquet file in both modes;
# only the training file (and an optional down-sampling step) differs.
unique_user_index_val = spark.read.parquet(PATH+'/unique_user_index_val.parquet')

if WHOLE_DATA:
    train_file = '/cf_train_trans.parquet'
else:
    train_file = '/cf_train_trans_{}.parquet'.format(str(FRACTION).replace('.','_'))
    # Keep at least 5% of the validation users even for very small fractions.
    # NOTE(review): sample() is called without a seed, so the selected users
    # differ between runs.
    unique_user_index_val = unique_user_index_val.sample(withReplacement=False, fraction=max(FRACTION,0.05))

train = spark.read.parquet(PATH+train_file)

# Each line of the text file is turned back into a Python object with eval.
# SECURITY NOTE(review): eval executes arbitrary code found in the file; only
# load files you produced yourself, or switch to ast.literal_eval if every
# line is a plain literal.
true_rec_tracks_val_rdd = sc.textFile(PATH+'/true_rec_tracks_val').map(eval)
print('Successfully loaded the data')

Successfully loaded the data


# Tune ALS Model

In [None]:
# Hyper-parameter grid for the ALS search; every combination is fitted once.
ranks = list(range(100, 200, 30))
# ranks = list(range(190, 300, 30))
regParams = [1]  # alternative grid: [10**i for i in range(-2, 1)]
alphas = [1]
maxIter = [5]

# Materialise the grid once so the reported length and the loop below cannot drift apart.
params = [list(combo) for combo in itertools.product(ranks, regParams, alphas, maxIter)]
print('length of params',len(params))

# Best precision@PREC_AT seen so far; a model is saved only when it beats this value.
precisionAt_k_highest = 0

# One line per fitted model is appended to this file (and echoed to the cell output).
report_template = "With Rank:{}, Reg:{}, Alpha:{}, Maxiter:{}, Metric: {}, time TTL: {}, fit: {}, rec: {}, evl: {}"

# Appending fails if the directory is missing, so create it up front.
STATS_PATH.mkdir(parents=True, exist_ok=True)
stats_file_path = STATS_PATH/'stats_{}.txt'.format(str(FRACTION).replace('.','_'))

# The context manager closes the stats file even when a fit or evaluation raises
# part-way through the grid; buffering=1 keeps it line-buffered so finished
# results are on disk immediately.
with open(stats_file_path, 'a', buffering=1) as stats_file:
    # Record how this process was launched, as a separator between runs.
    print(' '.join(sys.argv))
    print(' '.join(sys.argv), file=stats_file)

    for rk, reg, alp, it in params:
        since = time.time()

        als = ALS(userCol='user_index', itemCol='track_index', ratingCol='count',
                  implicitPrefs=True, coldStartStrategy="drop",
                  rank=rk, regParam=reg, alpha=alp, maxIter=it)
        model = als.fit(train)
        time_to_fit = time.time() - since

        # Top-TOP recommendations for the validation users, reduced to
        # (user_index, [track_index, ...]) pairs.
        userRecs = model.recommendForUserSubset(unique_user_index_val, TOP)
        pred_rec_tracks_val_rdd = userRecs.rdd.map(
            lambda row: (row['user_index'],
                         [track_pred.track_index for track_pred in row['recommendations']]))

        # Pair each user's predicted list with the true list; the join key is dropped.
        pred_and_true_tracks = pred_rec_tracks_val_rdd.join(true_rec_tracks_val_rdd).map(lambda tup: tup[1])
        # NOTE: the map/join above are lazy Spark transformations, so almost no work
        # has run at this point. The "rec" time is therefore small and the actual
        # recommendation cost is included in the "evl" time below.
        time_to_recommend = time.time() - since

        metrics = RankingMetrics(pred_and_true_tracks)
        precisionAt_k = metrics.precisionAt(PREC_AT)
        time_to_eval = time.time() - since

        # Persist only the best model so far, replacing any earlier one.
        if precisionAt_k > precisionAt_k_highest:
            precisionAt_k_highest = precisionAt_k
            model.write().overwrite().save(MODEL_PATH)

        # Build the report once and send it to both the stats file and the notebook.
        report = report_template.format(
            rk, reg, alp, it,
            precisionAt_k,
            time_to_eval, time_to_fit, time_to_recommend-time_to_fit, time_to_eval-time_to_recommend
        )
        print(report, file=stats_file)
        print(report)

length of params 4
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-da9016d2-1942-4739-8ca3-5b76bff19d93.json
With Rank:100, Reg:1, Alpha:1, Maxiter:5, Metric: 0.0093638, time TTL: 3787.433854341507, fit: 2855.6211857795715, rec: 6.690388917922974, evl: 925.1222796440125


Py4JJavaError: ignored