# Set environment variable for PySpark

In [1]:
import os
import sys
# Bootstrap PySpark inside the notebook kernel.
#
# A SPARK_HOME that is already exported in the environment is honoured; only when
# it is absent do we fall back to the local install this notebook was last pointed
# at. (Previously two hard-coded paths were assigned back to back: the first was
# dead code and any pre-existing SPARK_HOME was silently overwritten, which also
# made the "not set" check below unreachable.)
DEFAULT_SPARK_HOME = '/Users/jshanahan/Dropbox/Lectures-UC-Berkeley-ML-Class-2015/spark-1.5.0-bin-hadoop2.6/'
spark_home = os.environ.setdefault('SPARK_HOME', DEFAULT_SPARK_HOME)
if not spark_home:
    # Reached when SPARK_HOME is exported but empty.
    raise ValueError('SPARK_HOME environment variable is not set')
if not os.path.isdir(spark_home):
    # Fail fast with a clear message instead of an obscure error from execfile below.
    raise ValueError('SPARK_HOME does not point to a directory: %s' % spark_home)
# Make the PySpark sources shipped with the Spark distribution importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
# NOTE: the py4j archive name is version specific; adjust it if the Spark build
# under SPARK_HOME bundles a different py4j release.
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))
# Running the PySpark shell script in this namespace is what makes `sc` and
# `sqlCtx` available to the cells below (see the banner it prints).
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 1.3.0
      /_/

Using Python version 2.7.10 (default, May 28 2015 17:04:42)
SparkContext available as sc, HiveContext available as sqlCtx.


# ALS Example in Spark

In [2]:
import numpy as np
from numpy.random import rand
from numpy import matrix

def rmse(R, U, V): # Metric
    """Root-mean-square error between R and the matrix product of U and V.

    Parameters
    ----------
    R : 2-D array-like, shape (m, n) -- the observed matrix.
    U : 2-D array-like, shape (m, k) -- left factor.
    V : 2-D array-like, shape (k, n) -- right factor.

    Returns
    -------
    float
        sqrt(sum((R - U.V)**2) / (m * n)).

    Works for both ``numpy.matrix`` and plain ``ndarray`` inputs: the product
    is taken with an explicit dot product rather than ``*``, which would be
    element-wise for ndarrays.
    """
    R, U, V = np.asarray(R), np.asarray(U), np.asarray(V)
    residual = R - U.dot(V)
    # float() forces true division, so integer-typed inputs are not truncated
    # by Python 2's integer `/`.
    return np.sqrt(np.sum(np.square(residual)) / float(U.shape[0] * V.shape[1]))

def solver(mat, R, LAMBDA):
    """Solve the regularised normal equations for a single row of ratings.

    Builds ``A = mat.T * mat`` and ``b = mat.T * R.T``, adds
    ``LAMBDA * mat.shape[0]`` to every diagonal entry of ``A`` and returns the
    solution ``x`` of ``A x = b``.

    Parameters
    ----------
    mat : factor matrix; the callers in this file pass ``numpy.matrix`` objects,
        for which ``*`` is a matrix product.
    R : the ratings to fit, given as a single row so that ``R.T`` is a column.
    LAMBDA : regularisation strength (scaled by the number of rows of ``mat``).

    Returns
    -------
    The result of ``numpy.linalg.solve(A, b)``.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    ridge = LAMBDA * n_rows

    gram = mat.T * mat
    rhs = mat.T * R.T

    # Shift the whole diagonal in one indexed assignment instead of a loop.
    diag = np.arange(n_cols)
    gram[diag, diag] = np.asarray(gram)[diag, diag] + ridge

    return np.linalg.solve(gram, rhs)

# Only the computation is parallelized; the cost of shipping the data to the workers is not considered.
def simpleParalleling(R,InitialU,InitialVt,rank,iterations,numPartitions,LAMBDA=0.01):
    """Alternating least squares with R and the current factors broadcast.

    Each iteration prints the RMSE of the current factors, then re-solves every
    row of U against the broadcast Vt, and every row of Vt against the freshly
    broadcast U. Only the row/column indices are distributed; the data itself
    is read from broadcast variables.

    Parameters
    ----------
    R : ratings matrix (numUsers x numItems).
    InitialU : starting user factors; its first dimension defines numUsers.
    InitialVt : starting item factors; its first dimension defines numItems.
    rank : unused here; kept so the signature matches closedFormALS.
    iterations : number of alternating sweeps.
    numPartitions : number of partitions for the index RDDs.
    LAMBDA : regularisation strength handed to solver().

    Returns
    -------
    (U, Vt) after the last sweep.
    """
    Rb = sc.broadcast(R)
    U = InitialU
    Vt = InitialVt
    # Only Vt needs broadcasting up front: U is recomputed (and then broadcast)
    # before anything reads it, so broadcasting the initial U was wasted work.
    Vtb = sc.broadcast(Vt)
    numUsers = InitialU.shape[0]
    numItems = InitialVt.shape[0]

    for i in range(iterations):
        print("Iteration %d:" % i)
        print("\nRMSE: %f\n" % rmse(R, U, Vt.T))
        U3d = sc.parallelize(range(numUsers), numPartitions) \
           .map(lambda x: solver( Vtb.value, Rb.value[x, :],LAMBDA)) \
           .collect() # one solver() result (a single-column solution) per user
        U = matrix(np.array(U3d)[:, :, 0]) # stack the solutions into a 2-D matrix
        Ub = sc.broadcast(U)

        Vt3d = sc.parallelize(range(numItems), numPartitions) \
           .map(lambda x: solver(Ub.value, Rb.value.T[x,:],LAMBDA)) \
           .collect() # one solver() result (a single-column solution) per item
        Vt = matrix(np.array(Vt3d)[:, :, 0]) # stack the solutions into a 2-D matrix

        # Both jobs above have completed (collect is eager), so the executor
        # copies of the superseded factors can be released.
        Ub.unpersist()
        Vtb.unpersist()
        Vtb = sc.broadcast(Vt)

    # Release what is still cached on the executors before returning.
    Vtb.unpersist()
    Rb.unpersist()
    return U, Vt

# Not only is the calculation parallelized, but the data is also partitioned and shared wisely to improve locality.
def closedFormALS(R,InitialU,InitialVt,rank,iterations,numPartitions,LAMBDA=0.01):
    """Alternating least squares with R distributed as two cached RDDs.

    R is parallelized twice, once as-is and once transposed, so the user step
    maps over the elements of R and the item step maps over the elements of
    R.T. Only the (small) factor matrices are broadcast each iteration. Each
    iteration first prints the RMSE of the current factors.

    Parameters
    ----------
    R : ratings matrix (numUsers x numItems).
    InitialU : starting user factors.
    InitialVt : starting item factors.
    rank : unused here; kept so the signature matches simpleParalleling.
    iterations : number of alternating sweeps.
    numPartitions : number of partitions for the two RDDs built from R.
    LAMBDA : regularisation strength handed to solver().

    Returns
    -------
    (U, Vt) after the last sweep.
    """
    # Iterating a numpy matrix yields its rows, so each RDD element is one row.
    R_Userslice = sc.parallelize(R,numPartitions).cache() # one element per row of R
    R_Itemslice = sc.parallelize(R.T,numPartitions).cache() # one element per row of R.T
    U = InitialU
    Vt = InitialVt

    try:
        for i in range(iterations):

            print("Iteration %d:" % i)
            print("\nRMSE: %f\n" % rmse(R, U, Vt.T))

            Vtb = sc.broadcast(Vt)
            U3d = R_Userslice.map(lambda x:solver(Vtb.value,x,LAMBDA)).collect() # one single-column solution per user
            # collect() is eager, so this broadcast is no longer needed.
            Vtb.unpersist()
            U = matrix(np.array(U3d)[:, :, 0]) # stack the solutions into a 2-D matrix

            Ub = sc.broadcast(U)
            Vt3d = R_Itemslice.map(lambda x:solver(Ub.value,x,LAMBDA)).collect() # one single-column solution per item
            Ub.unpersist()
            Vt = matrix(np.array(Vt3d)[:, :, 0])  # stack the solutions into a 2-D matrix
    finally:
        # Drop the cached copies of R even if an iteration fails; otherwise
        # they stay on the executors after every call.
        R_Userslice.unpersist()
        R_Itemslice.unpersist()

    return U, Vt
        
    
        
def main():
    """Generate a synthetic low-rank rating matrix and factor it with both ALS variants.

    R is built as the product of two random factor matrices (seeded for
    reproducibility). Both solvers start from the same random initial factors
    and print the RMSE at the start of every iteration.
    """
    LAMBDA = 0.01   # regularization parameter
    np.random.seed(100)
    numUsers = 5000
    numItems = 100
    rank = 10
    iterations = 5
    numPartitions = 2

    trueU = matrix(rand(numUsers, rank))  # true matrix U used to generate R
    trueV = matrix(rand(rank, numItems))  # true matrix V used to generate R
    R = matrix(trueU*trueV)               # generated rating matrix

    InitialU = matrix(rand(numUsers, rank))   # initialization of U
    InitialVt = matrix(rand(numItems, rank))  # initialization of V (transposed)

    print("Running ALS with numUser=%d, numItem=%d, rank=%d, iterations=%d, numPartitions=%d\n" %
          (numUsers, numItems, rank, iterations, numPartitions))

    # The two messages below previously read "Tow copies ... colIndx" and
    # "MaUsertrix R"; the garbled wording is corrected.
    print("Distributed Version---Two copies of R, one is partitioned by rowIdx, the other is partitioned by colIdx")
    closedFormALS(R,InitialU,InitialVt,rank,iterations,numPartitions,LAMBDA)

    print("Simple paralleling ---Suppose Matrix R is small enough to be broadcast")
    simpleParalleling(R,InitialU,InitialVt,rank,iterations,numPartitions,LAMBDA)



main()

Running ALS with numUser=5000, numItem=100, rank=10, iterations=5, numPartitions=2

Simple paralleling ---Suppose MaUsertrix R is small enough to be broadcast
Iteration 0:

RMSE: 0.981523

Iteration 1:

RMSE: 0.183839

Iteration 2:

RMSE: 0.046112

Iteration 3:

RMSE: 0.038168

Iteration 4:

RMSE: 0.035043

Distributed Version---Tow copies of R, one is partitioned by rowIdx, the other is partitioned by colIndx
Iteration 0:

RMSE: 0.183839

Iteration 1:

RMSE: 0.046112

Iteration 2:

RMSE: 0.038168

Iteration 3:

RMSE: 0.035043

Iteration 4:

RMSE: 0.033598

