In [1]:
TRAIN1_DATA_PATH = './ml-100k/ua.base'
TEST1_DATA_PATH = './ml-100k/ua.test'
TRAIN2_DATA_PATH = './ml-100k/ub.base'
TEST2_DATA_PATH = './ml-100k/ub.test'

# Matrix Factorization

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName('EE551Project')\
        .getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

## Import the train and test dataset

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType

train_schema = StructType([
    StructField('userId', IntegerType()),
    StructField('itemId', IntegerType()),
    StructField('rating', IntegerType()),
    StructField('timestemp', IntegerType()),
])

train1 = spark.read.csv(TRAIN1_DATA_PATH,
                      sep='\t',
                      header=False,
                      schema=train_schema)

train2 = spark.read.csv(TRAIN2_DATA_PATH, 
                        sep='\t',
                        header=False,
                        schema=train_schema)

train = train1.union(train2)

In [4]:
train.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestemp: integer (nullable = true)



In [5]:
train.show(5)

+------+------+------+---------+
|userId|itemId|rating|timestemp|
+------+------+------+---------+
|     1|     1|     5|874965758|
|     1|     2|     3|876893171|
|     1|     3|     4|878542960|
|     1|     4|     3|876893119|
|     1|     5|     3|889751712|
+------+------+------+---------+
only showing top 5 rows



In [6]:
test_schema = train_schema 

test1 = spark.read.csv(TEST1_DATA_PATH,
                      sep='\t',
                      header=False,
                      schema=test_schema)
test2 = spark.read.csv(TEST2_DATA_PATH,
                      sep='\t',
                      header=False,
                      schema=test_schema)

test = test1.union(test2)

In [7]:
test.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestemp: integer (nullable = true)



In [8]:
test.show(5)

+------+------+------+---------+
|userId|itemId|rating|timestemp|
+------+------+------+---------+
|     1|    20|     4|887431883|
|     1|    33|     4|878542699|
|     1|    61|     4|878542420|
|     1|   117|     3|874965739|
|     1|   155|     2|878542201|
+------+------+------+---------+
only showing top 5 rows



# ALS Models
## Only using userId, itemId, and rating

In [9]:
from pyspark.ml.recommendation import ALS

als = ALS(userCol='userId', 
          itemCol='itemId',
          ratingCol='rating', 
          rank=5,
          maxIter= 5,
          regParam=0.01,
          nonnegative = True, 
          implicitPrefs = False)

In [10]:
model = als.fit(train)
train_results = model.transform(train)
train_results.show(5)

+------+------+------+---------+----------+
|userId|itemId|rating|timestemp|prediction|
+------+------+------+---------+----------+
|   251|   148|     2|886272547| 2.8860848|
|   580|   148|     4|884125773| 3.5820909|
|   633|   148|     1|875326138| 3.0441895|
|   633|   148|     1|875326138| 3.0441895|
|   642|   148|     5|885604163| 4.1708684|
+------+------+------+---------+----------+
only showing top 5 rows



## Training summary

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

print('RMSE: ', evaluator.evaluate(train_results))

RMSE:  0.8201058766532773


In [12]:
train_results.select('rating').describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|            181140|
|   mean| 3.523744065363807|
| stddev|1.1258800843524963|
|    min|                 1|
|    max|                 5|
+-------+------------------+



## Make predictions on the test dataset

In [13]:
test_predictions = model.transform(test)
test_predictions.show(5)

+------+------+------+---------+----------+
|userId|itemId|rating|timestemp|prediction|
+------+------+------+---------+----------+
|   251|   148|     2|886272547| 2.8860848|
|   580|   148|     4|884125773| 3.5820909|
|    27|   148|     3|891543129| 3.0845172|
|   332|   148|     5|887938486| 3.8928246|
|   602|   148|     4|888638517| 4.0439777|
+------+------+------+---------+----------+
only showing top 5 rows



In [15]:
print('RMSE: ', evaluator.evaluate(test_predictions))

RMSE:  0.8540376433930686
