# Final Project Big Data

### Spark Recommendation (ALS)

Dataset = Book-Crossing Dataset from http://www2.informatik.uni-freiburg.de/~cziegler/BX/

## Spark Initialization

In [1]:
# Import findspark to read SPARK_HOME and HADOOP_HOME
import findspark
findspark.init()

In [2]:
# Import required library
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession \
    .builder \
    .appName("Final Project Big Data Recommendation System") \
    .getOrCreate()

In [3]:
# Print Spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x000002C2193953C8>


In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import types
from pyspark.sql.types import *
from pyspark.sql import functions as F

## Praproses

In [5]:
lines = spark.read.csv("F:/fpbigdata/fpedit/dataset/BX-Book-Ratings.csv", header=True, inferSchema=True)

In [6]:
print(lines.take(5))

[Row(User-ID=276725, ISBN='034545104X', Book-Rating='0'), Row(User-ID=276726, ISBN='0155061224', Book-Rating='5'), Row(User-ID=276727, ISBN='0446520802', Book-Rating='0'), Row(User-ID=276729, ISBN='052165615X', Book-Rating='3'), Row(User-ID=276729, ISBN='0521795028', Book-Rating='6')]


In [7]:
lines = lines.selectExpr(['`User-ID` as uid','`ISBN` as iid','`Book-Rating` as rating'])

In [8]:
lines.show()

+------+----------+------+
|   uid|       iid|rating|
+------+----------+------+
|276725|034545104X|     0|
|276726|0155061224|     5|
|276727|0446520802|     0|
|276729|052165615X|     3|
|276729|0521795028|     6|
|276733|2080674722|     0|
|276736|3257224281|     8|
|276737|0600570967|     6|
|276744|038550120X|     7|
|276745| 342310538|    10|
|276746|0425115801|     0|
|276746|0449006522|     0|
|276746|0553561618|     0|
|276746|055356451X|     0|
|276746|0786013990|     0|
|276746|0786014512|     0|
|276747|0060517794|     9|
|276747|0451192001|     0|
|276747|0609801279|     0|
|276747|0671537458|     9|
+------+----------+------+
only showing top 20 rows



In [9]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString, StringIndexer

stringindexer = StringIndexer(inputCol='iid',outputCol='iid_int')
stringindexer.setHandleInvalid("keep")
model = stringindexer.fit(lines)
lines_int = model.transform(lines)

stringindexer = StringIndexer(inputCol='rating',outputCol='rating_int')
stringindexer.setHandleInvalid("keep")
model = stringindexer.fit(lines_int)
lines_int_fix = model.transform(lines_int)

In [10]:
lines_int_fix.show()

+------+----------+------+--------+----------+
|   uid|       iid|rating| iid_int|rating_int|
+------+----------+------+--------+----------+
|276725|034545104X|     0|  1636.0|       0.0|
|276726|0155061224|     5| 87069.0|       5.0|
|276727|0446520802|     0|   568.0|       0.0|
|276729|052165615X|     3|310005.0|       8.0|
|276729|0521795028|     6|147200.0|       6.0|
|276733|2080674722|     0| 77066.0|       0.0|
|276736|3257224281|     8| 35182.0|       1.0|
|276737|0600570967|     6|293513.0|       6.0|
|276744|038550120X|     7|   232.0|       3.0|
|276745| 342310538|    10| 87749.0|       2.0|
|276746|0425115801|     0|   446.0|       0.0|
|276746|0449006522|     0|   604.0|       0.0|
|276746|0553561618|     0|   424.0|       0.0|
|276746|055356451X|     0|   280.0|       0.0|
|276746|0786013990|     0| 24580.0|       0.0|
|276746|0786014512|     0| 14934.0|       0.0|
|276747|0060517794|     9|  1413.0|       4.0|
|276747|0451192001|     0|   933.0|       0.0|
|276747|06098

## Create Model

In [11]:
(training, test) = lines_int_fix.randomSplit([0.8, 0.2])

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import pandas as pd

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics

# training
als = ALS(maxIter=5, regParam=0.01, userCol="uid", itemCol="iid_int", ratingCol="rating_int",
          coldStartStrategy="drop")
model = als.fit(training)

In [13]:
# Evaluate the model by computing the RMSE on the test data

predictions = model.transform(test.select(['uid','iid_int','rating_int']))
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating_int",
                                predictionCol="prediction")

In [15]:
predictions.show()

+------+-------+----------+-----------+
|   uid|iid_int|rating_int| prediction|
+------+-------+----------+-----------+
|128325|  148.0|       1.0|  0.2593763|
| 85469|  148.0|       0.0| 0.45757502|
| 39646|  148.0|       0.0| 0.26206467|
|190265|  148.0|       0.0|  0.6133786|
| 52320|  148.0|       0.0| 0.30535117|
| 87974|  148.0|       0.0|  0.2536206|
|176887|  148.0|       0.0| 0.73066115|
|126783|  148.0|       0.0|   1.689151|
|126296|  148.0|       3.0| 0.14112091|
|170151|  148.0|       0.0| 0.30627152|
| 55187|  148.0|       0.0|-0.26722392|
|135045|  148.0|       0.0| 0.16514327|
| 99277|  148.0|       0.0|        0.0|
|136733|  148.0|       0.0|  1.5444038|
| 65663|  148.0|       0.0|  0.5297955|
|227538|  148.0|       1.0|-0.55059314|
|194600|  148.0|       0.0|  1.0464805|
| 35599|  148.0|       0.0| 0.21653593|
|236621|  148.0|       4.0|  2.2070847|
|222815|  148.0|       6.0|        0.0|
+------+-------+----------+-----------+
only showing top 20 rows



In [16]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.765616461076877


In [17]:
# Generate top 10 kindle recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each kindle
kindleRecs = model.recommendForAllItems(10)

In [18]:
userRecs.show()

+----+--------------------+
| uid|     recommendations|
+----+--------------------+
| 463|[[96910, 2.864231...|
| 496|[[125235, 5.56441...|
|1238|[[324380, 0.0], [...|
|1591|[[66670, 4.491403...|
|1829|[[334813, 21.0745...|
|2366|[[335730, 3.34303...|
|3175|[[314345, 13.2731...|
|3918|[[157703, 7.94734...|
|4900|[[16017, 23.96244...|
|5300|[[16010, 9.27822]...|
|5803|[[325821, 0.0], [...|
|6336|[[46852, 51.38335...|
|6357|[[0, 0.0], [10, 0...|
|6397|[[43036, 14.42467...|
|6466|[[43036, 12.08002...|
|6654|[[289336, 9.88285...|
|7253|[[181558, 15.2299...|
|7340|[[324380, 0.0], [...|
|7982|[[157302, 40.8238...|
|8086|[[16758, 15.18254...|
+----+--------------------+
only showing top 20 rows



In [19]:
kindleRecs.show()

+-------+--------------------+
|iid_int|     recommendations|
+-------+--------------------+
|    148|[[167176, 11.1398...|
|    463|[[193612, 16.9533...|
|    471|[[193612, 29.2024...|
|    496|[[122026, 15.1684...|
|    833|[[122026, 23.5405...|
|   1088|[[148996, 13.6660...|
|   1238|[[20147, 14.53997...|
|   1342|[[256303, 21.6534...|
|   1580|[[43307, 24.94523...|
|   1591|[[130063, 13.3577...|
|   1645|[[148996, 21.1738...|
|   1829|[[268761, 24.9287...|
|   1959|[[69980, 36.59109...|
|   2122|[[202353, 26.9485...|
|   2142|[[26070, 20.95494...|
|   2366|[[256303, 38.0626...|
|   2659|[[163169, 34.1202...|
|   2866|[[162474, 24.4992...|
|   3175|[[136357, 23.3752...|
|   3749|[[251754, 12.1458...|
+-------+--------------------+
only showing top 20 rows

