# Tugas 4 Big Data - Recommendation System

Dataset: Clothing Fit Data (Rent the Runway) — http://jmcauley.ucsd.edu/data/renttherunway/renttherunway_final_data.json.gz

### 1. Spark Initialization

In [2]:
# findspark must be initialised before the pyspark imports in the following cells
# so that the local Spark installation can be found.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Start the Spark session used by every later cell (an already running
# session is reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName("Python Spark Recommendation System Example")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x000001C92543FF98>


In [4]:
import os

import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

### 2. Load Data

In [7]:
# Location of the Rent the Runway review file. Set the RENTTHERUNWAY_DATA_PATH
# environment variable to point somewhere else instead of editing this cell;
# when it is unset the original author's local path is used.
DATA_PATH = os.environ.get(
    "RENTTHERUNWAY_DATA_PATH",
    "D:/KULIAH/SMT-6_BIG-DATA/renttherunway_final_data.json",
)

# Read the JSON records and discard every row that contains a null value.
df2 = spark.read.json(DATA_PATH).na.drop()

In [8]:
# Inspect the schema Spark produced for the loaded JSON file.
df2.schema

StructType(List(StructField(age,StringType,true),StructField(body type,StringType,true),StructField(bust size,StringType,true),StructField(category,StringType,true),StructField(fit,StringType,true),StructField(height,StringType,true),StructField(item_id,StringType,true),StructField(rating,StringType,true),StructField(rented for,StringType,true),StructField(review_date,StringType,true),StructField(review_summary,StringType,true),StructField(review_text,StringType,true),StructField(size,LongType,true),StructField(user_id,StringType,true),StructField(weight,StringType,true)))

In [9]:
# Preview the first rows of the loaded data.
df2.show()

+---+-----------------+---------+--------+-----+------+-------+------+-------------+------------------+--------------------+--------------------+----+-------+------+
|age|        body type|bust size|category|  fit|height|item_id|rating|   rented for|       review_date|      review_summary|         review_text|size|user_id|weight|
+---+-----------------+---------+--------+-----+------+-------+------+-------------+------------------+--------------------+--------------------+----+-------+------+
| 28|        hourglass|      34d|  romper|  fit| 5' 8"|2260466|    10|     vacation|    April 20, 2016|So many compliments!|An adorable rompe...|  14| 420272|137lbs|
| 36|straight & narrow|      34b|    gown|  fit| 5' 6"| 153475|    10|        other|     June 18, 2013|I felt so glamour...|I rented this dre...|  12| 273551|132lbs|
| 34|             pear|      34c|   dress|  fit| 5' 5"| 126335|     8|formal affair| February 12, 2014|Dress arrived on ...|I rented this for...|   8| 909926|135lbs|
| 27

In [10]:
# Keep only the three columns the recommender needs and turn each of them
# from a string column into an integer column.
df2 = df2.select([df2[name].cast("int") for name in ("user_id", "item_id", "rating")])

# Show a sample of the result and its schema.
df2.show()
df2.printSchema()

# Expose the reduced frame to Spark SQL under the name "rating".
df2.createOrReplaceTempView("rating")

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
| 420272|2260466|    10|
| 273551| 153475|    10|
| 909926| 126335|     8|
| 151944| 616682|    10|
| 734848| 364092|     8|
| 336066| 568429|    10|
|  86661| 130259|    10|
| 154309|1729232|    10|
| 185966|1077123|     8|
| 533900| 130259|     8|
|  87660|1295171|    10|
| 391778| 143094|     8|
| 721308| 123793|    10|
| 829124|2595752|    10|
| 499943| 909221|    10|
| 339899|1622747|    10|
| 649288| 172027|     8|
|  16800|1229740|    10|
| 661150| 900878|    10|
| 983550| 197391|     8|
+-------+-------+------+
only showing top 20 rows

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



### 3. Create Model

In [11]:
# Split the ratings roughly 80/20 into a training and a test set.
# A fixed seed makes the split repeatable for the same input data instead of
# changing every time the cell is run.
(training, test) = df2.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Configure the ALS recommender that is trained on the training split.
# - coldStartStrategy="drop" removes predictions for users/items that were not
#   seen during training, so the evaluation metric does not become NaN.
# - seed fixes the random initialisation so repeated runs build the same model.
# The estimator is only configured here; it is fitted once in the next cell
# (fitting it in this cell as well would train the same model twice).
# NOTE(review): the recorded run reports RMSE ~13.7 and recommendation scores
# above 100 while the displayed ratings are at most 10 -- regParam / maxIter
# probably need tuning; confirm before relying on these recommendations.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

In [13]:
# Fit the ALS estimator on the training split; `model` is what the evaluation
# and recommendation cells below use.
model = als.fit(training)

In [14]:
# Score the held-out test split and report the root-mean-square error between
# the predicted and the actual ratings.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 13.703047287614117


In [15]:
# For every user: the 10 items with the highest predicted rating.
userRecs = model.recommendForAllUsers(numItems=10)
# For every item: the 10 users with the highest predicted rating.
itemRecs = model.recommendForAllItems(numUsers=10)

In [16]:
# Preview the per-user recommendations (each row holds a list of item/score pairs).
userRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|   1088|[[2701429, 11.675...|
|   3997|[[574909, 37.3495...|
|   4900|[[402211, 45.3942...|
|   6397|[[246463, 41.9044...|
|   7240|[[2481108, 17.945...|
|  17420|[[2074214, 12.406...|
|  19079|[[2794512, 76.612...|
|  22097|[[1982555, 33.866...|
|  22223|[[1959063, 19.001...|
|  25517|[[2655365, 50.655...|
|  27760|[[396259, 80.8482...|
|  28146|[[2368838, 29.581...|
|  28577|[[1842684, 9.9997...|
|  29993|[[2525612, 14.873...|
|  35982|[[1115223, 60.554...|
|  41575|[[1167885, 69.811...|
|  44022|[[1030312, 22.733...|
|  47084|[[1877137, 29.481...|
|  58061|[[470444, 13.8292...|
|  63106|[[1315960, 11.260...|
+-------+--------------------+
only showing top 20 rows



In [18]:
# Preview the per-item recommendations (each row holds a list of user/score pairs).
itemRecs.show()

+-------+--------------------+
|item_id|     recommendations|
+-------+--------------------+
|1007290|[[716322, 101.004...|
|1333481|[[949937, 117.337...|
|1353371|[[762525, 95.6554...|
|2735011|[[429540, 106.322...|
|2088432|[[752953, 125.637...|
| 960713|[[752953, 87.9724...|
| 992623|[[553521, 141.926...|
|1126983|[[500850, 113.610...|
|1406963|[[270847, 121.228...|
|1438673|[[350442, 105.767...|
|2707303|[[832321, 105.967...|
|1473684|[[565950, 168.643...|
|1491185|[[820028, 71.0015...|
|1984705|[[270847, 163.911...|
|2411425|[[762525, 120.362...|
| 277366|[[820028, 193.528...|
|1312996|[[565950, 62.3102...|
|1733906|[[731844, 161.940...|
|2309796|[[949937, 151.499...|
|2336886|[[396073, 53.3747...|
+-------+--------------------+
only showing top 20 rows

