### Import

In [1]:
#conda install -c conda-forge pyspark #conda pyspark 설치
#기타 요청되는 패키지 참고 : https://spark.apache.org/docs/latest/api/python/getting_started/install.html

In [2]:
import pandas as pd
from pyspark.sql.functions import col, explode
from pyspark import SparkContext

### Init pyspark

In [3]:
from pyspark.sql import SparkSession
sc = SparkContext
# sc.setCheckpointDir('checkpoint') -- Dir 경로 설정
spark = SparkSession.builder.appName('Recommendations').getOrCreate()

### Load data

In [4]:
#https://grouplens.org/datasets/movielens/ 데이터 출처
#추후에 우리 데이터에 적용
movies = spark.read.csv("movies.csv",header=True)
ratings = spark.read.csv("ratings.csv",header=True)

In [5]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [6]:
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [7]:
ratings = ratings.\
    withColumn('userId', col('userId').cast('integer')).\
    withColumn('movieId', col('movieId').cast('integer')).\
    withColumn('rating', col('rating').cast('float')).\
    drop('timestamp')
ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



### Calculate sparsity

In [8]:
# rating 데이터 셋의 개수
numerator = ratings.select("rating").count()

# 유저의 수와 영화의 수
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

print("Rate Data Count : %d" % numerator)
print("User Data Count : %d" % num_users)
print("Movie Data Count : %d" % num_movies)

# 전체 발생 가능 데이터의 개수
denominator = num_users * num_movies

# 전체 발생 데이터 대비 현재 데이터 비율 계산
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

Rate Data Count : 100836
User Data Count : 610
Movie Data Count : 9724
The ratings dataframe is  98.30% empty.


### Interpret ratings

In [9]:
# 유저의 작성 리뷰 수
userId_ratings = ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



In [10]:
# 영화의 작성 리뷰 수
movieId_ratings = ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|   2858|  204|
|     50|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



## ALS 모델링 및 측정

In [11]:
# Import
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [12]:
# Test, Train 셋 나누기
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 2017152017)

# ALS 모델링
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")
als

ALS_9ef9c2f1e846

In [13]:
# grid의 Hyperparam 조정(수시로 조정하며 비교해보자)
#다수의 모델로 예측값을 근사화 하기 위함
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 50, 100, 150, 200]) \
    .addGrid(als.regParam, [.01, .05, .1, .15]) \
    .build()
           
# RMSE 유사도 측정하고 이를 기반으로 예측
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print("테스트 모델 : %d개" %len(param_grid))

테스트 모델 : 20개


In [14]:
# Cross validation
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)
print(cv)

CrossValidator_a2fa1aedbd4f


### 최적 모델 및 파라미터 찾기

In [15]:
#*************!!!주의!!! ML 라이브러리 모델링 연산 작업 오래 걸림 !!!!********

# Cross Validation한 als모델로 학습(fitting)
model = cv.fit(train)

#최적 모델 확인(이 부분이 PySpark ML 가장 많이 의존하는 부분)
bestmodel = model.bestModel
print(bestmodel)

#최적 모델 파라미터(Best Param)
print("Rank : ", bestmodel._java_obj.parent().getRank())
print("MaxIter : ", bestmodel._java_obj.parent().getMaxIter())
print("RegParam : ", bestmodel._java_obj.parent().getRegParam())

ALSModel: uid=ALS_9ef9c2f1e846, rank=100
Rank :  100
MaxIter :  10
RegParam :  0.15


In [16]:
# 예측 평가 (RMSE 지표)
test_predictions = bestmodel.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.8686309576639267


In [17]:
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|   1197|   3.0| 3.6441839|
|   148|   4308|   4.0| 3.0203066|
|   148|   4896|   4.0| 3.5293715|
|   148|  54001|   4.0| 3.7257106|
|   148|  69757|   3.5| 3.7190824|
|   148|  79091|   3.5| 3.4855065|
|   148|  88125|   4.0|  3.716111|
|   148|  89745|   4.0|  3.399281|
|   148|  98491|   5.0| 3.7802508|
|   148| 122882|   4.0| 3.4872246|
|   148| 152081|   4.0| 3.4503038|
|   463|    552|   3.5|  3.242611|
|   463|   1320|   4.0| 3.2979608|
|   463|   2019|   4.0|   4.03903|
|   463|   2167|   3.0| 3.5896523|
|   463|   4310|   3.0| 3.1032002|
|   463|   6377|   3.5| 3.9018312|
|   471|   8961|   3.5|  3.502676|
|   496|   1221|   4.0| 3.7824123|
|   496|   5952|   4.0| 3.4276402|
+------+-------+------+----------+
only showing top 20 rows



## 추천

In [21]:
#모든 유저에 대해서 추천 결과 출력
#괄호의 인자는 추천할 갯수
nrecommendations = bestmodel.recommendForAllUsers(10)

#10명만 예시로 출력해봄
nrecommendations.limit(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{170355, 5.74559...|
|     3|[{6835, 4.836563}...|
|     5|[{132333, 4.65060...|
|     6|[{33649, 4.733172...|
|     9|[{3379, 4.9066386...|
|    12|[{67618, 5.643062...|
|    13|[{170355, 5.30369...|
|    15|[{60943, 4.516289...|
|    16|[{170355, 4.48317...|
|    17|[{170355, 5.12963...|
+------+--------------------+



In [22]:
nrecommendations = nrecommendations.withColumn("rec_exp", explode("recommendations")).select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))
nrecommendations.limit(20).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|     1| 170355| 5.745598|
|     1|   3379| 5.745598|
|     1|  33649| 5.732105|
|     1|   7748| 5.616473|
|     1|   5490|  5.56624|
|     1| 132333|  5.56624|
|     1|  27156|5.5360126|
|     1| 171495|5.4380703|
|     1|   7025|5.4303746|
|     1|   5915|5.4272704|
|     3|   6835| 4.836563|
|     3|   5746| 4.836563|
|     3|  70946|4.8259654|
|     3|   5181|4.7189574|
|     3|   5919| 4.715168|
|     3|   4518|4.5783157|
|     3|   2851| 4.563806|
|     3|   5764| 4.352907|
|     3|   7899| 4.352907|
|     3|  26409| 4.180437|
+------+-------+---------+



In [36]:
#한 유저에 대한 추천
print("** 유저 123 추천리스트 **")
nrecommendations.join(movies, on='movieId').filter('userId = 123').show()
print("** 유저 123 평가기록 **")
ratings.join(movies, on='movieId').filter('userId = 123').sort('rating', ascending=False).limit(10).show()

** 유저 123 추천리스트 **
+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|  33649|   123|4.8141875|  Saving Face (2004)|Comedy|Drama|Romance|
| 170355|   123|4.7753453|Mulholland Dr. (1...|Drama|Mystery|Rom...|
|   3379|   123|4.7753453| On the Beach (1959)|               Drama|
| 171495|   123| 4.636222|              Cosmos|  (no genres listed)|
|  78836|   123|4.6148853|Enter the Void (2...|               Drama|
| 134796|   123|4.6148853|  Bitter Lake (2015)|         Documentary|
| 184245|   123|4.6148853|De platte jungle ...|         Documentary|
|   7071|   123|4.6148853|Woman Under the I...|               Drama|
| 138966|   123|4.6148853|Nasu: Summer in A...|           Animation|
| 179135|   123|4.6148853|Blue Planet II (2...|         Documentary|
+-------+------+---------+--------------------+--------------------+

** 유저 123 평가기록

+-------+-----+
|movieId|count|
+-------+-----+
|   1580|  165|
|   2366|   25|
|   3175|   75|
|   1088|   42|
|  32460|    4|
|  44022|   23|
|  96488|    4|
|   1238|    9|
|   1342|   11|
|   1591|   26|
|   1645|   51|
|   4519|    9|
|   2142|   10|
|    471|   40|
|   3997|   12|
|    833|    6|
|   3918|    9|
|   7982|    4|
|   1959|   15|
|  68135|   10|
+-------+-----+
only showing top 20 rows

