<a href="https://colab.research.google.com/github/jun9729/pyspark_project/blob/main/movie_recommend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
# Pyspark Library #
# SQL
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean, col, split, regexp_extract, when, lit
# ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [48]:
# Create the Spark session used by every cell below, or reuse the one that
# is already running in this kernel.
spark = (
    SparkSession.builder
    .appName("movie_recommend")
    .getOrCreate()
)

In [57]:
# Load the ratings file: the first row is the header and column types are
# inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/movie_ratings.csv")
)
df.show()

+------+------+----------------+
|userId|rating|           title|
+------+------+----------------+
|     1|   4.0|Toy Story (1995)|
|     5|   4.0|Toy Story (1995)|
|     7|   4.5|Toy Story (1995)|
|    15|   2.5|Toy Story (1995)|
|    17|   4.5|Toy Story (1995)|
|    18|   3.5|Toy Story (1995)|
|    19|   4.0|Toy Story (1995)|
|    21|   3.5|Toy Story (1995)|
|    27|   3.0|Toy Story (1995)|
|    31|   5.0|Toy Story (1995)|
|    32|   3.0|Toy Story (1995)|
|    33|   3.0|Toy Story (1995)|
|    40|   5.0|Toy Story (1995)|
|    43|   5.0|Toy Story (1995)|
|    44|   3.0|Toy Story (1995)|
|    45|   4.0|Toy Story (1995)|
|    46|   5.0|Toy Story (1995)|
|    50|   3.0|Toy Story (1995)|
|    54|   3.0|Toy Story (1995)|
|    57|   5.0|Toy Story (1995)|
+------+------+----------------+
only showing top 20 rows



In [58]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString

# Encode every distinct movie title as a numeric `movieId` so the title can
# later serve as the ALS item column.
stringIndexer = StringIndexer().setInputCol('title').setOutputCol('movieId')

# Keep the fitted indexer around: `model.labels` is what maps a movieId back
# to its title when recommendations are displayed.
model = stringIndexer.fit(df)
indexed = model.transform(df)

# Preview the first few indexed rows.
indexed.limit(5).toPandas()

Unnamed: 0,userId,rating,title,movieId
0,1,4.0,Toy Story (1995),11.0
1,5,4.0,Toy Story (1995),11.0
2,7,4.5,Toy Story (1995),11.0
3,15,2.5,Toy Story (1995),11.0
4,17,4.5,Toy Story (1995),11.0


In [59]:
# ALS recommender algorithm
from pyspark.ml.recommendation import ALS

# One fixed seed for both the train/test split and the ALS factor
# initialisation, so the fitted model and the metrics computed from it are
# reproducible on a fresh "Restart & Run All".
SEED = 42

# 75% of the ratings for training, 25% held out for evaluation.
train, test = indexed.randomSplit([0.75, 0.25], seed=SEED)

rec = ALS(maxIter=10,
          regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol='rating',  # 'title' is not needed for prediction
          nonnegative=True,
          # Drop rows whose prediction is NaN (user/movie unseen in training)
          # so they do not poison the evaluation metrics.
          coldStartStrategy='drop',
          seed=SEED)
# Fit the ALS model on the training DataFrame.
rec_model = rec.fit(train)

# Predict ratings for the held-out DataFrame with transform().
pred_ratings = rec_model.transform(test)
pred_ratings.show()

+------+------+---------------+-------+----------+
|userId|rating|          title|movieId|prediction|
+------+------+---------------+-------+----------+
|   597|   5.0|Rain Man (1988)|  148.0|  4.013526|
|   332|   4.0|Rain Man (1988)|  148.0| 3.3470201|
|   577|   4.0|Rain Man (1988)|  148.0| 3.7954178|
|   606|   4.0|Rain Man (1988)|  148.0| 4.2757907|
|   103|   5.0|Rain Man (1988)|  148.0| 3.8720973|
|   330|   4.0|Rain Man (1988)|  148.0| 3.9671888|
|   230|   3.0|Rain Man (1988)|  148.0|  4.569448|
|   157|   4.0|Rain Man (1988)|  148.0| 3.8274999|
|   232|   4.0|Rain Man (1988)|  148.0| 3.4499242|
|    47|   4.0|Rain Man (1988)|  148.0|  3.325891|
|   305|   4.0|Rain Man (1988)|  148.0| 3.5082836|
|   363|   3.0|Rain Man (1988)|  148.0| 4.0165424|
|   274|   3.5|Rain Man (1988)|  148.0| 3.0592623|
|   474|   4.0|Rain Man (1988)|  148.0| 4.0432987|
|   169|   4.5|Rain Man (1988)|  148.0|  4.520741|
|   227|   4.5|Rain Man (1988)|  148.0|  4.048698|
|   531|   5.0|Rain Man (1988)|

In [60]:
from pyspark.ml.evaluation import RegressionEvaluator

# Both evaluators compare the true 'rating' with the ALS 'prediction';
# they differ only in the metric they report.
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')
mae_eval = RegressionEvaluator(metricName='mae',
                               labelCol='rating',
                               predictionCol='prediction')

# evaluate() takes the DataFrame that holds the predictions.
rmse = evaluator.evaluate(pred_ratings)
mae = mae_eval.evaluate(pred_ratings)

print("RMSE:", rmse)
print("MAE:", mae)

RMSE: 1.0451856741422667
MAE: 0.7896248567737227


In [61]:
unique_movies = indexed.select("movieId").distinct()
def top_movies(user_id, n):
    """
    Print the ``n`` movies the ALS model predicts ``user_id`` would rate highest.

    Only movies for which the user has no row in ``indexed`` (i.e. movies the
    user has not rated yet) are scored.

    Parameters
    ----------
    user_id : int or int-like
        Id of the user to recommend for; converted with ``int()`` before it
        is attached to the candidate movies.
    n : int
        Number of recommendations to display.

    Returns
    -------
    None
        The recommendations are printed with ``DataFrame.show``; nothing is
        returned.

    Notes
    -----
    The ``prediction`` column is the raw ALS score and is not clipped, so it
    can exceed the highest rating present in the data.
    """
    # Movies this user already has a rating for.
    watched_movies = indexed.filter(indexed['userId'] == user_id)\
                            .select('movieId')

    # Anti-join: keep only the movies that have NO match among the user's
    # rated movies. `unique_movies` is already distinct, so the result is too.
    remaining_movies = unique_movies.join(watched_movies,
                                          on='movieId',
                                          how='left_anti')

    # Attach the (constant) user id so each row is a (movieId, userId) pair
    # the ALS model can score.
    remaining_movies = remaining_movies.withColumn('userId',
                                                   lit(int(user_id)))

    # Score every candidate with the fitted ALS model and keep the n best.
    recommender = rec_model.transform(remaining_movies)\
                           .orderBy('prediction', ascending=False)\
                           .limit(n)

    # Reverse the StringIndexer encoding: numeric movieId -> title string.
    movie_title = IndexToString(inputCol='movieId',
                                outputCol='title',
                                labels=model.labels)
    final_recommendations = movie_title.transform(recommender)

    # show() prints the table and returns None, so there is nothing to return.
    final_recommendations.show(n, truncate=False)

In [63]:
# Show the five highest-scored unseen movies for user 100.
top_movies(user_id=100, n=5)

+-------+------+----------+------------------------------+
|movieId|userId|prediction|title                         |
+-------+------+----------+------------------------------+
|3614.0 |100   |8.286353  |Topo, El (1970)               |
|2701.0 |100   |7.611109  |Capturing the Friedmans (2003)|
|1750.0 |100   |7.3170657 |Barcelona (1994)              |
|3206.0 |100   |7.206392  |Snow Dogs (2002)              |
|3392.0 |100   |7.1937494 |Garfield: The Movie (2004)    |
+-------+------+----------+------------------------------+

