In [1]:
from pyspark.sql import SparkSession

# 메모리 용량 설정
MAX_MEMORY = '4g'

spark = SparkSession.builder.appName("movie-recommendation")\
.config("spark.executor.memory", MAX_MEMORY)\
.config("spark.driver.memory", MAX_MEMORY)\
.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 19:13:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/01 19:13:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
spark

In [3]:
filepath = "/home/ubuntu/working/spark-examples/data/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{filepath}", inferSchema=True, header=True)
ratings_df.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [4]:
#timestamp는 빼고 선택
import pyspark.sql.functions as F

ratings_df = ratings_df.select(F.col("userId"), F.col("movieId"), F.col("rating"))
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [5]:
# 통계 정보 확인
ratings_df.select(F.col("rating")).describe().show()

                                                                                

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423535|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [6]:
# 훈련 세트와 테스트 세트 분리
train_df, test_df = ratings_df.randomSplit([0.8,0.2], seed=42)

In [11]:
# 모델 생성
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=5,  #훈련 횟수
    regParam=0.1, # 가중치 정규화 계수
    userCol="userId", # 사용자 정보 컬럼
    itemCol="movieId", # 아이템 컬럼(여기서는 영화)
    ratingCol="rating", # 평점 컬럼 
    coldStartStrategy='drop'   # 학습하지 못한 데이터에 대한 처리. 여기서는 삭제
)

In [12]:
# 모델을 훈련하면 모델 객체가 등장
als_model = als.fit(train_df)

23/08/01 19:46:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/08/01 19:46:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/08/01 19:46:46 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [13]:
print(type(als))
print(type(als_model))

<class 'pyspark.ml.recommendation.ALS'>
<class 'pyspark.ml.recommendation.ALSModel'>


In [15]:
# 예측을 할때 사이킷 런은 predict 메소드를 사용하지만
# 스파크에서는 feature가 들어가서 label이 나오는 데이터의 변환으로 판단 - transform
predictions = als_model.transform(test_df)
predictions.show()

[Stage 85:>                                                         (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    307|   5.0|  4.097136|
|     1|   1175|   3.5|   4.02484|
|     1|   1237|   5.0| 4.0553126|
|     1|   2012|   2.5| 2.9603703|
|     1|   2692|   5.0| 4.0545907|
|     1|   3949|   5.0| 4.0692253|
|     1|   4973|   4.5|  4.226791|
|     1|   5912|   3.0| 3.8326893|
|     1|   7318|   2.0| 2.8871064|
|     1|   7323|   3.5|   3.90392|
|     1|   7327|   3.5|  3.980063|
|     1|   7365|   4.0| 3.7241123|
|     1|   7937|   3.0| 3.8105354|
|     1|   8014|   3.5| 4.0631304|
|     1|   8786|   4.0| 3.7699409|
|     1|  32591|   5.0|  3.478232|
|     3|      1|   4.0|  3.911788|
|     3|    111|   4.0| 4.0194097|
|     3|    214|   5.0| 3.9687915|
|     3|    293|   5.0| 4.3474517|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

In [16]:
# 평가(RMSE)
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="rating"
)

In [17]:
evaluator.evaluate(predictions)

                                                                                

0.8042175308573307

In [18]:
# 추천하기 1
# 각 user 에게 tip3 아이템(영화) 추천
als_model.recommendForAllUsers(3).show()

[Stage 160:>                                                        (0 + 1) / 1]

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{177209, 5.49970...|
|     3|[{177209, 5.73822...|
|     5|[{203086, 5.92206...|
|     6|[{177209, 6.03870...|
|     9|[{185645, 6.58230...|
|    12|[{183947, 5.24145...|
|    13|[{194434, 6.07480...|
|    15|[{183947, 6.57887...|
|    16|[{183947, 5.98277...|
|    17|[{127252, 5.20187...|
|    19|[{194434, 5.39699...|
|    20|[{177209, 6.34902...|
|    22|[{177209, 6.97918...|
|    26|[{177209, 5.30613...|
|    27|[{194334, 5.92979...|
|    28|[{177209, 7.05149...|
|    31|[{185645, 4.05565...|
|    34|[{185645, 5.63664...|
|    35|[{203086, 5.69996...|
|    37|[{183947, 6.08584...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [19]:
# 추천하기 2
# 각 영화에 어울리는 top3 유저를 추천
als_model.recommendForAllItems(3).show()

[Stage 189:>                                                        (0 + 1) / 1]

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|      1|[{105130, 5.60369...|
|      3|[{87426, 5.426358...|
|      5|[{143282, 5.76672...|
|      6|[{156318, 5.65070...|
|      9|[{87426, 5.298882...|
|     12|[{87426, 5.547249...|
|     13|[{87426, 5.517051...|
|     15|[{87426, 5.302298...|
|     16|[{156318, 5.56757...|
|     17|[{58248, 5.587661...|
|     19|[{87426, 5.598240...|
|     20|[{87426, 5.291905...|
|     22|[{87426, 5.471110...|
|     26|[{156318, 5.14574...|
|     27|[{87426, 5.669713...|
|     28|[{105801, 5.58554...|
|     31|[{87426, 5.495156...|
|     34|[{58248, 5.414247...|
|     35|[{32202, 4.837688...|
|     37|[{87426, 4.989361...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [20]:
# 사용자 목록을 만들어서 추천. 반드시 dataframe으로 만들어서 예측
from pyspark.sql.types import IntegerType
user_list = [276, 53,393]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF("userId")
users_df.show()

[Stage 190:>                                                        (0 + 1) / 1]

+------+
|userId|
+------+
|   276|
|    53|
|   393|
+------+



                                                                                

In [21]:
user_recommend = als_model.recommendForUserSubset(users_df, 5) # 유저 3명에 대해 5개의 영화 추천
user_recommend.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    53|[{192089, 6.55530...|
|   276|[{177209, 6.78933...|
|   393|[{127252, 6.4703}...|
+------+--------------------+



                                                                                

In [22]:
# 특정 user를 위한 추천
movies_list = user_recommend.filter("userId = 393").select("recommendations")
movies_list.show()



+--------------------+
|     recommendations|
+--------------------+
|[{127252, 6.4703}...|
+--------------------+



                                                                                

In [23]:
# 데이터 프레임으로 조회해서 가져오는 것이 아닌, 실제 데이터를 가져와야 하기 때문에 collect()를 사용하는 것이 좋다.
movies_list = movies_list.select("recommendations").collect()
movies_list

                                                                                

[Row(recommendations=[Row(movieId=127252, rating=6.470300197601318), Row(movieId=190707, rating=6.36067008972168), Row(movieId=203086, rating=6.168588161468506), Row(movieId=86288, rating=6.057657241821289), Row(movieId=99724, rating=5.908327102661133)])]

In [25]:
movies_list_first = movies_list[0].recommendations
movies_list_first

[Row(movieId=127252, rating=6.470300197601318),
 Row(movieId=190707, rating=6.36067008972168),
 Row(movieId=203086, rating=6.168588161468506),
 Row(movieId=86288, rating=6.057657241821289),
 Row(movieId=99724, rating=5.908327102661133)]

In [26]:
# 추천 테이블
recommend_df = spark.createDataFrame(movies_list_first)
recommend_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 127252|6.470300197601318|
| 190707| 6.36067008972168|
| 203086|6.168588161468506|
|  86288|6.057657241821289|
|  99724|5.908327102661133|
+-------+-----------------+



In [28]:
# 영화 메타 데이터 프레임 만들기
movie_filepath = "/home/ubuntu/working/spark-examples/data/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movie_filepath}", inferSchema=True, header=True)
movies_df.show()

                                                                                

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [35]:
recommend_df.createOrReplaceTempView("rec")
movies_df.createOrReplaceTempView("movies")

In [36]:
query="""select * from movies"""
spark.sql(query).show()


+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [37]:
query = """
select *
from movies
join rec on movies.movieId = rec.movieId
order by rating desc;
"""
spark.sql(query).show()

                                                                                

+-------+--------------------+--------------------+-------+-----------------+
|movieId|               title|              genres|movieId|           rating|
+-------+--------------------+--------------------+-------+-----------------+
| 127252|The Veil of Twili...|Crime|Fantasy|Mys...| 127252|6.470300197601318|
| 190707|         1968 (2018)|  (no genres listed)| 190707| 6.36067008972168|
| 203086|Truth and Justice...|               Drama| 203086|6.168588161468506|
|  86288|Day the Universe ...|         Documentary|  86288|6.057657241821289|
|  99724|         K-11 (2012)|         Crime|Drama|  99724|5.908327102661133|
+-------+--------------------+--------------------+-------+-----------------+



In [None]:
spark.stop()