In [None]:
# 출처 테스트 : https://towardsdatascience.com/build-recommendation-system-with-pyspark-using-alternating-least-squares-als-matrix-factorisation-ebe1ad2e7679

In [12]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('recommender_system1') \
    .getOrCreate()


In [8]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

base_path = '/Users/hyunseokjung/data/movie_dataset/'

ratings = spark.read.csv(base_path+'ratings.csv', header=True, inferSchema=True)
movies = spark.read.csv(base_path+'movies_metadata.csv', header=True, inferSchema=True)
links = spark.read.csv(base_path+'links.csv', header=True, inferSchema=True)

                                                                                

In [6]:
ratings = ratings.select('userId', 'movieId', 'rating').cache()
ratings.show(5)

[Stage 6:>                                                          (0 + 1) / 1]

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    110|   1.0|
|     1|    147|   4.5|
|     1|    858|   5.0|
|     1|   1221|   5.0|
|     1|   1246|   5.0|
+------+-------+------+
only showing top 5 rows



                                                                                

In [11]:
movies = movies.select('imdb_id', 'title', 'vote_average', 'release_date').cache()
movies.show(5)

+---------+--------------------+--------------------+--------------------+
|  imdb_id|               title|        vote_average|        release_date|
+---------+--------------------+--------------------+--------------------+
|tt0114709|           Toy Story|                 7.7|          1995-10-30|
|tt0113497|             Jumanji|                 6.9|          1995-12-15|
|tt0113228|    Grumpier Old Men|                 6.5|          1995-12-22|
|tt0114885|[{'iso_639_1': 'e...|Friends are the p...|/16XOMpEaLWkrcPqS...|
|tt0113041|Father of the Bri...|                 5.7|          1995-02-10|
+---------+--------------------+--------------------+--------------------+
only showing top 5 rows



22/12/06 00:26:18 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1004933 ms exceeds timeout 120000 ms
22/12/06 00:26:18 WARN SparkContext: Killing executors is not supported by current scheduler.


In [16]:
train, test = ratings.randomSplit([.7, .3], seed=42)

In [18]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(
    rank=30,
    maxIter=4,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    implicitPrefs=False
)
model = als.fit(train)
predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName='mae', labelCol='rating',
                                predictionCol='prediction')

mae = evaluator.evaluate(predictions)
print(f'MAE (Test) = {mae}')

MAE (Test) = 0.7244259116516412


In [19]:
model.recommendForAllUsers(1).show(5)



+------+-------------------+
|userId|    recommendations|
+------+-------------------+
|     1| [{1950, 3.501675}]|
|     2|[{83411, 5.050743}]|
|     3|[{1361, 4.4313684}]|
|     4| [{1948, 5.595908}]|
|     5| [{4357, 4.872115}]|
+------+-------------------+
only showing top 5 rows



                                                                                

In [20]:
model.recommendForAllItems(1).show(5)



+-------+------------------+
|movieId|   recommendations|
+-------+------------------+
|     12|[{113, 4.4283714}]|
|     26| [{113, 5.159644}]|
|     27| [{113, 4.378224}]|
|     28|[{464, 4.9823256}]|
|     31| [{113, 4.782653}]|
+-------+------------------+
only showing top 5 rows



                                                                                

In [26]:
# Get the Movie on metadata
def get_movie_metadata(movieId):
    metadata['imdb_id'] = metadata['imdb_id'].astype('category')
    imdb_id = links[links['movieId'] == movieId]
    imdb_id = imdb_id.imdbId.values[0]
    if len(str(imdb_id)) == 7:
        movie_rated = metadata[metadata['imdb_id'] == 'tt'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 6:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 5:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 4:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 3:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 2:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 1:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    else:
        pass
# Get Movie List
def get_movie(df):
    movieIdIdx = df['movieId'].values.astype(int)
    df_aux_b = pd.DataFrame({'title': ['aaa'], 
                           'overview': ['bbb'], 
                           'vote_average': [1.7], 
                           'release_date': ['1999-01-01']
        })
    for i in movieIdIdx:
        df_aux_b = df_aux_b.append(get_movie_metadata(i), ignore_index=True)

    df_aux_b.drop(0, inplace=True)
    df_aux_b['release_date'] = df_aux_b['release_date'].apply(lambda x : x.split('-')[0])
    df_aux_b['release_date'] = df_aux_b['release_date'].astype(int)
    df_aux_b.rename(columns={'release_date' : 'release_year'}, inplace=True)
    return df_aux_b.reset_index(drop=True)

In [28]:
get_movie_metadata(12)

Unnamed: 0,title,overview,vote_average,release_date
0,Dracula: Dead and Loving It,When a lawyer shows up at the vampire's doorst...,5.7,1995-12-22
