In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('recommender_system') \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/05 13:47:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/05 13:47:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/12/05 13:47:26 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [12]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

base_path = '/Users/hyunseokjung/data/movie_dataset/'
file_name = 'ratings_small.csv'

data_schema = StructType([
    StructField('userId', IntegerType(), False),
    StructField('movieId', IntegerType(), False),
    StructField('rating', FloatType(), False),
    StructField('timestamp',IntegerType(), False)
])

df = spark.read.format("csv") \
    .option('header', True) \
    .option('inferSchema', True) \
    .schema(data_schema) \
    .load(base_path + file_name) \
    .cache()

In [13]:
ratings = df.select('userId', 'movieId', 'rating').cache()

In [15]:
ratings.show(5)

ratings.columns


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
|     1|   1129|   2.0|
|     1|   1172|   4.0|
+------+-------+------+
only showing top 5 rows



['userId', 'movieId', 'rating']

In [15]:
# spark <-> pandas 변환

# df1 = df.toPandas()

# df2 = spark.createDataFrame(df1)

In [17]:
# stringIndexer 는 원핫 인코딩 작업이다.

In [16]:
train, test = ratings.randomSplit([.7, .3], seed=42)

In [18]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(
    rank=30,
    maxIter=4,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    implicitPrefs=False
)
model = als.fit(train)
predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName='mae', labelCol='rating',
                                predictionCol='prediction')

mae = evaluator.evaluate(predictions)
print(f'MAE (Test) = {mae}')

MAE (Test) = 0.7244259116516412


In [19]:
model.recommendForAllUsers(1).show(5)



+------+-------------------+
|userId|    recommendations|
+------+-------------------+
|     1| [{1950, 3.501675}]|
|     2|[{83411, 5.050743}]|
|     3|[{1361, 4.4313684}]|
|     4| [{1948, 5.595908}]|
|     5| [{4357, 4.872115}]|
+------+-------------------+
only showing top 5 rows



                                                                                

In [20]:
model.recommendForAllItems(1).show(5)



+-------+------------------+
|movieId|   recommendations|
+-------+------------------+
|     12|[{113, 4.4283714}]|
|     26| [{113, 5.159644}]|
|     27| [{113, 4.378224}]|
|     28|[{464, 4.9823256}]|
|     31| [{113, 4.782653}]|
+-------+------------------+
only showing top 5 rows



                                                                                

---

In [25]:
# movie_metadata
import pandas as pd

file_name1 = 'movies_metadata.csv'
file_name2 = 'links.csv'

metadata = pd.read_csv(base_path + file_name1)
links = pd.read_csv(base_path + file_name2)


  metadata = pd.read_csv(base_path + file_name1)


In [26]:
# Get the Movie on metadata
def get_movie_metadata(movieId):
    metadata['imdb_id'] = metadata['imdb_id'].astype('category')
    imdb_id = links[links['movieId'] == movieId]
    imdb_id = imdb_id.imdbId.values[0]
    if len(str(imdb_id)) == 7:
        movie_rated = metadata[metadata['imdb_id'] == 'tt'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 6:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 5:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 4:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 3:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 2:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 1:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'overview', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    else:
        pass
# Get Movie List
def get_movie(df):
    movieIdIdx = df['movieId'].values.astype(int)
    df_aux_b = pd.DataFrame({'title': ['aaa'], 
                           'overview': ['bbb'], 
                           'vote_average': [1.7], 
                           'release_date': ['1999-01-01']
        })
    for i in movieIdIdx:
        df_aux_b = df_aux_b.append(get_movie_metadata(i), ignore_index=True)

    df_aux_b.drop(0, inplace=True)
    df_aux_b['release_date'] = df_aux_b['release_date'].apply(lambda x : x.split('-')[0])
    df_aux_b['release_date'] = df_aux_b['release_date'].astype(int)
    df_aux_b.rename(columns={'release_date' : 'release_year'}, inplace=True)
    return df_aux_b.reset_index(drop=True)

In [28]:
get_movie_metadata(12)

Unnamed: 0,title,overview,vote_average,release_date
0,Dracula: Dead and Loving It,When a lawyer shows up at the vampire's doorst...,5.7,1995-12-22
