In [58]:
# $SPARK_HOME/bin/spark-submit --master local recommender_system/spark_final.py

In [59]:
import findspark
findspark.init()

In [60]:
from pyspark.sql import SparkSession

MAX_MEMORY = '5g'

spark = SparkSession.builder \
    .appName('recommender_system1') \
    .config('spark.driver.memory', "5g") \
    .config('spark.some.config.option', 'some-value') \
    .getOrCreate()


In [61]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

base_path = '/Users/hyunseokjung/data/movie_dataset/'

ratings = spark.read.csv(base_path+'ratings.csv', header=True, inferSchema=True).repartition(5).cache()
metadata = spark.read.csv(base_path+'movies_metadata.csv', header=True, inferSchema=True).repartition(5).cache()
links = spark.read.csv(base_path+'links.csv', header=True, inferSchema=True).repartition(5).cache()

                                                                                

22/12/11 15:19:36 WARN CacheManager: Asked to cache already cached data.
22/12/11 15:19:36 WARN CacheManager: Asked to cache already cached data.
22/12/11 15:19:36 WARN CacheManager: Asked to cache already cached data.


In [62]:
metadata = metadata.selectExpr('imdb_id', 'title', 'vote_average', 'release_date').cache()

22/12/11 15:19:36 WARN CacheManager: Asked to cache already cached data.


In [63]:
ratings.count()

metadata.count()

45572

In [64]:
ratings = ratings.select('userId', 'movieId', 'rating').cache()
ratings.show(5)

22/12/11 15:19:37 WARN CacheManager: Asked to cache already cached data.
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|  1906|   5349|   3.5|
| 16458|    778|   4.0|
| 12642|  60684|   3.5|
| 25219|   2628|   3.5|
| 13210|  26810|   5.0|
+------+-------+------+
only showing top 5 rows



In [65]:
metadata = metadata.select('imdb_id', 'title', 'vote_average', 'release_date').cache()
metadata.show(5)

+---------+--------------------+------------+------------+
|  imdb_id|               title|vote_average|release_date|
+---------+--------------------+------------+------------+
|tt0093818|          Radio Days|         7.0|  1987-01-30|
|tt0144969|           Home Page|         0.0|  1998-10-14|
|tt0168987|Better Than Choco...|         6.4|  1999-02-14|
|tt0084503|Pink Floyd: The Wall|         7.7|  1982-07-14|
|tt0238015|All Access: Front...|         0.0|  2001-05-20|
+---------+--------------------+------------+------------+
only showing top 5 rows



In [66]:
train, test = ratings.randomSplit([.7, .3], seed=42)

In [67]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(
    rank=30,
    maxIter=4,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    implicitPrefs=False
)
model = als.fit(train)
predictions = model.transform(test)

evaluator = RegressionEvaluator(metricName='mae', labelCol='rating',
                                predictionCol='prediction')

mae = evaluator.evaluate(predictions)
print(f'MAE (Test) = {mae}')



MAE (Test) = 0.6526043524745228


                                                                                

In [68]:
test.schema

StructType([StructField('userId', IntegerType(), True), StructField('movieId', IntegerType(), True), StructField('rating', DoubleType(), True)])

In [69]:
# model.recommendForAllUsers(3) \
#     .selectExpr("userId", "explode(recommendations)") \
#     .show(9)



+------+-------------------+
|userId|                col|
+------+-------------------+
|    26|{127313, 5.6663523}|
|    26| {126086, 5.494779}|
|    26| {167770, 5.373729}|
|    27| {164937, 5.652737}|
|    27|{161662, 5.6020017}|
|    27|{159467, 5.5251255}|
|    28|{164937, 5.3787107}|
|    28| {151681, 5.287017}|
|    28|{151615, 5.2290335}|
+------+-------------------+
only showing top 9 rows



                                                                                

In [70]:
user_suggest = test.filter(test['userId'] == 2).select(['movieId', 'userId'])
user_offer = model.transform(user_suggest)
user_offer.orderBy('prediction', ascending=False).show()



+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|    260|     2| 3.9194903|
|     32|     2| 3.3836281|
|    339|     2|  3.233927|
|    141|     2|  3.142478|
|    648|     2| 3.0775516|
|    377|     2| 2.9468791|
|      5|     2| 2.7816482|
|    788|     2| 2.4325862|
+-------+------+----------+



                                                                                

In [71]:
user_id = int(input('INPUT USER_ID'))
print(user_id)

50


In [72]:
user_suggest = test.filter(test['userId'] == user_id).select(['movieId', 'userId'])
user_offer = model.transform(user_suggest)
user_offer.orderBy('prediction', ascending=False).show(7)



+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   1221|    50|  3.943585|
|    318|    50|  3.808083|
| 142488|    50| 3.6575289|
|   2959|    50| 3.4831002|
| 122886|    50|  3.191975|
+-------+------+----------+



                                                                                

In [73]:
user_offer_order = user_offer.orderBy('prediction', ascending=False).toPandas()

                                                                                

In [74]:
user_offer_order['movieId'][0]
len(user_offer_order['movieId'])

5

AttributeError: 'DataFrame' object has no attribute 'toPandas'

In [126]:
links[links['movieId'] == 1221]

Unnamed: 0,movieId,imdbId,tmdbId
19273,1221,71562,240.0


In [125]:
imdb_id = metadata['imdb_id']
imdb_id

0        0        0        0        0        0        0...
1        0        0        0        0        0        0...
2        0        0        0        0        0        0...
3        0        0        0        0        0        0...
4        0        0        0        0        0        0...
                               ...                        
45567    0        0        0        0        0        0...
45568    0        0        0        0        0        0...
45569    0        0        0        0        0        0...
45570    0        0        0        0        0        0...
45571    0        0        0        0        0        0...
Name: imdb_id, Length: 45572, dtype: object

In [121]:
import pandas as pd

# Get the Movie on metadata
def get_movie_metadata(movieId, metadata, links):
    
    # metadata = metadata.toPandas()
    # links = links.toPandas()
    # metadata['imdb_id'] = metadata['imdb_id'].astype('category')
    metadata['imdb_id'] = str(metadata['imdb_id'])
    imdb_id = links[links['movieId'] == movieId]
    imdb_id = imdb_id.imdbId.values[0]
    if len(str(imdb_id)) == 7:
        movie_rated = metadata[metadata['imdb_id'] == 'tt'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 6:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 5:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 4:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 3:
        movie_rated = metadata[metadata['imdb_id'] == 'tt0000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 2:
        movie_rated = metadata[metadata['imdb_id'] == 'tt00000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    elif len(str(imdb_id)) == 1:
        movie_rated = metadata[metadata['imdb_id'] == 'tt000000'+imdb_id.astype(str)]
        df = movie_rated.loc[:,['title', 'vote_average', 'release_date']]
        return df.reset_index(drop=True)
    else:
        pass
# Get Movie List
def get_movie(df):
    movieIdIdx = df['movieId'].values.astype(int)
    df_aux_b = pd.DataFrame({'title': ['aaa'], 
                           'overview': ['bbb'], 
                           'vote_average': [1.7], 
                           'release_date': ['1999-01-01']
        })
    for i in movieIdIdx:
        df_aux_b = df_aux_b.append(get_movie_metadata(i, metadata, links), ignore_index=True)

    df_aux_b.drop(0, inplace=True)
    df_aux_b['release_date'] = df_aux_b['release_date'].apply(lambda x : x.split('-')[0])
    df_aux_b['release_date'] = df_aux_b['release_date'].astype(int)
    df_aux_b.rename(columns={'release_date' : 'release_year'}, inplace=True)
    return df_aux_b.reset_index(drop=True)

In [124]:
for movieId in user_offer_order['movieId']:
    print(get_movie_metadata(movieId, metadata, links))

Empty DataFrame
Columns: [title, vote_average, release_date]
Index: []
Empty DataFrame
Columns: [title, vote_average, release_date]
Index: []
Empty DataFrame
Columns: [title, vote_average, release_date]
Index: []
Empty DataFrame
Columns: [title, vote_average, release_date]
Index: []
Empty DataFrame
Columns: [title, vote_average, release_date]
Index: []
