In [17]:
import numpy as np
import pandas as pd
# from tqdm import tqdm
# from scipy.sparse import coo_matrix
from pyspark.ml.recommendation import ALS

# import sklearn
# import random 

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [19]:
from pyspark.sql import SparkSession


MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("recommender")\
.config("spark.executor.memory", MAX_MEMORY)\
.config("spark.driver.memory", MAX_MEMORY)\
.getOrCreate()

# spark = SparkSession\
#         .builder\
#         .appName('recommender_system')\
#         .getOrCreate()

In [14]:
# 데이터 불러오기
customers = pd.read_csv("data/customers.csv")
tran = pd.read_csv("data/transactions_train.csv")
item = pd.read_csv("data/articles.csv")
rfm = pd.read_csv("data/result.csv")

In [15]:
customers.head(3)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...


In [16]:
tran['count'] = '1'


In [17]:
ALL_USERS = customers['customer_id'].unique().tolist()
ALL_ITEMS = item['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

tran['user_id'] = tran['customer_id'].map(user_map)
tran['item_id'] = tran['article_id'].map(item_map)

# del customers, customers


In [18]:
tran.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,count,user_id,item_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,1,2,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,1,2,10520
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,1,7,6387
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,1,7,46304
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,1,7,46305


In [19]:
tran = tran.drop(['t_dat','price', 'sales_channel_id'], axis=1)
tran.head(20)

Unnamed: 0,customer_id,article_id,count,user_id,item_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1,2,40179
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,1,2,10520
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,1,7,6387
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,1,7,46304
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,1,7,46305
5,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687001,1,7,46302
6,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221001,1,7,6386
7,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,688873012,1,198,47416
8,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,501323011,1,198,5944
9,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,598859003,1,198,22540


In [20]:
rfm = pd.read_csv("data/result.csv")

In [21]:
rfm.head(3)

Unnamed: 0,customer_id,article_id,price,t_dat,age,count,total,t_dat_R,count_F,total_M,Class
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023.0,0.035576,8467200.0,49.0,1,0.035576,2,1,8,2
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006.0,0.050831,21340800.0,49.0,2,0.101661,4,9,10,1
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043.0,0.050831,61862400.0,49.0,1,0.050831,10,1,9,1


In [22]:
rfm = rfm[['customer_id', 'article_id', 't_dat_R', 'count_F', 'total_M']]
sum_factor = rfm['t_dat_R'] + rfm['count_F'] + rfm['total_M']
rfm['rfm'] = sum_factor

In [23]:
rfm = rfm.drop(['t_dat_R', 'count_F', 'total_M'], axis=1)

In [24]:
rfm.head()

Unnamed: 0,customer_id,article_id,rfm
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023.0,11
1,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006.0,23
2,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043.0,20
3,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,607642008.0,8
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001.0,12


In [25]:
tran.head()

Unnamed: 0,customer_id,article_id,count,user_id,item_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,1,2,40179
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,1,2,10520
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,1,7,6387
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,1,7,46304
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,1,7,46305


In [26]:
rfm = rfm.merge(tran, on=['customer_id', 'article_id'], how='right')

In [29]:
# rfm['customer_id'] = rfm['customer_id'].astype(int)
# rfm['article_id'] = rfm['article_id'].astype(int)
rfm['rfm'].max()

30

In [28]:
rfm.to_csv('data/df2.csv', index=False)

In [30]:
rfm.head()

Unnamed: 0,customer_id,article_id,rfm,count,user_id,item_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001.0,20,1,2,40179
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023.0,8,1,2,10520
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004.0,5,1,7,6387
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003.0,5,1,7,46304
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004.0,5,1,7,46305


In [81]:
rfm.loc[rfm['article_id'] == 810792006]

Unnamed: 0,customer_id,article_id,rfm,user_id,item_id
29314265,fb52989c4dc4f66d7d737bd1887cf44acd3f243c3d1bc9...,810792006.0,29,1347002,82979
29314266,fb52989c4dc4f66d7d737bd1887cf44acd3f243c3d1bc9...,810792006.0,29,1347002,82979
29365527,176c85b9d5a82cd58ee9a1d9ac82bb402f3d8a3cf00b99...,810792006.0,18,125630,82979


22/04/27 21:36:16 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 921974 ms exceeds timeout 120000 ms
22/04/27 21:36:16 WARN SparkContext: Killing executors is not supported by current scheduler.
22/04/28 14:31:15 WARN TransportChannelHandler: Exception in connection from /172.30.1.23:49328
java.io.IOException: Operation timed out
	at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
	at sun.nio.ch.IOUtil.read(IOUtil.java:192)
	at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:379)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:253)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel.n

In [None]:
# # coo 변환
# row = df['user_id'].values
# col = df['item_id'].values
# data = np.ones(tran.shape[0])
# coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
# coo_train

In [15]:
df_spark = spark.read.csv("data/df2.csv", inferSchema=True, header=True)
# coo_train = spark.createDataFrame(coo_train)

                                                                                

In [None]:
df_spark.show()

+--------------------+------------+---+-----+-------+-------+
|         customer_id|  article_id|rfm|count|user_id|item_id|
+--------------------+------------+---+-----+-------+-------+
|000058a12d5b43e67...|6.63713001E8| 20|    1|      2|  40179|
|000058a12d5b43e67...|5.41518023E8|  8|    1|      2|  10520|
|00007d2de826758b6...|5.05221004E8|  5|    1|      7|   6387|
|00007d2de826758b6...|6.85687003E8|  5|    1|      7|  46304|
|00007d2de826758b6...|6.85687004E8|  5|    1|      7|  46305|
|00007d2de826758b6...|6.85687001E8|  5|    1|      7|  46302|
|00007d2de826758b6...|5.05221001E8|  6|    1|      7|   6386|
|00083cda041544b2f...|6.88873012E8|  8|    1|    198|  47416|
|00083cda041544b2f...|5.01323011E8| 11|    1|    198|   5944|
|00083cda041544b2f...|5.98859003E8| 11|    1|    198|  22540|
|00083cda041544b2f...| 6.8887302E8|  8|    1|    198|  47423|
|00083cda041544b2f...|6.88873011E8|  8|    1|    198|  47415|
|0008968c0d451dbc5...|5.31310002E8|  7|    1|    203|   9226|
|0008968

In [20]:
train, test = df_spark.randomSplit([0.75, 0.25])

rec = ALS(maxIter=10,
        regParam=0.01,
        userCol='user_id',
        itemCol='article_id',
        # implicitPrefs = True,
        ratingCol='rfm', # label -> predict할 때는 필요 없음!
        nonnegative=True,
        coldStartStrategy='drop')
# ALS모델 학습 -> dataframe을 넣어주기
rec_model = rec.fit(train)

# transform을 이용해 예측 -> dataframe을 넣어주기
pred_ratings = rec_model.transform(test)
pred_ratings.limit(5).toPandas()

22/05/04 14:53:12 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks (after 2 retries)
java.io.IOException: Failed to connect to /172.26.133.123:62764
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:184)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.

KeyboardInterrupt: 

22/05/04 14:55:52 ERROR RetryingBlockTransferor: Exception while beginning fetch of 1 outstanding blocks (after 1 retries)
java.io.IOException: Failed to connect to /172.26.133.123:62764
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$2.createAndStart(NettyBlockTransferService.scala:126)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.transferAllOutstanding(RetryingBlockTransferor.java:154)
	at org.apache.spark.network.shuffle.RetryingBlockTransferor.lambda$initiateRetry$0(RetryingBlockTransferor.java:184)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.

In [None]:
u = rec.recommandProducts(26, 10)

In [53]:
pred_ratings.limit(100).toPandas()

                                                                                

Unnamed: 0,customer_id,article_id,rfm,count,user_id,item_id,prediction
0,0001177027259b455f979d85a278e4b280205d4de5cce4...,820032004.0,13,1,26,85498,11.998488
1,0001177027259b455f979d85a278e4b280205d4de5cce4...,198714001.0,12,1,26,222,17.686501
2,0001177027259b455f979d85a278e4b280205d4de5cce4...,570002090.0,23,1,26,16485,21.888983
3,0001177027259b455f979d85a278e4b280205d4de5cce4...,777243002.0,14,1,26,74572,28.179527
4,0001177027259b455f979d85a278e4b280205d4de5cce4...,810172003.0,27,1,26,82842,29.498299
...,...,...,...,...,...,...,...
95,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,654590001.0,26,1,210,37697,22.525757
96,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,794053001.0,26,1,210,78437,18.112341
97,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,855198004.0,17,1,210,92788,19.389961
98,0008d644deb96bdc0ca262f161cf6d5e9a4e619bb75faa...,456163062.0,16,1,210,3511,15.602011


In [14]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [44]:
# 모델
logreg = LogisticRegression()

# n_split : 몇개로 분할할지
# shuffle : Fold를 나누기 전에 무작위로 섞을지
# random_state : 나눈 Fold를 그대로 사용할지
kfold = KFold(n_splits=5, shuffle = True, random_state=0)

# 파라미터는 (모델, Traingdata의 feature, Trainingdata의 target, 폴드수) 이다.
scores = cross_val_score(logreg , iris.data , iris.target ,cv=kfold)

# Trainingdata에 대한 성능을 나타낸다.
print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))

0.97649825

In [10]:
# Get metric for training
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='count',
                              predictionCol='prediction',
                              metricName='rmse')
# evaluate 메소드에 예측값 담겨있는 dataframe 넣어주기
rmse = evaluator.evaluate(pred_ratings)

mae_eval = RegressionEvaluator(labelCol='count',
                              predictionCol='prediction',
                              metricName='mae')
mae = mae_eval.evaluate(pred_ratings)

print("RMSE:", rmse)
print("MAE:", mae)



RMSE: 0.9840964766233072
MAE: 0.9816765504342871


                                                                                

In [12]:
my_mc_lr = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='count', metricName='f1')
my_mc_lr.evaluate(pred_ratings)

IllegalArgumentException: requirement failed: Column prediction must be of type class org.apache.spark.sql.types.DoubleType$:double but was actually class org.apache.spark.sql.types.FloatType$:float.

22/05/04 13:31:27 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 909576 ms exceeds timeout 120000 ms
22/05/04 13:31:27 WARN SparkContext: Killing executors is not supported by current scheduler.


In [46]:
# Pyspark Library #
# SQL
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean, col, split, regexp_extract, when, lit
# ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.recommendation import ALS


# stringIndexer = StringIndexer(inputCol='article_id',
#                             outputCol='article_new')
# print(stringIndexer)
# model = stringIndexer.bfit(df)
# indexed = model.transform(df)

# 숫자로 바꾼 영화제목들 중 Unique한 값들만 담아 추출하기 -> Dataframe 반환





In [56]:
pre1 = rfm
pre1.head(10)

Unnamed: 0,customer_id,article_id,rfm,count,user_id,item_id
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001.0,20,1,2,40179
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023.0,8,1,2,10520
2,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004.0,5,1,7,6387
3,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003.0,5,1,7,46304
4,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004.0,5,1,7,46305
5,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687001.0,5,1,7,46302
6,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221001.0,6,1,7,6386
7,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,688873012.0,8,1,198,47416
8,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,501323011.0,11,1,198,5944
9,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,598859003.0,11,1,198,22540


In [63]:
df = spark.read.csv('data/df2.csv', inferSchema=True, header=True)
unique_movies = df.select("article_id").distinct()

def top_movies(user_id, n):
    """
    특정 user_id가 좋아할 만한 n개의 영화 추천해주는 함수
    """
    # unique_movies 데이터프레임을 'a'라는 데이터프레임으로 alias시키기
    a = unique_movies.alias('a')

    # 특정 user_id가 본 영화들만 담은 새로운 데이터프레임 생성
    watched_movies = df.filter(df['user_id'] == user_id)\
                            .select('article_id')

    # 특정 user_id가 본 영화들을 'b'라는 데이터프레임으로 alias시키기
    b = watched_movies.alias('b')

    # unique_movies를 기준으로 watched_movies를 조인시켜서 user_id가 보지 못한 영화들 파악 가능
    total_movies = a.join(b, a['article_id'] == b['article_id'],
                        how='left')

    # b 데이터프레임의 title_new값이 결측치를 갖고 있는 행의 a.title_new를 뽑아냄으로써 user_id가 아직 못본 영화들 추출
    # col('b.title_new') => b 데이터프레임의 title_new칼럼 의미(SQL처럼 가능!)
    remaining_movies = total_movies\
                    .where(col('b.article_id').isNull())\
                    .select('a.article_id').distinct()
    # remaining_movies 데이터프레임에 특정 user_id값을 동일하게 새로운 변수로 추가해주기
    remaining_movies = remaining_movies.withColumn('user_id',
                                                lit(int(user_id)))
    # 위에서 만든 ALS 모델을 사용하여 추천 평점 예측 후 n개 만큼 view -> 
    recommender = rec_model.transform(remaining_movies)\
                        .orderBy('prediction', ascending=False)\
                        .limit(n)

    pre = recommender.toPandas()
    # pre1 = rfm.merge(pre, on=['user_id', 'article_id'], how='right')
    # pre1 = rfm.merge(pre)
    return pre
    # return recommender.show(n, truncate=False)

# userid가 1817번인 유저가 볼만한 영화 상위 5개 추천해주기
asdf = top_movies(26, 100)

# StringIndexer로 만든 것을 역으로 바꾸기 위해 IndexToString사용(영화제목을 숫자->한글제목)
    # movie_title = IndexToString(inputCol='article_new',
    #                         outputCol='article_id',
    #                         labels=model.labels) #여기서 model.labels는 StringIndexer에서 fit시켰을 때 생긴 레이블. 즉, 영화 제목들
    # # transform해서 영화제목을 숫자->한글로 변환! => dataframe으로 반환
    # final_recommendations = movie_title.transform(recommender)

                                                                                

In [89]:
asdf.head(100)

Unnamed: 0,article_id,user_id,prediction
0,924580003.0,26,118.812256
1,874928004.0,26,117.012177
2,850786001.0,26,113.568604
3,497637024.0,26,111.181839
4,828659001.0,26,105.217400
...,...,...,...
95,697904003.0,26,57.077362
96,611147001.0,26,56.755436
97,718489001.0,26,56.504921
98,887552002.0,26,56.144985


In [90]:
rfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788324 entries, 0 to 31788323
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   customer_id  object 
 1   article_id   float64
 2   rfm          int64  
 3   count        object 
 4   user_id      int64  
 5   item_id      int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 1.7+ GB


In [96]:
asdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   article_id  100 non-null    float64
 1   user_id     100 non-null    int32  
 2   prediction  100 non-null    float32
dtypes: float32(1), float64(1), int32(1)
memory usage: 1.7 KB


In [103]:
asdf = asdf.astype({'user_id':'int64'})

In [104]:
asdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   article_id  100 non-null    float64
 1   user_id     100 non-null    int64  
 2   prediction  100 non-null    float32
dtypes: float32(1), float64(1), int64(1)
memory usage: 2.1 KB


In [109]:
pre1 = rfm.merge(asdf, on=['user_id', 'article_id'], how='right')

In [110]:
pre1[pre1['user_id'] == 26]

Unnamed: 0,customer_id,article_id,rfm,count,user_id,item_id,prediction
0,,924580003.0,,,26,,118.812256
1,,874928004.0,,,26,,117.012177
2,,850786001.0,,,26,,113.568604
3,,497637024.0,,,26,,111.181839
4,,828659001.0,,,26,,105.217400
...,...,...,...,...,...,...,...
95,,697904003.0,,,26,,57.077362
96,,611147001.0,,,26,,56.755436
97,,718489001.0,,,26,,56.504921
98,,887552002.0,,,26,,56.144985


In [49]:
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
metrics = RegressionMetrics(pred_ratings)

# Root mean squared error
print("RMSE = %s" % metrics.rootMeanSquaredError)

# R-squared
print("R-squared = %s" % metrics.r2)

AttributeError: 'DataFrame' object has no attribute 'ctx'