#### Spark Context 생성 및 라이브러리 로드

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its SparkContext for the RDD-based loading code in the cells that follow.
spark = (
    SparkSession.builder
    .appName("Ch03")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
from pyspark.sql.types import Row

#### user-artist-count 데이터 로드

In [4]:
# Load the user/artist/play-count file and turn it into a DataFrame.
# The path is assembled with os.path.join so the notebook is not tied to the
# Windows "\\" separator: on Windows the resulting string is unchanged, on
# POSIX systems it becomes a valid relative path.
import os

rawUserArtistData = sc.textFile(
    os.path.join("profiledata_06-May-2005", "user_artist_data.txt")
)

# Each line is parsed as three space-separated integers: user, artist, count.
userArtistDF = (
    rawUserArtistData
    .map(lambda line: line.split(" "))
    .map(lambda x: Row(user=int(x[0]), artist=int(x[1]), count=int(x[2])))
    .toDF()
)
userArtistDF.take(2)

[Row(artist=1, count=55, user=1000002),
 Row(artist=1000006, count=33, user=1000002)]

In [5]:
# Print the largest and then the smallest ID seen, first for users and then
# for artists, as a quick sanity check on the ID ranges.
for _column in ("user", "artist"):
    _ids = userArtistDF.select(_column).rdd
    print(_ids.max())
    print(_ids.min())

Row(user=2443548)
Row(user=90)
Row(artist=10794401)
Row(artist=1)


#### artist-name 데이터 로드

In [6]:
# Load the artist-id -> artist-name lookup file into a DataFrame.
# The path is assembled with os.path.join so the notebook is not tied to the
# Windows "\\" separator: on Windows the resulting string is unchanged, on
# POSIX systems it becomes a valid relative path.
import os

rawArtistData = sc.textFile(
    os.path.join("profiledata_06-May-2005", "artist_data.txt")
)

artistDataDF = (
    rawArtistData
    # Split on the first tab only, so everything after it stays in the name.
    .map(lambda line: line.split("\t", 1))
    # Drop lines that have no tab or whose ID field is not purely digits.
    .filter(lambda x: len(x) == 2)
    .filter(lambda x: x[0].isdigit())
    .map(lambda x: Row(artist=int(x[0]), name=x[1]))
    .toDF()
)
artistDataDF.take(2)

[Row(artist=1134999, name='06Crazy Life'),
 Row(artist=6821360, name='Pang Nakarin')]

#### artist alias 데이터 로드

In [7]:
# Load the artist alias file: each kept line yields an `artist` ID (first
# field) and the `alias` ID (second field) it should be replaced with.
# The path is assembled with os.path.join so the notebook is not tied to the
# Windows "\\" separator: on Windows the resulting string is unchanged, on
# POSIX systems it becomes a valid relative path.
import os

rawArtistAlias = sc.textFile(
    os.path.join("profiledata_06-May-2005", "artist_alias.txt")
)

artistAliasDF = (
    rawArtistAlias
    .map(lambda line: line.split("\t"))
    # Keep only well-formed lines: exactly two tab-separated numeric fields.
    .filter(lambda x: len(x) == 2)
    .filter(lambda x: x[0].isdigit())
    .filter(lambda x: x[1].isdigit())
    .map(lambda x: Row(artist=int(x[0]), alias=int(x[1])))
    .toDF()
)
artistAliasDF.take(2)

[Row(alias=1000311, artist=1092764), Row(alias=1000557, artist=1095122)]

#### user-artist data와 alias data를 결합하여 train dataset 생성

In [8]:
def _resolve_alias(row):
    """Return (user, artist, count), using the alias ID when one exists.

    After the left outer join, `alias` is None for artists that have no entry
    in the alias table; those rows keep their original artist ID.
    """
    artist_id = row['artist'] if row['alias'] is None else row['alias']
    return (row['user'], artist_id, row['count'])


# Left-join the play counts with the alias table and emit plain
# (user, artist, count) tuples for ALS. The values are already integers
# (they were parsed with int() when the DataFrames were built), so no extra
# casting pass is needed.
trainData = (
    userArtistDF.join(artistAliasDF, "artist", "left_outer")
    .rdd
    .map(_resolve_alias)
)
trainData.take(2)

[(1000159, 26, 1), (1000320, 26, 1)]

In [9]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

#### Build the recommendation model using Alternating Least Squares

In [10]:
# Hyperparameters for the implicit-feedback ALS run; the seed is fixed so
# repeated runs start from the same initialisation.
rank, iterations, lbda, seed = 10, 5, 0.01, 42

# Train on the (user, artist, count) tuples built above.
model = ALS.trainImplicit(
    trainData,
    rank=rank,
    iterations=iterations,
    lambda_=lbda,
    alpha=1.0,
    seed=seed,
)

#### Feature extraction 확인

In [11]:
# Peek at five (user ID, latent-factor array) pairs from the trained model.
model.userFeatures().take(5)

[(7400,
  array('d', [-0.5253109335899353, -3.717756509780884, 0.5567010641098022, -0.6934528350830078, 1.8062853813171387, -0.573974609375, 2.7792887687683105, 0.9534238576889038, -2.383761405944824, -0.7505308985710144])),
 (8500,
  array('d', [-0.07504619657993317, -0.010923611931502819, 0.036028992384672165, -0.002535992069169879, 0.03441774845123291, 0.05178917571902275, 0.023467540740966797, -0.008269968442618847, 0.006576730404049158, -0.012308135628700256])),
 (14900,
  array('d', [-0.9122973084449768, -3.81801438331604, 0.007389751262962818, -0.1773201823234558, 2.0722339153289795, 0.6981947422027588, 1.440445065498352, 1.1119272708892822, -0.4951328635215759, -0.9021040797233582])),
 (15200,
  array('d', [-0.19351011514663696, -3.4464235305786133, 1.0366121530532837, -0.7943283319473267, 1.919632077217102, 1.3739653825759888, 2.6153273582458496, 1.8604228496551514, -0.963604211807251, 0.21567213535308838])),
 (1000100,
  array('d', [1.3315505981445312, -0.2838905453681946, 1.

In [12]:
# Peek at five (product ID, latent-factor array) pairs; products are artists here.
model.productFeatures().take(5)

[(100,
  array('d', [-0.08007114380598068, -0.13515709340572357, -0.02065253257751465, -0.008643229492008686, 0.03264828398823738, 0.017810245975852013, 0.06439027190208435, 0.06781982630491257, -0.08726055175065994, -0.10340531915426254])),
 (200,
  array('d', [-0.02947927825152874, -0.042818766087293625, -0.0011996664106845856, -0.022181829437613487, -0.0069001102820038795, -0.0103484271094203, -0.008582467213273048, 0.035741522908210754, 0.011813233606517315, -0.0014511671615764499])),
 (300,
  array('d', [-0.04312776401638985, -0.09535614401102066, 0.023602696135640144, -0.02801547758281231, 0.13503451645374298, 0.08189631998538971, 0.03841058909893036, 0.010272203013300896, -0.13086190819740295, -0.0037975837476551533])),
 (500,
  array('d', [-0.031643643975257874, 0.005091628059744835, -0.026599954813718796, 0.0035310471430420876, 0.03397339954972267, -0.03464490920305252, 0.023981163278222084, -0.028873350471258163, -0.017264682799577713, 0.011411288753151894])),
 (600,
  array(

#### Spot Checking Recommendations

In [13]:
userID = 2093760
# Collect the artist IDs that appear in the training data for `userID`,
# then display the matching rows of the artist-name table.
existingArtistIDs = (
    trainData
    .filter(lambda rating: rating[0] == userID)
    .map(lambda rating: rating[1])
    .collect()
)
artistDataDF.filter(artistDataDF["artist"].isin(existingArtistIDs)).show()

+-------+---------------+
| artist|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



#### 특정 user가 좋아할 것으로 예상되는 artist 예측
model.recommendProducts(user, limit)

In [14]:
# Ask the model for five recommended products (artists) for `userID`.
model.recommendProducts(userID, 5)

[Rating(user=2093760, product=2814, rating=0.029410752287782803),
 Rating(user=2093760, product=1001819, rating=0.02901680416033587),
 Rating(user=2093760, product=1300642, rating=0.02860463611174408),
 Rating(user=2093760, product=4605, rating=0.0283993331273249),
 Rating(user=2093760, product=1811, rating=0.028051171541554305)]

In [15]:
# Pull the product (artist) ID out of each recommendation and show the
# matching rows of the artist-name table.
recommendedArtistIDs = [rec.product for rec in model.recommendProducts(userID, 5)]
artistDataDF.filter(artistDataDF["artist"].isin(recommendedArtistIDs)).show()

+-------+----------+
| artist|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|   1811|   Dr. Dre|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+



#### 특정 artist를 좋아할 것으로 예상되는 user 예측
model.recommendUsers(artist, limit)

In [15]:
# Ask the model for five recommended users for the artist with ID 1.
model.recommendUsers(1, 5)

[Rating(user=1037240, product=1, rating=1.3606984320279356),
 Rating(user=1054417, product=1, rating=1.336792097354692),
 Rating(user=2167160, product=1, rating=1.3278224025483398),
 Rating(user=1005353, product=1, rating=1.3089582300854892),
 Rating(user=1001440, product=1, rating=1.3073439566057818)]

#### 특정 user의 특정 artist에 대한 선호도 점수 예측 (implicit ALS 모델은 count 자체가 아니라 선호도 점수를 예측)
model.predict(user, artist)

In [16]:
# Model score for the single pair (user 1000100, artist 1).
model.predict(1000100, 1)

0.20708320290743037

#### model의 예측 결과에 대한 Mean Squared Error 계산

In [17]:
def _keyed_by_pair(r):
    """Re-shape a (user, artist, value) record into ((user, artist), value)."""
    return ((r[0], r[1]), r[2])


# Score every (user, artist) pair that appears in the training data.
testData = trainData.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testData).map(_keyed_by_pair)

# Join observed values with predicted ones on the (user, artist) key and
# average the squared differences.
# NOTE(review): the model was trained with trainImplicit, so the predictions
# may not be on the same scale as the raw counts -- confirm that this MSE is
# the intended metric.
ratesAndPreds = trainData.map(_keyed_by_pair).join(predictions)
MSE = ratesAndPreds.values().map(lambda pair: (pair[0] - pair[1]) ** 2).mean()
print("Mean Squared Error = " + str(MSE))

Mean Squared Error = 24103.494386597584


#### model 저장 (HDFS only)