In [1]:
import os
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


<img src="formula.png">

In [2]:
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.enableHiveSupport().master("local [2]").getOrCreate()

In [3]:
data = sparkSession.read.parquet("/data/sample264")
meta = sparkSession.read.parquet("/data/meta")

## Normalization could be done by next function

In [5]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum

def norm(df, key1, field, n): 
    window = Window.partitionBy(key1).orderBy(col(field).desc())
        
    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop(col("row_number")) 
        
    tmpDF = topsDF.groupBy(col(key1)).agg(col(key1), sum(col(field)).alias("sum_" + field))
   
    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedDF

In [6]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank



data_fin = data.alias("data_prev").join(data.alias("data_next"), col("data_prev.userId")== col("data_next.userId"))

#task-task graph construction with specific rules
graph = data_fin.filter((col("data_prev.trackId") != col("data_next.trackId")) &
                (col("data_next.timestamp") - col("data_prev.timestamp") > 0) &
                (col("data_next.timestamp") - col("data_prev.timestamp") <= 60*7) &
                (col("data_prev.userId") == col("data_next.userId"))).select(col("data_next.userId").alias("userId"),
                                                                             col("data_prev.trackId").alias("track_prev"),
                                                                             col("data_next.trackId").alias("track_next"))

#graph.show(3)

#  Colaborative Similarity Count (Task-Task)

In [7]:
#NEIGHBOURHOOD = (tracks from same user, that are ranged with 420 secundes)
# we make norm of start vertices track_prev, becauase E=(track_prev, track_next), before it was user_id because E=(userId, track_id)

from pyspark.sql.functions import when 

#rule for weight norm construction: the source node should be less than end node
graph=graph.withColumn("id", when((col('track_next')>=col('track_prev')),col('track_prev')).otherwise(col('track_next'))) 
graph=graph.withColumn("id2", when((col('track_prev')<=col('track_next')),col('track_next')).otherwise(col('track_prev'))) 


track_track = graph.groupBy(col("id"), col("id2")).count()



#count colaborative similarity between two nodes id=track_prev, id=track_next
trackTrackNorm = norm(track_track, "id", "count", 40) \
        .withColumn("id", col("id")) \
        .withColumn("id2", col("id2")) \
        .withColumn("norm_count", col("norm_count")*0.5) \
        .select(col("id"), col("id2"), col("norm_count"))     


#order edges by colaborative similarity count
window = Window.orderBy(col("norm_count").desc())
    
trackTrackList = trackTrackNorm.withColumn("position", rank().over(window))\
    .filter(col("position") < 50)\
    .orderBy(col("id"), col("id2"))\
    .select(col("id"), col("id2"))\
    .take(40)

In [8]:
# for val in trackTrackList:
#    print "%s %s" % val

#  Colaborative Similarity Count (User-Track)

In [9]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

userTrack = data.groupBy(col("userId"), col("trackId")).count()

userTrackNorm = norm(userTrack, "userId", "count", 1000) \
        .withColumn("id", col("userId")) \
        .withColumn("id2", col("trackId")) \
        .withColumn("norm_count", col("norm_count") * 0.5) \
        .select(col("id"), col("id2"), col("norm_count"))     

window = Window.orderBy(col("norm_count").desc())
    
userTrackList = userTrackNorm.withColumn("position", rank().over(window))\
    .filter(col("position") < 50)\
    .orderBy(col("id"), col("id2"))\
    .select(col("id"), col("id2"))\
    .take(40)

In [10]:
# for val in userTrackList:
#     print "%s %s" % val

#  Colaborative Similarity Count (User-Artist)

In [11]:

userArtist = data.groupBy(col("userId"), col("artistId")).count()

userArtistNorm = norm(userArtist, "userId", "count", 100) \
        .withColumn("id", col("userId")) \
        .withColumn("id2", col("artistId")) \
        .withColumn("norm_count", col("norm_count") * 0.5) \
        .select(col("id"), col("id2"), col("norm_count"))     

window = Window.orderBy(col("norm_count").desc())
    
userArtistList = userArtistNorm.withColumn("position", rank().over(window))\
    .filter(col("position") < 50)\
    .orderBy(col("id"), col("id2"))\
    .select(col("id"), col("id2"))\
    .take(40)

In [12]:
# for val in userArtistList:
#     print "%s %s" % val

#  Colaborative Similarity Count (Artist-Track)


In [13]:

artistTrack = data.groupBy(col("artistId"), col("trackId")).count()

artistTrackNorm = norm(artistTrack, "artistId", "count", 100) \
        .withColumn("id", col("artistId")) \
        .withColumn("id2", col("trackId")) \
        .withColumn("norm_count", col("norm_count") * 0.5) \
        .select(col("id"), col("id2"), col("norm_count"))     

window = Window.orderBy(col("norm_count").desc())
    
artistTrackList = artistTrackNorm.withColumn("position", rank().over(window))\
    .filter(col("position") < 50)\
    .orderBy(col("id"), col("id2"))\
    .select(col("id"), col("id2"))\
    .take(40)

In [14]:
# for val in artistTrackList:
#     print "%s %s" % val

In [15]:
data_meta = data.filter(data.userId==776748).join(meta, ((data.trackId==meta.Id) & (meta.type=="track")) | ((data.artistId==meta.Id) & (meta.type=="artist"))).select(col("Artist"), col("Name")).distinct()

data_meta.count()

42

In [16]:
user_track_artist = data_meta.orderBy(col("Artist"), col("Name"))\
    .take(40)

In [17]:
for val in user_track_artist:
    print "%s %s" % val

Artist: 3 Doors Down Artist: 3 Doors Down
Artist: 3 Doors Down Kryptonite
Artist: 311 Artist: 311
Artist: 311 Beautiful disaster
Artist: Blur Artist: Blur
Artist: Blur Girls and Boys
Artist: Clawfinger Artist: Clawfinger
Artist: Clawfinger Nothing Going On
Artist: Disturbed Artist: Disturbed
Artist: Disturbed The Vengeful One
Artist: Gotthard Artist: Gotthard
Artist: Gotthard Eagle
Artist: Green Day 21 Guns
Artist: Green Day Artist: Green Day
Artist: Green Day Kill The DJ
Artist: Iggy Pop Artist: Iggy Pop
Artist: Iggy Pop Sunday
Artist: Korn Artist: Korn
Artist: Korn Here To Stay
Artist: Linkin Park Artist: Linkin Park
Artist: Linkin Park In The End
Artist: Linkin Park Numb
Artist: Lordi Artist: Lordi
Artist: Lordi Hard Rock Hallelujah
Artist: Nickelback Artist: Nickelback
Artist: Nickelback She Keeps Me Up
Artist: Nomy Artist: Nomy
Artist: Nomy Cocaine
Artist: Papa Roach Artist: Papa Roach
Artist: Papa Roach Getting Away With Murder
Artist: Rise Against Artist: Rise Against
Artist: Ri

In [18]:
fullGraph = trackTrackNorm.union(userTrackNorm).union(userArtistNorm).union(artistTrackNorm)
fullGraph.show()

+------+------+--------------------+
|    id|   id2|          norm_count|
+------+------+--------------------+
|798477|883244|                 0.5|
|798692|898823|                 0.5|
|800467|855206|                 0.5|
|801701|920990|                 0.5|
|802599|908754|0.017857142857142856|
|802599|937714|0.017857142857142856|
|802599|811513|0.017857142857142856|
|802599|929402|0.017857142857142856|
|802599|924227|0.017857142857142856|
|802599|901687|0.017857142857142856|
|802599|860294|0.017857142857142856|
|802599|880642|0.017857142857142856|
|802599|920627|0.017857142857142856|
|802599|843219|0.017857142857142856|
|802599|892457|0.017857142857142856|
|802599|823001|0.017857142857142856|
|802599|899859|0.017857142857142856|
|802599|866435|0.017857142857142856|
|802599|881358|0.017857142857142856|
|802599|901328|0.017857142857142856|
+------+------+--------------------+
only showing top 20 rows



In [19]:
x = fullGraph.select("id").distinct().union(fullGraph.select("id2").distinct()).distinct()

In [20]:
x.count()

114658

In [21]:
from pyspark.sql import functions

data_meta = data.filter(data.userId==776748).join(meta, ((data.trackId==meta.Id) & (meta.type=="track")) | ((data.artistId==meta.Id) & (meta.type=="artist"))).select("userId", "Id").distinct()
cs = data_meta.withColumn("new_col",functions.lit(1))
ts = cs.select(cs.Id.alias("n_id"), "new_col")
ts.show(3)

+-------+-------+
|   n_id|new_col|
+-------+-------+
|1343667|      1|
|1225222|      1|
|1331090|      1|
+-------+-------+
only showing top 3 rows



In [22]:
from pyspark.sql.functions import col, expr, when

joined = x.join(ts,x.id==ts.n_id,"left_outer").select("Id", "new_col")

Xs = joined.withColumn("new_col", when(col("new_col").isNull(), 0).otherwise(1))
x_data = Xs.withColumn("new_col", when(col("Id")==776748, 1).otherwise(col("new_col")))

                     
x_data.show(150)

+------+-------+
|    Id|new_col|
+------+-------+
|  3175|      0|
|  5518|      0|
|  5803|      0|
|  6654|      0|
|  7253|      0|
|  7340|      0|
|  8086|      0|
| 10206|      0|
| 12046|      0|
| 17420|      0|
| 18498|      0|
| 22097|      0|
| 22373|      0|
| 24354|      0|
| 26623|      0|
| 29993|      0|
| 31035|      0|
| 31951|      0|
| 33569|      0|
| 34234|      0|
| 38220|      0|
| 40383|      0|
| 45615|      0|
| 46994|      0|
| 47217|      0|
| 50353|      0|
| 51123|      0|
| 57201|      0|
| 63964|      0|
| 67861|      0|
| 69478|      0|
| 71527|      0|
| 71995|      0|
| 75122|      0|
| 75705|      0|
| 78400|      0|
| 91446|      0|
| 95476|      0|
|101627|      0|
|104880|      0|
|117500|      0|
|120861|      0|
|124743|      0|
|126365|      0|
|127109|      0|
|128589|      0|
|135267|      0|
|135533|      0|
|137377|      0|
|137501|      0|
|141422|      0|
|144475|      0|
|145011|      0|
|146411|      0|
|146988|      0|
|154034|      

In [23]:
# data_meta = data.filter(data.userId==776748).join(meta, ((data.trackId==meta.Id) & (meta.type=="track")) | ((data.artistId==meta.Id) & (meta.type=="artist"))).select(col("Artist"), col("Name")).distinct()
# data_meta.count()

In [24]:
u = x.withColumn("new_col", when(col("Id")==776748, 1).otherwise(0))

In [25]:
u.show(30)

+------+-------+
|    id|new_col|
+------+-------+
|798477|      0|
|798692|      0|
|800467|      0|
|801701|      0|
|802599|      0|
|803868|      0|
|808110|      0|
|808445|      0|
|809289|      0|
|814446|      0|
|814618|      0|
|817317|      0|
|818440|      0|
|827148|      0|
|827209|      0|
|828034|      0|
|828366|      0|
|829131|      0|
|829292|      0|
|829705|      0|
|829847|      0|
|830062|      0|
|831434|      0|
|832475|      0|
|832553|      0|
|833685|      0|
|834439|      0|
|836204|      0|
|836522|      0|
|840315|      0|
+------+-------+
only showing top 30 rows



In [26]:
fullGraph = trackTrackNorm.union(userTrackNorm).union(userArtistNorm).union(artistTrackNorm)


In [27]:
def getWBetaNorm(df, beta=0.5):
    df_ = df.withColumn("beta", functions.lit(0.5))
    return df_.withColumn("wbeta", functions.lit(col("norm_count")*col("beta"))) 

userTrack_ = getWBetaNorm(userTrackNorm, 0.5)
userArtist_ = getWBetaNorm(userArtistNorm, 0.5)
artistTrack_ = getWBetaNorm(artistTrackNorm, 1)
trackTrack_ = getWBetaNorm(trackTrackNorm,1)


full_graph = userTrack_.union(userArtist_).union(artistTrack_).union(trackTrack_)

In [28]:

def getNextV(x_data, full_graph):
    next_v = full_graph.join(x_data, full_graph.id2==x_data.Id, "right_outer").select(full_graph.wbeta,x_data.Id,x_data.new_col).withColumn("wbeta", when(col("wbeta").isNull(), 0).otherwise(col("wbeta")*col("new_col")))
    return next_v


In [29]:
#next_u.count()
def getSumV(next_u):
    sum_v = next_u.groupBy(col("Id")).agg(sum(col("wbeta")).alias("new_col"))
    return sum_v

In [32]:
def get_next_x(sum_v):
    alfa_diff = sum_v.withColumn("new_col",col("new_col")*0.85)
    #alfa_diff.filter(alfa_diff.Id==776748).show()
    alfa_fin = alfa_diff.withColumn("new_col", when(col("Id")==776748, col("new_col")+1*0.15).otherwise(col("new_col")))
    #alfa_fin.filter(alfa_fin.Id==776748).show()
    x_prime = alfa_fin
    
    return x_prime

In [33]:
for i in range(5):
    next_v = getNextV(x_data, full_graph)
    sum_v = getSumV(next_v)
    x_data = get_next_x(sum_v)


In [35]:
x_data.show()

+-----+-------+
|   Id|new_col|
+-----+-------+
| 3175|    0.0|
| 5518|    0.0|
| 5803|    0.0|
| 6654|    0.0|
| 7253|    0.0|
| 7340|    0.0|
| 8086|    0.0|
|10206|    0.0|
|12046|    0.0|
|17420|    0.0|
|18498|    0.0|
|22097|    0.0|
|22373|    0.0|
|24354|    0.0|
|26623|    0.0|
|29993|    0.0|
|31035|    0.0|
|31951|    0.0|
|33569|    0.0|
|34234|    0.0|
+-----+-------+
only showing top 20 rows



In [52]:

final_result = x_data.join(meta, ((x_data.Id==meta.Id) & (meta.type=="track")) | ((x_data.Id==meta.Id) & (meta.type=="artist"))).select(col("Name"), col("Artist"), col("new_col")).distinct()


window = Window.orderBy(col("new_col").desc())
    
top_recommendations = final_result.withColumn("position", rank().over(window))\
    .filter(col("position") < 50).select("Name","Artist","new_col").take(40)
    



In [53]:
top_recommendations

[Row(Name=u'I Hate Everything About You', Artist=u'Artist: Three Days Grace', new_col=0.11195541729626124),
 Row(Name=u'21 Guns', Artist=u'Artist: Green Day', new_col=0.0649963900791406),
 Row(Name=u'Beautiful disaster', Artist=u'Artist: 311', new_col=0.032424681668088005),
 Row(Name=u'Come Out and Play', Artist=u'Artist: The Offspring', new_col=0.0166309089598253),
 Row(Name=u'Here To Stay', Artist=u'Artist: Korn', new_col=0.015198100030326882),
 Row(Name=u'In The End', Artist=u'Artist: Linkin Park', new_col=0.0037172034769009273),
 Row(Name=u'Girls and Boys', Artist=u'Artist: Blur', new_col=0.0037172034769009273),
 Row(Name=u'Eagle', Artist=u'Artist: Gotthard', new_col=0.0037172034769009273),
 Row(Name=u'Getting Away With Murder', Artist=u'Artist: Papa Roach', new_col=0.0037172034769009273),
 Row(Name=u'Take It Out On Me', Artist=u'Artist: Thousand Foot Krutch', new_col=0.002094037580611728),
 Row(Name=u'Wait And Bleed', Artist=u'Artist: Slipknot', new_col=0.002094037580611728),
 Row

In [55]:
for val in top_recommendations:
    print "%s %s %f" % val

I Hate Everything About You Artist: Three Days Grace 0.111955
21 Guns Artist: Green Day 0.064996
Beautiful disaster Artist: 311 0.032425
Come Out and Play Artist: The Offspring 0.016631
Here To Stay Artist: Korn 0.015198
In The End Artist: Linkin Park 0.003717
Girls and Boys Artist: Blur 0.003717
Eagle Artist: Gotthard 0.003717
Getting Away With Murder Artist: Papa Roach 0.003717
Take It Out On Me Artist: Thousand Foot Krutch 0.002094
Wait And Bleed Artist: Slipknot 0.002094
Nothing Going On Artist: Clawfinger 0.000619
Sunday Artist: Iggy Pop 0.000520
Kryptonite Artist: 3 Doors Down 0.000520
She Keeps Me Up Artist: Nickelback 0.000520
The Vengeful One Artist: Disturbed 0.000520
Sky is Over Artist: Serj Tankian 0.000520
Hard Rock Hallelujah Artist: Lordi 0.000520
Prayer Of The Refugee Artist: Rise Against 0.000520
Cocaine Artist: Nomy 0.000520
Kill The DJ Artist: Green Day 0.000355
Numb Artist: Linkin Park 0.000019
Artist: Green Day Artist: Green Day 0.000000
Artist: Linkin Park Artist: