In [1]:
import os
# Run PySpark's interactive-shell startup script found under $SPARK_HOME in this
# kernel's namespace; per the banner printed below it exposes a session as `spark`.
# NOTE(review): `execfile` is a Python 2 builtin, so this cell needs a Python 2 kernel.
execfile(os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
# Obtain a Hive-enabled session on a local master using all cores; getOrCreate()
# hands back an already-running session when one exists.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local[*]")
    .getOrCreate()
)

In [3]:
from pyspark.sql import functions as f
from pyspark.sql import Window

In [4]:
# Load the parquet dataset; the sample below shows rows of
# (userId, trackId, artistId, timestamp).
data = sparkSession.read.parquet("/data/sample264")

In [5]:
# Peek at the first ten rows to check the schema.
data.take(10)

[Row(userId=13065, trackId=944906, artistId=978428, timestamp=1501588527),
 Row(userId=101897, trackId=799685, artistId=989262, timestamp=1501555608),
 Row(userId=215049, trackId=871513, artistId=988199, timestamp=1501604269),
 Row(userId=309769, trackId=857670, artistId=987809, timestamp=1501540265),
 Row(userId=397833, trackId=903510, artistId=994595, timestamp=1501597615),
 Row(userId=501769, trackId=818149, artistId=994975, timestamp=1501577955),
 Row(userId=601353, trackId=958990, artistId=973098, timestamp=1501602467),
 Row(userId=710921, trackId=916226, artistId=972031, timestamp=1501611582),
 Row(userId=6743, trackId=801006, artistId=994339, timestamp=1501584964),
 Row(userId=152407, trackId=913509, artistId=994334, timestamp=1501571055)]

In [6]:
# Count how many rows each (artistId, trackId) pair has; the result column is `count`.
artistTrack = data.groupBy('artistId', 'trackId').count()

In [7]:
# Peek at ten of the per-pair counts.
artistTrack.take(10)

[Row(artistId=986534, trackId=829140, count=5),
 Row(artistId=995135, trackId=967720, count=25),
 Row(artistId=983387, trackId=829641, count=135),
 Row(artistId=969750, trackId=955248, count=29),
 Row(artistId=970395, trackId=929329, count=23),
 Row(artistId=988199, trackId=870619, count=82),
 Row(artistId=995788, trackId=885715, count=16),
 Row(artistId=987932, trackId=958532, count=36),
 Row(artistId=1000709, trackId=852389, count=1),
 Row(artistId=991186, trackId=824970, count=2)]

In [8]:
def norm(df, key1, key2, field, n):
    """Keep the top ``n`` rows per ``key1`` group and normalise ``field`` within it.

    Rows are ranked inside each ``key1`` partition by ``field`` descending.
    ``key2`` (when not ``None``) is used as an ascending tie-breaker so that
    the rows kept at the ``n`` cut-off are the same on every run.

    Args:
        df: input DataFrame containing the ``key1``, ``key2`` and ``field`` columns.
        key1: name of the column that defines the groups.
        key2: name of the column used to break ties on ``field``; pass ``None``
            to rank on ``field`` alone.
        field: name of the numeric column to rank by and normalise.
        n: maximum number of rows kept per ``key1`` group.

    Returns:
        A cached DataFrame with the surviving rows of ``df`` plus two columns:
        ``'sum_' + field`` (sum of ``field`` over the kept rows of the group)
        and ``'norm_' + field`` (``field`` divided by that sum).
    """
    # Secondary sort key makes row_number() deterministic when `field` ties.
    ordering = [f.col(field).desc()]
    if key2 is not None:
        ordering.append(f.col(key2).asc())
    window = Window.partitionBy(key1).orderBy(*ordering)

    topsDF = df.withColumn('row_number', f.row_number().over(window)) \
        .filter(f.col('row_number') <= n) \
        .drop('row_number')

    # groupBy already keeps `key1` in the result, so only the sum is aggregated.
    sumsDF = topsDF.groupBy(key1).agg(f.sum(f.col(field)).alias('sum_' + field))

    normalizedDF = topsDF.join(sumsDF, key1, 'inner') \
        .withColumn('norm_' + field, f.col(field) / f.col('sum_' + field)) \
        .cache()

    return normalizedDF

In [9]:
# For each artist keep at most 100 tracks ranked by count, then retain only the
# identifying columns and the normalised share.
artistTrackNorm = (
    norm(artistTrack, 'artistId', 'trackId', 'count', 100)
    .select('artistId', 'trackId', 'norm_count')
)

In [10]:
# Inspect the first thirty normalised rows.
artistTrackNorm.take(30)

[Row(artistId=968694, trackId=827354, norm_count=0.25),
 Row(artistId=968694, trackId=820606, norm_count=0.25),
 Row(artistId=968694, trackId=897139, norm_count=0.25),
 Row(artistId=968694, trackId=925696, norm_count=0.25),
 Row(artistId=969344, trackId=933592, norm_count=1.0),
 Row(artistId=969479, trackId=959227, norm_count=0.44166666666666665),
 Row(artistId=969479, trackId=819606, norm_count=0.2),
 Row(artistId=969479, trackId=929291, norm_count=0.10833333333333334),
 Row(artistId=969479, trackId=798826, norm_count=0.075),
 Row(artistId=969479, trackId=890444, norm_count=0.05),
 Row(artistId=969479, trackId=826621, norm_count=0.041666666666666664),
 Row(artistId=969479, trackId=860239, norm_count=0.025),
 Row(artistId=969479, trackId=882651, norm_count=0.016666666666666666),
 Row(artistId=969479, trackId=886945, norm_count=0.008333333333333333),
 Row(artistId=969479, trackId=944749, norm_count=0.008333333333333333),
 Row(artistId=969479, trackId=927174, norm_count=0.008333333333333

In [11]:
# No partitionBy: the ranking spans the whole DataFrame, highest norm_count first.
window = Window.orderBy(f.col('norm_count').desc())

# rank() gives tied rows the same position, so `position < 40` can let through
# more (or fewer) than 39 rows; the result is then sorted by ids and capped at 40.
artistTrackList = (
    artistTrackNorm
    .withColumn('position', f.rank().over(window))
    .where(f.col('position') < 40)
    .orderBy('artistId', 'trackId')
    .select('artistId', 'trackId')
    .take(40)
)

In [12]:
# Emit one "artistId trackId" line per selected row. Each Row is a tuple, so it
# can feed the two %s placeholders directly. The parenthesised form prints the
# same text under Python 2 and is also valid Python 3.
for val in artistTrackList:
    print("%s %s" % val)

967993 869415
967998 947428
968004 927380
968017 859321
968022 852786
968034 807671
968038 964150
968042 835935
968043 913568
968046 935077
968047 806127
968065 907906
968073 964586
968086 813446
968092 837129
968118 914441
968125 821410
968140 953008
968148 877445
968161 809793
968163 803065
968168 876119
968189 858639
968221 896937
968224 892880
968232 825536
968237 932845
968238 939177
968241 879045
968242 911250
968248 953554
968255 808494
968259 880230
968265 950148
968266 824437
968269 913243
968272 816049
968278 946743
968285 847460
968286 940006
