In [1]:
import os

# Run PySpark's interactive-shell bootstrap script in this notebook's
# namespace; per its banner it makes a SparkSession available as `spark`.
# `execfile` exists only in Python 2, so the script is read and exec'd
# explicitly, which behaves the same on Python 2 and Python 3. Compiling with
# the real path keeps tracebacks pointing at shell.py.
# Raises KeyError if SPARK_HOME is not set in the environment.
_shell_path = os.path.join(os.environ["SPARK_HOME"], 'python/pyspark/shell.py')
with open(_shell_path) as _shell_file:
    exec(compile(_shell_file.read(), _shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.12 (default, Nov 19 2016 06:48:10)
SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
# Obtain (or create) a Hive-enabled session running locally on 2 threads.
# The master URL must be written without whitespace ("local[2]"); the
# previous "local [2]" is not a parseable master string and would fail as
# soon as this cell had to create the SparkContext itself.
sparkSession = SparkSession.builder.enableHiveSupport().master("local[2]").getOrCreate()

In [3]:
# Load the two parquet datasets used throughout the notebook.
# `data` is later grouped/joined on userId, trackId and timestamp.
data = sparkSession.read.format("parquet").load("/data/sample264")
meta = sparkSession.read.format("parquet").load("/data/meta")

## Normalization can be done with the following function

In [4]:
from pyspark.sql import Window
# NOTE: `sum` here is the Spark aggregate and shadows the Python builtin for
# the rest of the notebook. `col` is imported here so that `norm` does not
# depend on a later cell having imported it.
from pyspark.sql.functions import col, row_number, sum

def norm(df, key1, key2, field, n):
    """Keep the top-`n` rows per `key1` group and normalize `field` within it.

    For every distinct value of `key1`, rows are ordered by `field`
    descending and only the first `n` are kept. `field` is then divided by
    the sum of `field` over the kept rows of the same group.

    Args:
        df: input DataFrame containing the columns `key1` and `field`.
        key1: name of the grouping column.
        key2: name of the secondary key column. When it is a column of `df`
            it is used as a tie-breaker in the ordering, so that the rows
            kept at the cut-off are reproducible; otherwise it is ignored.
        field: name of the numeric column to rank by and normalize.
        n: maximum number of rows to keep per `key1` group.

    Returns:
        A cached DataFrame with the kept rows of `df` plus two columns:
        "sum_<field>" (group total) and "norm_<field>" (field / group total).
    """
    # row_number() picks arbitrarily among equal `field` values; adding key2
    # to the ordering makes the selection deterministic.
    ordering = [col(field).desc()]
    if key2 in df.columns:
        ordering.append(col(key2))
    window = Window.partitionBy(key1).orderBy(*ordering)

    topsDF = df.withColumn("row_number", row_number().over(window)) \
        .filter(col("row_number") <= n) \
        .drop("row_number")

    # groupBy already retains the grouping column, so only the aggregate is
    # listed here (re-selecting key1 would duplicate it).
    tmpDF = topsDF.groupBy(col(key1)).agg(sum(col(field)).alias("sum_" + field))

    normalizedDF = topsDF.join(tmpDF, key1, "inner") \
        .withColumn("norm_" + field, col(field) / col("sum_" + field)) \
        .cache()

    return normalizedDF

In [5]:
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

# How many times each user played each track.
userTrack = data.groupBy("userId", "trackId").count()

# Per-user normalized play counts (top 1000 tracks per user), scaled by 0.5
# and reshaped to the generic (id, id2, norm_count) layout.
userTrackNorm = (
    norm(userTrack, "userId", "trackId", "count", 1000)
    .select(
        col("userId").alias("id"),
        col("trackId").alias("id2"),
        (col("norm_count") * 0.5).alias("norm_count")
    )
)

# Global (unpartitioned) ranking by ascending norm_count.
window = Window.orderBy("norm_count")

# Rows ranked below 50, ordered by (id, id2); the first 40 pairs are collected.
userTrackList = (
    userTrackNorm
    .withColumn("position", rank().over(window))
    .where(col("position") < 50)
    .sort("id", "id2")
    .select("id", "id2")
    .take(40)
)

# Task 1

In [8]:
from pyspark.sql.functions import col, desc, asc, collect_list

In [9]:
# Maximum allowed gap between two plays (presumably seconds, i.e. 7 minutes —
# confirm against the unit of the `timestamp` column).
SEVEN_MIN = 7 * 60

# Self-join the play log: pair up different tracks played by the same user
# whose timestamps differ by at most SEVEN_MIN in either direction, then
# count how often each ordered (id1, id2) pair occurs, most frequent first.
tracks_tracks = (
    data.alias('data_1')
    .join(
        data.alias('data_2'),
        (col('data_1.userId') == col('data_2.userId'))
        & (col('data_1.trackId') != col('data_2.trackId'))
        & (col('data_1.timestamp') - col('data_2.timestamp')).between(-SEVEN_MIN, SEVEN_MIN),
        'inner'
    )
    .select(
        col('data_1.trackId').alias('id1'),
        col('data_2.trackId').alias('id2')
    )
    .groupBy('id1', 'id2')
    .count()
    .orderBy(col('count').desc())
)

tracks_tracks.show(5)

+------+------+-----+
|   id1|   id2|count|
+------+------+-----+
|870292|939606|  253|
|939606|870292|  253|
|854531|879259|  195|
|879259|854531|  195|
|933030|871513|  159|
+------+------+-----+
only showing top 5 rows



In [10]:
# For each track id1, keep its 40 most frequent partner tracks and normalize
# the pair counts; order by weight (highest first), then by ids ascending.
results = (
    norm(tracks_tracks, 'id1', 'id2', 'count', 40)
    .select('id1', 'id2', 'norm_count')
    .orderBy(col('norm_count').desc(), col('id1').asc(), col('id2').asc())
)

results.show(5)

+------+------+----------+
|   id1|   id2|norm_count|
+------+------+----------+
|798256|923706|       1.0|
|798319|837992|       1.0|
|798322|876562|       1.0|
|798331|827364|       1.0|
|798335|840741|       1.0|
+------+------+----------+
only showing top 5 rows



In [11]:
# Print the first 40 (id1, id2) pairs, one space-separated pair per line.
# A single pre-formatted argument is passed to print so the output is
# identical whether print is the Python 2 statement or the Python 3 function.
for row in results.take(40):
    print("%s %s" % (row['id1'], row['id2']))

798256 923706
798319 837992
798322 876562
798331 827364
798335 840741
798374 816874
798375 810685
798379 812055
798380 840113
798396 817687
798398 926302
798405 867217
798443 905923
798457 918918
798460 891840
798461 940379
798470 840814
798474 963162
798477 883244
798485 955521
798505 905671
798545 949238
798550 936295
798626 845438
798691 818279
798692 898823
798702 811440
798704 937570
798725 933147
798738 894170
798745 799665
798782 956938
798801 950802
798820 890393
798833 916319
798865 962662
798931 893574
798946 946408
799012 809997
799024 935246
