In [None]:
# DO NOT RUN THIS CELL
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('PySparkShell').getOrCreate()
sc = spark.sparkContext
#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!#!

In [1]:
spark

In [2]:
import datetime
import pyspark.sql.functions as F

In [7]:
charts = spark.read.parquet('files/charts.parquet')
# charts = charts.toDF('id', 'title', 'position', 'date', 'countryId', 'chartName', 'movement', 'streams')
charts.createOrReplaceTempView('charts')
chartsRDD = sc.textFile('files/charts.csv').map(lambda line: line.split(','))

list(enumerate(charts.columns))

[(0, 'id'),
 (1, 'title'),
 (2, 'position'),
 (3, 'date'),
 (4, 'countryId'),
 (5, 'chartName'),
 (6, 'movement'),
 (7, 'streams')]

In [131]:
regions = spark.read.parquet('files/regions.parquet')
regions.createOrReplaceTempView('regions')
regionsRDD = sc.textFile('files/regions.csv').map(lambda line: line.split(','))

list(enumerate(regions.columns))

[(0, 'id'), (1, 'countryName')]

## Query 1

In [5]:
spark.sql('''
    SELECT sum(streams) FROM charts
    WHERE chartName="top200" AND title="Shape of You"
''').show()

+------------+
|sum(streams)|
+------------+
|  2324245979|
+------------+



In [8]:
title = 1
chartName = 5
streams = 7

(
    chartsRDD
     .filter(lambda x: x[chartName] == 'top200' and x[title] == 'Shape of You')
     .map(lambda x: int(x[streams]))
     .reduce(lambda x, y: x + y)
)

2324245979

## Query 2

In [82]:
(
    charts
     .filter(F.col('position') == 1)
     .groupBy(['chartName', 'id'])
     .count()
     .withColumn('avgCount', F.col('count') / 69)
     .groupBy('chartName')
     .agg(F.max('avgCount'))
     .show()
)

+---------+------------------+
|chartName|     max(avgCount)|
+---------+------------------+
|   top200|  54.2463768115942|
|  viral50|24.985507246376812|
+---------+------------------+



In [91]:
spark.sql('''
    SELECT chartName, first(title) title, max(avgCount) maxAvgTime
    FROM (
        SELECT chartName, first(title) title, count(*)/69 avgCount
        FROM charts
        WHERE position=1
        GROUP BY chartName, id
    )
    GROUP BY chartName
''').show()

+---------+----------------+------------------+
|chartName|           title|        maxAvgTime|
+---------+----------------+------------------+
|   top200|            Burn|  54.2463768115942|
|  viral50|A Million Dreams|24.985507246376812|
+---------+----------------+------------------+



In [63]:
title = 1
position = 2
chartName = 5

(
    chartsRDD
     .filter(lambda x: x[position] == '1')
     .map(lambda x: ((x[chartName], x[title]), 1))
     .reduceByKey(lambda x, y: x + y)
     .map(lambda x: (x[0][0], (x[0][1], x[1]/69)))
     .reduceByKey(lambda x, y: x if x[1] > y[1] else y)
     .map(lambda x: (x[0], *x[1]))
     .collect()
)

[('viral50', 'Calma - Remix', 24.985507246376812),
 ('top200', 'Shape of You', 54.2463768115942)]

## Query 3

In [24]:
(
    charts
     .filter((F.col('position') == 1) & (F.col('chartName') == 'top200'))
     .groupBy('date')
     .agg(F.sum('streams'))
     .groupBy([F.year('date'), F.month('date')])
     .agg(F.mean('sum(streams)'))
     .orderBy([F.year('date'), F.month('date')])
     .show(3)
)

+----------+-----------+-----------------+
|year(date)|month(date)|avg(sum(streams))|
+----------+-----------+-----------------+
|      2017|          1|7618611.064516129|
|      2017|          2|8876450.785714285|
|      2017|          3| 8955476.41935484|
+----------+-----------+-----------------+
only showing top 3 rows



In [27]:
chartsRDD.take(1)

[['1607',
  '+',
  '9',
  '2020-01-13T00:00:00.000+02:00',
  '46',
  'viral50',
  'NEW_ENTRY',
  '""']]

In [52]:
list(enumerate(charts.columns))

[(0, 'id'),
 (1, 'title'),
 (2, 'position'),
 (3, 'date'),
 (4, 'countryId'),
 (5, 'chartName'),
 (6, 'movement'),
 (7, 'streams')]

In [50]:
def parse_date(s):
    dt = datetime.datetime.strptime(s[:10], '%Y-%m-%d')
    return dt.date()

In [81]:
position = 2
date = 3
chartName = 5
streams = 7

(
    chartsRDD
     .filter(lambda x: x[position] == '1' and x[chartName] == 'top200')
     .map(lambda x: (parse_date(x[date]), int(x[streams])))
     .reduceByKey(lambda x, y: x + y)
     .map(lambda x: ((x[0].year, x[0].month), (x[1], 1)))
     .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
     .sortByKey()
     .map(lambda x:(x[0][0], x[0][1], x[1][0] / x[1][1]))
     .take(3)
)

[(2017, 1, 7618611.064516129),
 (2017, 2, 8876450.785714285),
 (2017, 3, 8955476.41935484)]

In [79]:
spark.sql('''
    SELECT year(date), month(date), sum(streamsDay) streams1, count(*)
    FROM (
        SELECT date, sum(streams) streamsDay
        FROM charts
        WHERE position = 1 AND chartName == "top200"
        GROUP BY date
    )
    GROUP BY year(date), month(date)
    ORDER BY year(date), month(date)
''').show(3)

+----------+-----------+---------+--------+
|year(date)|month(date)| streams1|count(1)|
+----------+-----------+---------+--------+
|      2017|          1|236176943|      31|
|      2017|          2|248540622|      28|
|      2017|          3|277619769|      31|
+----------+-----------+---------+--------+
only showing top 3 rows



## Query 4

In [216]:
spark.sql('''
    SELECT countryName, songId songId, title, maxCount
    FROM (
        SELECT countryId, songId, title, cnt, 
            max(cnt) OVER (PARTITION BY countryId) AS maxCount 
        FROM (
            SELECT countryId, id songId, first(title) title, count(*) cnt
            FROM charts
            WHERE chartName = "viral50"
            GROUP BY countryId, id
        )
    )
    LEFT JOIN regions
    ON countryId = regions.id
    WHERE cnt = maxCount
    ORDER BY countryName, title
''').show(3)

+-----------+------+--------------------+--------+
|countryName|songId|               title|maxCount|
+-----------+------+--------------------+--------+
|    Andorra| 55526|Friday (feat. Muf...|     251|
|  Argentina| 35851|        Dance Monkey|     253|
|  Australia| 35851|        Dance Monkey|     217|
+-----------+------+--------------------+--------+
only showing top 3 rows



In [219]:
songId = 0
title = 1
countryId = 4
chartName = 5

# (countryId, songId), (title, count)
counts = (
    chartsRDD
     .filter(lambda x: x[chartName] == 'viral50')
     .map(lambda x: ((x[countryId], x[songId]), (x[title], 1)))
     .reduceByKey(lambda x, y: (x[0], x[1] + y[1]))
     .map(lambda x: (x[0][0], (x[0][1], *x[1])))
)

# countryId, maxCount
max_counts = (
    counts
     .map(lambda x: (x[0], x[1][2]))
     .reduceByKey(max)
     .sortByKey()
)

# Joining now because both tables are tiny and of equal index

# countryId, (countryName, countryId)
max_counts_named = regionsRDD.join(max_counts)

q4 = (
    counts
     .join(max_counts_named)
     .filter(lambda x: x[1][0][2] == x[1][1][1])
     .map(lambda x: (x[1][1][0], x[1][0][0], x[1][0][1], x[1][1][1]))
     .sortBy(lambda x: (x[0], x[1]))
)

q4.take(3)

[('Andorra', '55526', 'Friday (feat. Mufasa;Hypeman) - Dopamine Re-Edit', 251),
 ('Argentina', '35851', 'Dance Monkey', 253),
 ('Australia', '35851', 'Dance Monkey', 217)]

## Q5