In [1]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Reuse the active SparkSession if there is one; otherwise create a new one
# with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Load the charts CSV as a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
spark_charts = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/charts.csv")
)
spark_charts.columns

['title',
 'rank',
 'date',
 'artist',
 'url',
 'region',
 'chart',
 'trend',
 'streams']

In [3]:
# Preview the first five title/region pairs without truncating long titles.
spark_charts.select("title", "region").show(n=5, truncate=False)

+---------------------------+---------+
|title                      |region   |
+---------------------------+---------+
|Chantaje (feat. Maluma)    |Argentina|
|Vente Pa' Ca (feat. Maluma)|Argentina|
|Reggaetón Lento (Bailemos) |Argentina|
|Safari                     |Argentina|
|Shaky Shaky                |Argentina|
+---------------------------+---------+
only showing top 5 rows



In [4]:
# Check which DataFrame implementation spark.read.csv returned.
type(spark_charts)

pyspark.sql.dataframe.DataFrame

In [5]:
# Per-artist summary for the Australian "top200" chart: the number of distinct
# chart dates and distinct titles, sorted by distinct dates (largest first).
result = (
    spark_charts
    .where(F.col("chart") == "top200")
    .where(F.col("region") == "Australia")
    .groupBy("artist")
    .agg(
        F.countDistinct("date").alias("n_dates"),
        F.countDistinct("title").alias("n_songs"),
    )
    .orderBy(F.col("n_dates").desc())
)

In [6]:
# Trigger the computation and print the ten artists with the most chart dates.
result.show(10)

+------------+-------+-------+
|      artist|n_dates|n_songs|
+------------+-------+-------+
|  Ed Sheeran|   1811|     60|
| The Killers|   1810|      3|
|James Arthur|   1807|      9|
|   Vance Joy|   1803|     18|
|       Drake|   1793|     82|
|XXXTENTACION|   1710|     46|
| Post Malone|   1688|     41|
|Travis Scott|   1611|     26|
|      Eminem|   1589|     55|
|       Oasis|   1580|      2|
+------------+-------+-------+
only showing top 10 rows



In [8]:
# Oasis appears among the top 10 artists above with only two distinct titles.
# List those titles and how many Australian top200 chart rows each one has.
# The chart filter mirrors the aggregation above so that these counts are
# comparable with n_dates (no rank filter is applied here).
filtered_df = spark_charts.filter(
    (F.col("artist") == "Oasis")
    & (F.col("region") == "Australia")
    & (F.col("chart") == "top200")
)

counted_df = filtered_df.groupBy("title").count().orderBy(F.desc("count"))

counted_df.show(truncate=False)


+-----------------------+-----+
|title                  |count|
+-----------------------+-----+
|Wonderwall - Remastered|1519 |
|Wonderwall             |61   |
+-----------------------+-----+



In [None]:
# For local PySpark applications there is usually no need to explicitly disconnect or stop the SparkSession.
# If running in a distributed environment, you may need to stop it explicitly to avoid leaving
# idle resources allocated. In that case, call `spark.stop()` (the `SparkSession.stop()` method) on the session created above.