In [3]:
# Instalar PySpark via pip
!pip install pyspark



In [2]:
import kagglehub

# Kaggle dataset handles used throughout the notebook.
spotify_handle = "abdulszz/spotify-most-streamed-songs"
f1_handle = "rprkh15/f1-race-and-qualifying-data"

# Download the latest version of the Spotify dataset and report the local
# directory returned by kagglehub.
path = kagglehub.dataset_download(spotify_handle)
print("Path to dataset files:", path)

# Same for the Formula 1 race/qualifying dataset.
path_f1 = kagglehub.dataset_download(f1_handle)
print("Path to dataset files:", path_f1)

Path to dataset files: /root/.cache/kagglehub/datasets/abdulszz/spotify-most-streamed-songs/versions/2
Downloading from https://www.kaggle.com/api/v1/datasets/download/rprkh15/f1-race-and-qualifying-data?dataset_version_number=45...


100%|██████████| 1.68M/1.68M [00:00<00:00, 66.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/rprkh15/f1-race-and-qualifying-data/versions/45


In [4]:
# Start (or reuse, if one is already running) the Spark session for this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("AppPySpark")
    .getOrCreate()
)

**Comparando métodos do pandas x métodos do PySpark**

Leitura de Dados:

In [5]:
import os

import pandas as pd

# Read the Spotify CSV with pandas.
# The location is derived from `path` (returned by kagglehub in the download
# cell) instead of a hard-coded "/root/.cache/.../versions/2" string, so the
# cell keeps working when the dataset version or the cache directory changes.
df_pandas = pd.read_csv(os.path.join(path, "Spotify Most Streamed Songs.csv"))

In [6]:
# Preview the first four rows of the pandas DataFrame.
df_pandas.head(n=4)

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,cover_url
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,B,Major,80,89,83,31,0,8,4,Not Found
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,C#,Major,71,61,74,7,0,10,4,https://i.scdn.co/image/ab67616d0000b2730656d5...
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,F,Major,51,32,53,17,0,31,6,https://i.scdn.co/image/ab67616d0000b273e85259...
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,A,Major,55,58,72,11,0,11,15,https://i.scdn.co/image/ab67616d0000b273e787cf...


In [7]:
import os

# Read the same Spotify CSV with PySpark.
# `spark` is the session created in the session cell above; building another
# one with a different app name would only hand back that existing session,
# so it is simply reused here.
# The file location is derived from `path` (returned by kagglehub in the
# download cell) rather than a hard-coded, version-specific cache path.
df_pyspark = spark.read.csv(
    os.path.join(path, "Spotify Most Streamed Songs.csv"),
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark sample the data to pick column types
)

In [8]:
# Column names of the pandas DataFrame, one per line.
print(*df_pandas.columns, sep="\n")

track_name
artist(s)_name
artist_count
released_year
released_month
released_day
in_spotify_playlists
in_spotify_charts
streams
in_apple_playlists
in_apple_charts
in_deezer_playlists
in_deezer_charts
in_shazam_charts
bpm
key
mode
danceability_%
valence_%
energy_%
acousticness_%
instrumentalness_%
liveness_%
speechiness_%
cover_url


Seleção de Colunas:

In [9]:
# pandas: keep only the artist, track and stream-count columns.
df_selecionado_pandas = df_pandas.loc[
    :,
    ['artist(s)_name', 'track_name', 'streams'],
]

In [10]:
# PySpark: keep only the artist, track and stream-count columns.
df_selecionado_spark = df_pyspark.select(
    ['artist(s)_name', 'track_name', 'streams']
)

Filtragem de Dados:

In [11]:
# pandas: keep rows with more than 100M streams.
# 'streams' is first coerced to a numeric dtype; values that cannot be parsed
# become NaN (and therefore never pass the comparison below).
# The converted column is written back to df_pandas on purpose: the
# aggregation cell further down averages this same column.
streams_numericos = pd.to_numeric(df_pandas['streams'], errors='coerce')
df_pandas['streams'] = streams_numericos

acima_de_100m = df_pandas['streams'] > 100000000
df_filtrado_pandas = df_pandas.loc[acima_de_100m]

In [12]:
from pyspark.sql.functions import expr

# PySpark: keep rows with more than 100M streams.
# The pandas cell coerces 'streams' to numeric explicitly; do the same here
# rather than relying on Spark's implicit coercion against an int literal.
# NOTE(review): if inferSchema typed 'streams' as string (the pandas cell
# suggests the column holds non-numeric values), an implicit cast to the
# literal's type can null out stream counts that do not fit a 32-bit int, or
# raise under ANSI mode — confirm with df_pyspark.printSchema().
# try_cast yields NULL for unparseable values, mirroring errors='coerce';
# NULL never satisfies the comparison, so those rows are excluded.
df_filtrado_spark = df_pyspark.filter(
    expr("try_cast(streams AS BIGINT)") > 100000000
)

Agregações:

In [13]:
# pandas: mean number of streams per artist, returned as a flat DataFrame
# with the grouping key as a regular column.
df_agregado_pandas = (
    df_pandas
    .groupby('artist(s)_name', as_index=False)['streams']
    .mean()
)

In [14]:
# PySpark: mean number of streams per artist, highest average first.
from pyspark.sql.functions import avg, format_number, desc

# format_number() returns a *string* ("1,234,567.89"). Sorting on that column
# would order lexicographically (e.g. "99,..." before "1,000,..."), so the
# rows are ordered on the numeric average first and only then formatted for
# display. The resulting columns are unchanged:
# 'artist(s)_name' and 'formatted_avg_streams'.
df_agregado_spark = (
    df_pyspark.groupBy("artist(s)_name")
    .agg(avg("streams").alias("artists_avg_streams"))
    .orderBy(desc("artists_avg_streams"))
    .select(
        "artist(s)_name",
        format_number("artists_avg_streams", 2).alias("formatted_avg_streams"),
    )
)

In [15]:
# Print the first 5 rows of each Spark result, in the order:
# selection, filter, aggregation.
for df_preview in (df_selecionado_spark, df_filtrado_spark, df_agregado_spark):
    df_preview.show(5)

+----------------+--------------------+---------+
|  artist(s)_name|          track_name|  streams|
+----------------+--------------------+---------+
|Latto, Jung Kook|Seven (feat. Latt...|141381703|
|     Myke Towers|                LALA|133716286|
|  Olivia Rodrigo|             vampire|140003974|
|    Taylor Swift|        Cruel Summer|800840817|
|       Bad Bunny|      WHERE SHE GOES|303236322|
+----------------+--------------------+---------+
only showing top 5 rows

+--------------------+----------------+------------+-------------+--------------+------------+--------------------+-----------------+---------+------------------+---------------+-------------------+----------------+----------------+---+---+-----+--------------+---------+--------+--------------+------------------+----------+-------------+--------------------+
|          track_name|  artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|  streams|in_apple_playlists|in

Junções (Joins):

In [17]:
import os

# pandas: left-join the 2022 and 1988 Australian GP race results on "Driver".
# File locations are derived from `path_f1` (returned by kagglehub in the
# download cell) instead of a hard-coded ".../versions/45" cache path, so the
# cell survives a new dataset version or a different cache directory.
# NOTE(review): the 2022 file sits under a doubled "2022/2022" folder while
# 1988 has a single level; kept exactly as in the original paths — confirm
# against the dataset layout.
df1 = pd.read_csv(
    os.path.join(path_f1, "2022", "2022", "Race Results", "australia_race_results.csv")
)
df2 = pd.read_csv(
    os.path.join(path_f1, "1988", "Race Results", "australia_race_results.csv")
)

# how="left" keeps every 2022 row; 1988 columns are filled only for drivers
# present in both files.
df_juncao = pd.merge(df1, df2, on="Driver", how="left")

In [18]:
import os

# PySpark: left-join the 2022 and 1988 Australian GP race results on "Driver".
# File locations are derived from `path_f1` (returned by kagglehub in the
# download cell) instead of a hard-coded ".../versions/45" cache path.
# NOTE(review): the 2022 file sits under a doubled "2022/2022" folder while
# 1988 has a single level; kept exactly as in the original paths — confirm
# against the dataset layout.
# NOTE(review): df1, df2 and df_juncao reuse the names assigned by the pandas
# join cell, so after this cell they refer to Spark DataFrames.
df1 = spark.read.csv(
    os.path.join(path_f1, "2022", "2022", "Race Results", "australia_race_results.csv"),
    header=True,
    inferSchema=True,
)
df2 = spark.read.csv(
    os.path.join(path_f1, "1988", "Race Results", "australia_race_results.csv"),
    header=True,
    inferSchema=True,
)

# Joining with on="Driver" keeps a single "Driver" column in the result.
df_juncao = df1.join(df2, on="Driver", how="left")