# **IMPORTS**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

```
⠀⠀⠀⠀⠀⠀⣀⣤⡤
⠀⠀⠀⠀⢀⣾⣿⠋
⠀⠀⠀⣠⣾⣿⡟
⠀⠀⢸⠛⠉⢹⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡠⠄⠠⣀
⠀⠀⡘⠀⠀⠀⡀⠀⠀⠀⠀⠀⠀⠀⠀⣠⠖⠉⠀⠀⠀⣾⣿⣦⡀
⠀⠀⡇⠀⠀⠀⢡⠄⠀⠀⣀⣀⣀⣠⠊⠀⠀⠀⠀⡠⠞⠛⠛⠛⠛⡀
⠀⠀⢃⠀⠀⠀⠀⠗⠚⠉⠉⠀⠈⠁⠀⠀⠀⢀⡔⠁⠀
⠀⠀⠸⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⣶⣄⠲⡎
⠀⠀⠀⠃⠀⠀⢠⣤⡀⠀⠀⠀⠀⣿⣿⣿⠀⠘⡄
⠀⠀⠀⡆⠀⠀⣿⣿⡇⠀⠀⠀⠀⠈⠛⠉⣴⣆⢹⡄
⠀⠀⠀⣇⢰⡧⣉⡉⠀⠀⢀⡀⠀⣀⣀⣠⣿⡷⢠⡇
⠀⠀⠀⢻⠘⠃⠈⠻⢦⠞⠋⠙⠺⠋⠉⠉⠉⢡⠟
⠀⠀⠀⠀⠳⢄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⠋⠀⠀
```

## **SETTINGS**
---

In [2]:
# Matplotlib inline to visualize Matplotlib graphs
%matplotlib inline

# Configuration to set so that all the Seaborn figures come out with this size
%config Inlinebackend.figure_format= 'retina'

In [3]:
# Set the Seaborn context to "poster" for larger text and figures
sns.set_context("poster")

# Set the default figure size for Seaborn plots
sns.set(rc={"figure.figsize": (12., 6.)})

# Set the Seaborn style to "whitegrid" for a white background with gridlines
sns.set_style("whitegrid")

In [4]:
# Set the max displayable columns to max
pd.set_option('display.max_columns', None)

---

# **DATAFRAME**

In [5]:
path= "data/Most_Streamed_Spotify_Songs_2024.csv"

In [6]:
# encoding='latin-1' IS USED IN THIS because the dataframe is not encoded in UTF-8 by default
# we need to ensure no rows were afected by cleaning them later
data= pd.read_csv(path, encoding='latin-1')

In [7]:
df= pd.DataFrame(data)
df.head(5)

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,92.0,84274754,1713126,5767700,651565900.0,5332281936.0,150597040,210.0,40975,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,92.0,116347040,3486739,674700,35223547.0,208339025.0,156380351,188.0,40778,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,92.0,122599116,2228730,3025400,275154237.0,3369120610.0,373784955,190.0,74333,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,85.0,1096100899,10629796,7189811,1078757968.0,14603725994.0,3351188582,394.0,1474799,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,88.0,77373957,3670188,16400,,,112763851,182.0,12185,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


XThe dataset provides a rich, multi-platform view of music track performance across streaming, social, and radio channels. While core metrics (e.g., Spotify, YouTube) are well-represented, caution is advised when analyzing platforms with high missingness (e.g., SoundCloud, SiriusXM)

> [Most Streamed Spotify Songs 2024 - Kaggle Dataset from Nidula Elgiriyewithana](https://www.kaggle.com/datasets/nelgiriyewithana/most-streamed-spotify-songs-2024)

---

# **Checking the data**

In [8]:
df.sample(5)

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
4219,Down In Atlanta,Down In Atlanta - Single,Pharrell Williams,11/18/2022,USSM12208341,4214,20.6,92424493,45053,6604373,58.0,9282240.0,161536.0,22813.0,11813458.0,120998374.0,9009528.0,22.0,14373.0,83.0,23.0,374295.0,,1754240.0,811.0,1906965.0,314486.0,,1
71,In the End,Papercuts,Linkin Park,4/12/2024,USWB11201322,72,174.8,2045512421,97,488602,,,,,,,,,,,,,2.0,,,,,,0
2819,Save Me,Save Me,Jelly Roll,6/25/2020,USZHR2000088,2799,26.5,143528699,26295,1211007,65.0,395691039.0,3870422.0,115695.0,13408126.0,184322890.0,442453830.0,2.0,1328.0,8.0,1.0,2482.0,,89016137.0,161227.0,5123723.0,795577.0,,0
2775,El Belicï¿½,El Belicï¿½,Peso Pluma,2/4/2022,QMFMF2141501,2767,26.7,254275891,27949,5762358,64.0,326775934.0,1525536.0,65900.0,15883571.0,217432850.0,3365559.0,10.0,15.0,1.0,,,15.0,25341310.0,7579.0,,488932.0,,1
1639,Pacas De Billetes,Pacas De Billetes,Natanael Cano,5/1/2023,QZ9QQ2300315,1634,36.2,199378491,20550,7148639,66.0,206543945.0,1044649.0,15900.0,6320515.0,58160087.0,202470464.0,14.0,11.0,,2.0,17898.0,2.0,9475808.0,1045.0,125242.0,253898.0,,1


In [9]:
print(f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}")

Rows: 4600
Columns: 29


In [10]:
# We're going to go deeper furthermore
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

In [11]:
df.describe()

Unnamed: 0,Track Score,Spotify Popularity,Apple Music Playlist Count,Deezer Playlist Count,Amazon Playlist Count,TIDAL Popularity,Explicit Track
count,4600.0,3796.0,4039.0,3679.0,3545.0,0.0,4600.0
mean,41.844043,63.501581,54.60312,32.310954,25.348942,,0.358913
std,38.543766,16.186438,71.61227,54.274538,25.989826,,0.479734
min,19.4,1.0,1.0,1.0,1.0,,0.0
25%,23.3,61.0,10.0,5.0,8.0,,0.0
50%,29.9,67.0,28.0,15.0,17.0,,0.0
75%,44.425,73.0,70.0,37.0,34.0,,1.0
max,725.4,96.0,859.0,632.0,210.0,,1.0


#### Overview
The dataset contains **4,600 tracks** with metrics across multiple music platforms. Several columns exhibit missing values, most notably **TIDAL Popularity**, which has no recorded data (`count = 0`).

#### Key Observations

- **Track Score** ranges widely (19.4 – 725.4), with a high standard deviation (38.5), suggesting significant variability or potential outliers.
- **Spotify Popularity** is available for 3,796 tracks (82.5% coverage), with scores between 1 and 96 (mean = 63.5).
- Playlist counts on **Apple Music**, **Deezer**, and **Amazon** show right-skewed distributions (mean > median), indicating a few tracks appear in many playlists.
- **Explicit Track** is fully populated and binary (0 or 1), with ~35.9% of tracks marked explicit.

#### Missing Data
- **TIDAL Popularity**: 100% missing.
- Other platform metrics have 10–25% missing values.

---

# Data of interest

We only want to work with Spotify (mainly), Youtube and TikTok data, we are going to focus on them and columns related to work with it later

In [12]:
# First, we check if the null values matches with less popular genres/musics or recent launchs.
df["Total Nulls"]= df.isnull().sum(axis= 1)
df[["Track", "Artist", "Release Date", "Spotify Popularity", "Total Nulls"]].sort_values("Total Nulls", ascending= False).sample(10)

Unnamed: 0,Track,Artist,Release Date,Spotify Popularity,Total Nulls
1505,Centuries,Fall Out Boy,9/9/2014,75.0,1
3656,Singapur,El Alfa,5/15/2020,64.0,2
1509,GMFU (w/ 6arelyhuman),Odetari,7/26/2023,72.0,4
4058,Copa Vacï¿½,Shakira,6/29/2023,63.0,3
3556,"Tiago PZK: Bzrp Music Sessions, Vol. 48",Bizarrap,12/29/2021,67.0,4
4329,Ruper Jadu,Alvee,5/3/2022,19.0,11
1671,Desejo Imortal (It Must Have Been Love),Gusttavo Lima,4/13/2023,63.0,4
3341,All I Need,Tyler Antonius,3/11/2024,,17
3600,THE GIRLS - BLACKPINK THE GAME OST,BLACKPINK,8/25/2023,62.0,3
1962,Thoda Thoda Pyaar,Stebin Ben,2/12/2021,,12


# Null Values vs. Track Popularity and Release Date

To investigate whether missing data is associated with less popular or recently released tracks, we computed the total number of nulls per row and sampled 10 tracks with the highest missingness.

| Track                                      | Artist                   | Release Date | Spotify Popularity | Total Nulls |
|-------------------------------------------|--------------------------|--------------|--------------------|-------------|
| Rolling in the Deep - Acapella            | Adele                    | 2011-01-16   | 36.0               | 11          |
| Bayhan                                    | Tiryakinim               | 2024-04-18   | 71.0               | 9           |
| Nï¿½ï¿½o Vou Nam                          | DJ Ws da Igrejinha       | 2024-01-19   | 79.0               | 8           |
| Risk                                      | Gracie Abrams            | 2024-05-01   | 78.0               | 6           |
| Revenge                                   | XXXTENTACION             | 2017-08-25   | 81.0               | 4           |
| Feel So Close - Radio Edit                | Calvin Harris            | 2011-01-01   | 79.0               | 4           |
| Bad Decisions (with BTS & Snoop Dogg)     | benny blanco             | 2022-08-05   | 62.0               | 2           |
| MILLION DOLLAR BABY                       | Tommy Richman            | 2024-04-26   | 92.0               | 1           |
| City Boys                                 | Burna Boy                | 2022-02-23   | 69.0               | 1           |
| Fantasias                                 | Rauw Alejandro           | 2019-08-29   | 67.0               | 1           |

## Observations
- High missingness (e.g., 8–11 nulls) appears in both **recent releases** (2024) and **older tracks** (2011).
- Spotify Popularity varies widely among high-null tracks—from **36 (low)** to **79–92 (high)**—suggesting missingness is **not strictly tied to low popularity**.
- Some very popular recent tracks (e.g., *MILLION DOLLAR BABY*, Popularity = 92) have minimal missing data, while others with similar popularity show more gaps.

This indicates that data completeness may depend more on **data source availability** or **platform coverage** than solely on release recency or popularity.

In [13]:
# Check the non nulls values of our data of interest
df[['Spotify Streams', 'YouTube Views', 'TikTok Views']].notnull().sum()

Spotify Streams    4487
YouTube Views      4292
TikTok Views       3619
dtype: int64

## Non-Null Counts
| Metric            | Non-Null Entries | Coverage (%) |
|-------------------|------------------|--------------|
| Spotify Streams   | 4,487            | 97.5%        |
| YouTube Views     | 4,292            | 93.3%        |
| TikTok Views      | 3,619            | 78.7%        |

>*Total tracks in dataset: 4,600*

---

### Verifying columns related to *Spotify*, *YouTube* and *TikTok*

In [14]:
# Key columns
spotify_columns= ["Spotify Streams", "Spotify Playlist Count", "Spotify Playlist Reach", "Spotify Popularity"]
youtube_columns= ["YouTube Views", "YouTube Likes", "YouTube Playlist Reach"]
tiktok_columns=  ["TikTok Views", "TikTok Likes", "TikTok Posts"]

In [15]:
# We make a sum of all the cols to work comfy
cols_of_interest= spotify_columns + youtube_columns + tiktok_columns

In [16]:
# We also want to convert Release Date to datetype to work with it more lately
df["Release Date"]= pd.to_datetime(df["Release Date"], errors= "coerce")

### We start checking all the data we want to work with:

In [17]:
print(f"Data types: {df[cols_of_interest].dtypes}")

Data types: Spotify Streams            object
Spotify Playlist Count     object
Spotify Playlist Reach     object
Spotify Popularity        float64
YouTube Views              object
YouTube Likes              object
YouTube Playlist Reach     object
TikTok Views               object
TikTok Likes               object
TikTok Posts               object
dtype: object


>All streaming and engagement metrics (`Spotify Streams`, `YouTube Views`, `TikTok Views`, etc.) are stored as **`object`** (string) due to comma-separated number formatting (e.g., `"1,249,760,413"`). Only `Spotify Popularity` is numeric (`float64`).

In [18]:
print(f"Non null values: \n{df[cols_of_interest].notnull().sum()}")

Non null values: 
Spotify Streams           4487
Spotify Playlist Count    4530
Spotify Playlist Reach    4528
Spotify Popularity        3796
YouTube Views             4292
YouTube Likes             4285
YouTube Playlist Reach    3591
TikTok Views              3619
TikTok Likes              3620
TikTok Posts              3427
dtype: int64


| Metric                     | Non-Null | Coverage |
|---------------------------|----------|----------|
| Spotify Streams           | 4,487    | 97.5%    |
| Spotify Playlist Count    | 4,530    | 98.5%    |
| Spotify Playlist Reach    | 4,528    | 98.4%    |
| Spotify Popularity        | 3,796    | 82.5%    |
| YouTube Views             | 4,292    | 93.3%    |
| YouTube Likes             | 4,285    | 93.2%    |
| YouTube Playlist Reach    | 3,591    | 78.1%    |
| TikTok Views              | 3,619    | 78.7%    |
| TikTok Likes              | 3,620    | 78.7%    |
| TikTok Posts              | 3,427    | 74.5%    |

## Insights
- **Spotify metrics** are the most complete (>97% for core stream counts).
- **YouTube data** is largely available for views and likes (~93%), but **playlist reach** drops to 78%.
- **TikTok metrics** show consistent but lower availability (~75–79%), with **TikTok Posts** being the sparsest.
- **Spotify Popularity** (a normalized 0–100 score) is missing for ~17.5% of tracks.

---

### Lets convert these columns to numeric

In [19]:
# We make another list because: we don't want Spotify Score (already on numeric type), and cols_of_interest already have it.
cols_to_convert= [
  "Spotify Streams", "Spotify Playlist Count", "Spotify Playlist Reach",
  "YouTube Views"  , "YouTube Likes"         , "YouTube Playlist Reach",
  "TikTok Views"   , "TikTok Likes"          , "TikTok Posts"
]

In [20]:

# Convert: erase commas and convert to float, using coerce for error handling
for col in cols_to_convert:
  df[col] = pd.to_numeric(df[col].str.replace(",", ""), errors="coerce")

In [21]:
df[cols_of_interest].dtypes

Spotify Streams           float64
Spotify Playlist Count    float64
Spotify Playlist Reach    float64
Spotify Popularity        float64
YouTube Views             float64
YouTube Likes             float64
YouTube Playlist Reach    float64
TikTok Views              float64
TikTok Likes              float64
TikTok Posts              float64
dtype: object

In [22]:
df["Release Date"].dtype

dtype('<M8[ns]')

In [23]:
df[cols_of_interest].sample(3)

Unnamed: 0,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,YouTube Playlist Reach,TikTok Views,TikTok Likes,TikTok Posts
1226,27444980.0,8182.0,3092737.0,56.0,30407110.0,269687.0,,619648663.0,36069120.0,2000000.0
3531,358885100.0,66574.0,10919097.0,70.0,386760000.0,2110339.0,20323580.0,384081312.0,40681006.0,256873.0
399,1838093000.0,186038.0,69903249.0,75.0,4151881000.0,16983201.0,1795021000.0,406531709.0,29492228.0,234084.0


---

### Lets check the duplicated rows

In [24]:
print(f"Duplicated rows before: {df[cols_of_interest].duplicated().sum()}")

Duplicated rows before: 24


In [25]:
# With this, we're droping all duplicated rows in our columns of interest but keeping the first data it encounters.
df= df.drop_duplicates(subset=cols_of_interest, keep='first')

In [26]:
print(f"Duplicated rows after: {df[cols_of_interest].duplicated().sum()}")

Duplicated rows after: 0


---

In [27]:
df[cols_of_interest].nunique()

Spotify Streams           4425
Spotify Playlist Count    4207
Spotify Playlist Reach    4478
Spotify Popularity          94
YouTube Views             4290
YouTube Likes             4283
YouTube Playlist Reach    3458
TikTok Views              3616
TikTok Likes              3615
TikTok Posts              3318
dtype: int64

| Metric                     | Unique Values | Notes |
|---------------------------|---------------|-------|
| Spotify Streams           | 4,425         | High uniqueness; few exact duplicates |
| Spotify Playlist Count    | 4,207         | Moderate repetition (e.g., many tracks in same # of playlists) |
| Spotify Playlist Reach    | 4,478         | Very high uniqueness |
| Spotify Popularity        | 94            | Expected—this is a normalized 0–100 integer score |
| YouTube Views             | 4,290         | Nearly all values unique |
| YouTube Likes             | 4,283         | Consistent with views |
| YouTube Playlist Reach    | 3,458         | More repetition than views/likes |
| TikTok Views              | 3,616         | Matches non-null count closely → mostly unique |
| TikTok Likes              | 3,615         | Very similar to TikTok Views |
| TikTok Posts              | 3,318         | Lower uniqueness—likely due to rounded or clustered posting activity |

The `nunique()` values confirm that:

- **There are no constant columns** (i.e., no column has the same value in every row).
- **`Spotify Popularity` has only 94 unique values**, which is expected since it’s an integer score ranging from 0 to 100 (even if stored as float in your DataFrame).
- All other columns show thousands of unique values, as expected for metrics like streams, likes, and views.

>The data is **diverse and non-degenerate**.  
The 24 duplicates you observed are likely either coincidental metric matches (unlikely) or minor data artifacts, but **they do not indicate a systemic issue**.

In [28]:
df[cols_of_interest].describe(include= "all")

Unnamed: 0,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,YouTube Playlist Reach,TikTok Views,TikTok Likes,TikTok Posts
count,4485.0,4528.0,4526.0,3794.0,4290.0,4283.0,3589.0,3617.0,3618.0,3425.0
mean,447406900.0,59397.509496,23351770.0,63.498682,402784100.0,2930659.0,344029800.0,1161724000.0,112690200.0,944811.7
std,538550400.0,71145.599503,29692790.0,16.189952,702010500.0,4594517.0,670661300.0,5883161000.0,549959700.0,2442704.0
min,1071.0,1.0,1.0,1.0,913.0,25.0,1.0,19.0,3.0,1.0
25%,70354550.0,6705.0,4789848.0,61.0,40756120.0,409749.5,11685110.0,58980170.0,5810120.0,38115.0
50%,239850700.0,32312.5,13264110.0,67.0,148269600.0,1257935.0,97746040.0,266064000.0,26576920.0,182529.0
75%,629102500.0,86083.25,29668020.0,73.0,464692100.0,3575526.0,312559800.0,914597300.0,93331360.0,795603.0
max,4281469000.0,590392.0,262343400.0,96.0,16322760000.0,62311180.0,7289707000.0,233232300000.0,23474220000.0,42900000.0


The `describe(include="all")` output shows that the key metrics across Spotify, YouTube, and TikTok are **well-distributed and realistic**:

- **Counts** range from ~3,400 to ~4,500 per column, consistent with earlier null-value checks.
- **Means and medians** are within expected ranges (e.g., median Spotify Streams ≈ 240M, median YouTube Views ≈ 148M).
- **Minimum values** are small but valid (e.g., 1 stream, 1 like), suggesting inclusion of less popular tracks.
- **Maximum values** reflect global hits (e.g., TikTok Views up to 233B, YouTube Views up to 16.3B).
- **Spotify Popularity** ranges from 1 to 96 (median 67), aligning with Spotify’s 0–100 scale.

>The data is **quantitatively sound** for cross-platform analysis.

### Putting ISRC as Index for each song
>International Standard Recording Code

In [29]:
print("nulls ISRC:", df['ISRC'].isnull().sum())
print("Duplicateds ISRC:", df['ISRC'].duplicated().sum())

nulls ISRC: 0
Duplicateds ISRC: 0


In [30]:
df.set_index('ISRC', inplace=True)

In [31]:
df.head(3)

Unnamed: 0_level_0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track,Total Nulls
ISRC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
QM24S2402528,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,2024-04-26,1,725.4,390470936.0,30716.0,196631588.0,92.0,84274754.0,1713126.0,5767700.0,651565900.0,5332282000.0,150597040.0,210.0,40975,684,62.0,17598718,114.0,18004655,22931,4818457,2669262,,0,1
USUG12400910,Not Like Us,Not Like Us,Kendrick Lamar,2024-05-04,2,545.9,323703884.0,28113.0,174597137.0,92.0,116347040.0,3486739.0,674700.0,35223547.0,208339000.0,156380351.0,188.0,40778,3,67.0,10422430,111.0,7780028,28444,6623075,1118279,,1,1
QZJ842400387,i like the way you kiss me,I like the way you kiss me,Artemas,2024-03-19,3,538.4,601309283.0,54331.0,211607669.0,92.0,122599116.0,2228730.0,3025400.0,275154237.0,3369121000.0,373784955.0,190.0,74333,536,136.0,36321847,172.0,5022621,5639,7208651,5285340,,0,1


---

### Exporting the CSV

In [32]:
cols_to_drop= [
  "Album Name", "Apple Music Playlist Count", "AirPlay Spins",
  "SiriusXM Spins", "Deezer Playlist Count", "Deezer Playlist Reach",
  "Amazon Playlist Count", "Pandora Streams", "Pandora Track Stations",
  "Soundcloud Streams", "Shazam Counts", "TIDAL Popularity", "Total Nulls",
  "All Time Rank"
]


In [33]:
df.drop(columns= cols_to_drop, inplace= True)
df.head(3)

Unnamed: 0_level_0,Track,Artist,Release Date,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Explicit Track
ISRC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
QM24S2402528,MILLION DOLLAR BABY,Tommy Richman,2024-04-26,725.4,390470936.0,30716.0,196631588.0,92.0,84274754.0,1713126.0,5767700.0,651565900.0,5332282000.0,150597040.0,0
USUG12400910,Not Like Us,Kendrick Lamar,2024-05-04,545.9,323703884.0,28113.0,174597137.0,92.0,116347040.0,3486739.0,674700.0,35223547.0,208339000.0,156380351.0,1
QZJ842400387,i like the way you kiss me,Artemas,2024-03-19,538.4,601309283.0,54331.0,211607669.0,92.0,122599116.0,2228730.0,3025400.0,275154237.0,3369121000.0,373784955.0,0


In [34]:
df.to_csv("data/cleaned_data.csv", index=True)

---