# Section B - Data Preprocessing with Diagnostics
This notebook cleans the Spotify/YouTube track dataset and runs diagnostics on it (missing values, per-column cardinality, and correlations with the target) to inform and improve downstream model performance.

In [1]:
# Load libraries
from pathlib import Path

import numpy as np
import pandas as pd

# Load data
# The raw CSV location is kept in one variable so it is easy to repoint.
raw_csv_path = "../data/Spotify_Youtube.csv"  # Update path if needed
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0.1,Unnamed: 0,Artist,Url_spotify,Track,Album,Album_type,Uri,Danceability,Energy,Key,...,Url_youtube,Title,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream
0,0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,...,https://www.youtube.com/watch?v=HyHNuVaZJ-k,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...,True,True,1040235000.0
1,1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,...,https://www.youtube.com/watch?v=yYDmaexVHic,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...,True,True,310083700.0
2,2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,spotify:track:64dLd6rVqDLtkXFYrEUHIU,0.695,0.923,1.0,...,https://www.youtube.com/watch?v=qJa-VFwPpYA,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...,True,True,63063470.0
3,3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,Plastic Beach,album,spotify:track:0q6LuUqGLUiCPP1cbdwFs3,0.689,0.739,2.0,...,https://www.youtube.com/watch?v=04mfKJWDSzI,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...,True,True,434663600.0
4,4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,Gorillaz,album,spotify:track:7yMiX7n9SBvadzox8T5jzT,0.663,0.694,10.0,...,https://www.youtube.com/watch?v=1V_xRb0x9aw,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...,True,True,617259700.0


In [2]:
# Check shape and dtypes
# Compute the per-column dtypes once; reuse them for the tally and the cell output.
column_dtypes = df.dtypes
dtype_tally = column_dtypes.value_counts()

print("Dataset shape:", df.shape)
print("\nColumn types:\n", dtype_tally)

# Last expression: full column -> dtype listing.
column_dtypes

Dataset shape: (20718, 28)

Column types:
 float64    15
object     12
int64       1
Name: count, dtype: int64


Unnamed: 0            int64
Artist               object
Url_spotify          object
Track                object
Album                object
Album_type           object
Uri                  object
Danceability        float64
Energy              float64
Key                 float64
Loudness            float64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Duration_ms         float64
Url_youtube          object
Title                object
Channel              object
Views               float64
Likes               float64
Comments            float64
Description          object
Licensed             object
official_video       object
Stream              float64
dtype: object

In [3]:
# Check for missing values
# Count NaNs per column, keep only columns that have any, largest count first.
na_counts = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

print("Missing values (top):\n", na_counts.head(10))
print("\nTotal rows:", len(df))

Missing values (top):
 Description       876
Stream            576
Comments          569
Likes             541
Url_youtube       470
official_video    470
Licensed          470
Views             470
Channel           470
Title             470
dtype: int64

Total rows: 20718


In [4]:
# Remove songs missing either Spotify or YouTube
# A row is kept only when both link columns are present.
required_link_cols = ['Url_youtube', 'Url_spotify']

initial_len = len(df)
df = df.dropna(subset=required_link_cols)

# Report how many rows the filter discarded.
print(f"Removed {initial_len - len(df)} rows with missing Youtube or Spotify")

Removed 470 rows with missing Youtube or Spotify


In [5]:
# Create target and preserve Album_type for exploration
# Binary target: 1 for singles, 0 for the other known album types.
ALBUM_TYPE_TO_TARGET = {'single': 1, 'album': 0, 'compilation': 0}
target = df['Album_type'].map(ALBUM_TYPE_TO_TARGET)

# .map() yields NaN for any Album_type that is missing or not listed above, and
# value_counts() below ignores NaN by default, so an unmapped label would otherwise
# slip through unnoticed. Fail loudly instead of producing a partially-empty target.
unmapped = df.loc[target.isna(), 'Album_type']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} rows have an Album_type that cannot be mapped to a target: "
        f"{unmapped.unique().tolist()}"
    )

# assign() returns a new frame (no in-place write into the result of the earlier
# dropna) and makes the cell safe to re-run; int64 keeps the label dtype explicit.
df = df.assign(Target=target.astype('int64'))
print("Target distribution:\n", df['Target'].value_counts(normalize=True))

Target distribution:
 Target
0    0.759976
1    0.240024
Name: proportion, dtype: float64


In [6]:
# Examine number of unique values per column
# Distinct (non-null) values per column, ordered from fewest to most.
distinct_per_column = df.nunique()
uniques = distinct_per_column.sort_values()

print("\nUnique values per column:\n", uniques)


Unique values per column:
 Target                  2
official_video          2
Licensed                2
Album_type              3
Key                    12
Danceability          897
Energy               1263
Valence              1290
Speechiness          1296
Liveness             1525
Artist               2063
Url_spotify          2063
Acousticness         3133
Instrumentalness     3989
Channel              6714
Loudness             9298
Comments            10485
Album               11727
Duration_ms         14419
Tempo               14772
Description         17395
Track               17485
Likes               17939
Stream              18100
Title               18146
Url_youtube         18154
Uri                 18489
Views               19245
Unnamed: 0          20248
dtype: int64


In [7]:
# Detect high-cardinality columns (which may explode feature space)
# Threshold above which a column is reported as high-cardinality.
HIGH_CARDINALITY_THRESHOLD = 100

# Every column whose distinct-value count exceeds the threshold (numeric ones included).
high_card_cols = uniques[uniques > HIGH_CARDINALITY_THRESHOLD].index.tolist()
print(f"\nColumns with more than {HIGH_CARDINALITY_THRESHOLD} unique values:", high_card_cols)

# Continuous numeric columns naturally have many distinct values, so the list above
# over-reports. Only non-numeric columns would need categorical encoding, and those are
# the ones whose cardinality can actually inflate the feature space.
non_numeric_cols = set(df.select_dtypes(exclude=[np.number]).columns)
high_card_categorical_cols = [col for col in high_card_cols if col in non_numeric_cols]
print("\nOf these, non-numeric (text/categorical) columns:", high_card_categorical_cols)


Columns with more than 100 unique values: ['Danceability', 'Energy', 'Valence', 'Speechiness', 'Liveness', 'Artist', 'Url_spotify', 'Acousticness', 'Instrumentalness', 'Channel', 'Loudness', 'Comments', 'Album', 'Duration_ms', 'Tempo', 'Description', 'Track', 'Likes', 'Stream', 'Title', 'Url_youtube', 'Uri', 'Views', 'Unnamed: 0']


In [8]:
# Print correlation matrix (numeric only)
# Restrict to numeric columns, then build the pairwise correlation matrix.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr(numeric_only=True)

# Pull out the Target column of the matrix, strongest positive correlation first.
target_corr = corr.loc[:, 'Target'].sort_values(ascending=False)
print("\nCorrelations with Target:\n", target_corr)


Correlations with Target:
 Target              1.000000
Unnamed: 0          0.290173
Danceability        0.158965
Loudness            0.138373
Energy              0.085833
Speechiness         0.041966
Key                 0.030638
Likes               0.015808
Tempo               0.010155
Valence             0.001728
Comments           -0.001068
Liveness           -0.016957
Views              -0.022998
Instrumentalness   -0.037189
Acousticness       -0.055439
Duration_ms        -0.073129
Stream             -0.077450
Name: Target, dtype: float64


In [None]:
# Save intermediate version to CSV
output_path = Path("../output/B_cleaned_ready_for_modeling.csv")

# to_csv does not create missing directories, so make sure the output folder exists
# before writing (otherwise a fresh checkout fails on this cell).
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the saved frame still contains 'Album_type' (Target is derived directly
# from it) and 'Unnamed: 0' (a row id carried over from the source CSV that shows the
# highest correlation with Target in the diagnostics above). Both should be excluded
# from the feature set before modeling - confirm where downstream code drops them.
df.to_csv(output_path, index=False)
print("Cleaned data saved.")