In [18]:
import polars as pl

In [19]:
# Load Spotify Dataset to Polars Dataframe (lazy loading API bc large datasets)
spotify_df = pl.scan_csv('hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv').select(["track_id", "artists", "track_name"])

In [20]:
# Load Chord progressions dataset to Polars Dataframe (lazy loading API bc large datasets)
chord_df = pl.scan_csv('hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv').select(["chords", "genres", "spotify_song_id"])

In [21]:
# Genius lyrics dataset to Polars Dataframe (lazy loading API bc large datasets)
genius_df = pl.scan_csv('hf://datasets/sebastiandizon/genius-song-lyrics/song_lyrics 2.csv').select(["title", "artist", "lyrics"])

In [22]:
spotify_df = spotify_df.drop_nulls(subset=["track_id", "track_name", "artists"])

In [23]:
chord_df = chord_df.drop_nulls(subset=["spotify_song_id", "chords", "genres"])

In [24]:
genius_df = genius_df.drop_nulls(subset=["title", "artist", "lyrics"])

In [25]:
def normalize_title(col_name: str):
    return (
        pl.col(col_name)
        .str.to_lowercase()
        .str.replace_all(r"[\(\[].*?[\)\]]", "")
        .str.replace_all(r"[^\w\s]", "")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )

In [26]:
def normalize_artist(col_name: str):
    return (
        pl.col(col_name)
        .str.to_lowercase()
        .str.replace_all(r"[\(\[].*?[\)\]]", "")
        .str.split(";").list.first()
        .str.replace_all(r"[^\w\s]", "")
        .str.strip_chars()
    )

In [27]:
spotify_df = spotify_df.with_columns(
    normalize_title("track_name").alias("norm_track_name"),
    normalize_artist("artists").alias("norm_artists")
)

In [31]:
genius_df = genius_df.with_columns(
    normalize_title("title").alias("title_norm"),
    normalize_artist("artist").alias("artist_norm")  # no split needed, single artist
)

In [29]:
out1 = chord_df.join(
    spotify_df,
    left_on="spotify_song_id",
    right_on="track_id",
    how="inner"
)

In [30]:
final = out1.join(
    genius_df,
    left_on=["norm_track_name", "norm_artists"],
    right_on=["title_norm", "artist_norm"],
    how="inner"
)

In [32]:
print(final.explain())

INNER JOIN:
LEFT PLAN ON: [col("norm_track_name"), col("norm_artists")]
  INNER JOIN:
  LEFT PLAN ON: [col("spotify_song_id")]
    Csv SCAN [https://huggingface.co/datasets/ailsntua/Chordonomicon/resolve/main/chordonomicon_v2.csv]
    PROJECT 3/10 COLUMNS
    SELECTION: [([(col("genres").is_not_null()) & (col("chords").is_not_null())]) & (col("spotify_song_id").is_not_null())]
    ESTIMATED ROWS: 559563
  RIGHT PLAN ON: [col("track_id")]
     WITH_COLUMNS:
     [col("track_name").str.to_lowercase().str.replace(["[\(\[].*?[\)\]]", ""]).str.replace(["[^\w\s]", ""]).str.replace(["\s+", " "]).str.strip_chars([null]).alias("norm_track_name"), col("artists").str.to_lowercase().str.replace(["[\(\[].*?[\)\]]", ""]).str.split([";"]).list.get([dyn int: 0]).str.replace(["[^\w\s]", ""]).str.strip_chars([null]).alias("norm_artists")] 
      Csv SCAN [https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/main/dataset.csv]
      PROJECT 3/21 COLUMNS
      SELECTION: [([(col("a

In [33]:
sample = final.head(5000).collect()

In [34]:
sample

chords,genres,spotify_song_id,artists,track_name,norm_track_name,norm_artists,title,artist,lyrics
str,str,str,str,str,str,str,str,str,str
"""<intro_1> Bbmin Bbmin9 Fmin Bb…","""'g funk' 'gangster rap' 'hip h…","""503OTo2dSqe7qk76rgsbep""","""Dr. Dre;Snoop Dogg""","""Still D.R.E.""","""still dre""","""dr dre""","""Still D.R.E.""","""Dr. Dre""","""[Produced by Dr. Dre, Scott St…"
"""<intro_1> Bbmin Bbmin9 Fmin Bb…","""'g funk' 'gangster rap' 'hip h…","""503OTo2dSqe7qk76rgsbep""","""Dr. Dre;Snoop Dogg""","""Still D.R.E.""","""still dre""","""dr dre""","""Still D.R.E.""","""Dr. Dre""","""[Produced by Dr. Dre, Scott St…"
"""Dmin Gmin F Emin Dmin Gmin F E…","""'g funk' 'gangster rap' 'hip h…","""2zoobJFEB9h15fjYjRd6oP""","""2Pac""","""Hail Mary""","""hail mary""","""2pac""","""Hail Mary""","""2Pac""","""[Segue from ""Bomb First""] [In…"
"""<intro_1> Bmin Emin D Fs Bmin …","""'g funk' 'gangster rap' 'hip h…","""1SWVDBtw6h3tm9OehOkDhv""","""Dr. Dre;Eminem;Xzibit""","""What's The Difference""","""whats the difference""","""dr dre""","""Whats the Difference""","""Dr. Dre""","""[Produced by Dr. Dre & Mel-Man…"
"""<intro_1> C G Dmin C G Dmin C …","""'alternative hip hop' 'conscio…","""30oTS7bm0aH3p7lqjEIu8q""","""Gang Starr""","""Full Clip""","""full clip""","""gang starr""","""Full Clip""","""Gang Starr""","""[Intro: DJ Premier] Big L, res…"
…,…,…,…,…,…,…,…,…,…
"""<intro_1> Amin7 Gmaj7 <verse_1…","""'k-pop' 'k-pop girl group' 'po…","""60jFaQV7Z4boGC4ob5B5c6""","""TWICE""","""TT""","""tt""","""twice""","""TT""","""TWICE""","""[트와이스 ""TT"" 가사] [Verse 1: Naye…"
"""<intro_1> B Ebmin Dbmin Fs B B…","""'neo-psychedelic'""","""3QaULt6mFjXbmU1O8chake""","""Pond""","""Holding Out For You""","""holding out for you""","""pond""","""Holding Out for You""","""Pond""","""What in the world ever made th…"
"""<chorus_1> Emin G A Emin <vers…","""'modern alternative rock'""","""5w6B0sAH7XauCvMOAtplQj""","""Barns Courtney""","""Fire""","""fire""","""barns courtney""","""Fire""","""Barns Courtney""","""[Verse 1] Lonely shadows follo…"
"""Dsmin Cs Gs Dsmin Cs Gs Dsmin …","""'modern alternative rock'""","""1Mf27cnAF1Q6Ko83XTM5d1""","""Barns Courtney""","""Glitter & Gold""","""glitter gold""","""barns courtney""","""Glitter Gold""","""Barns Courtney""","""[Intro] I am flesh and I am bo…"


In [35]:
final = final.collect()

In [36]:
final

chords,genres,spotify_song_id,artists,track_name,norm_track_name,norm_artists,title,artist,lyrics
str,str,str,str,str,str,str,str,str,str
"""<intro_1> Bbmin Bbmin9 Fmin Bb…","""'g funk' 'gangster rap' 'hip h…","""503OTo2dSqe7qk76rgsbep""","""Dr. Dre;Snoop Dogg""","""Still D.R.E.""","""still dre""","""dr dre""","""Still D.R.E.""","""Dr. Dre""","""[Produced by Dr. Dre, Scott St…"
"""<intro_1> Bbmin Bbmin9 Fmin Bb…","""'g funk' 'gangster rap' 'hip h…","""503OTo2dSqe7qk76rgsbep""","""Dr. Dre;Snoop Dogg""","""Still D.R.E.""","""still dre""","""dr dre""","""Still D.R.E.""","""Dr. Dre""","""[Produced by Dr. Dre, Scott St…"
"""Dmin Gmin F Emin Dmin Gmin F E…","""'g funk' 'gangster rap' 'hip h…","""2zoobJFEB9h15fjYjRd6oP""","""2Pac""","""Hail Mary""","""hail mary""","""2pac""","""Hail Mary""","""2Pac""","""[Segue from ""Bomb First""] [In…"
"""<intro_1> Bmin Emin D Fs Bmin …","""'g funk' 'gangster rap' 'hip h…","""1SWVDBtw6h3tm9OehOkDhv""","""Dr. Dre;Eminem;Xzibit""","""What's The Difference""","""whats the difference""","""dr dre""","""Whats the Difference""","""Dr. Dre""","""[Produced by Dr. Dre & Mel-Man…"
"""<intro_1> C G Dmin C G Dmin C …","""'alternative hip hop' 'conscio…","""30oTS7bm0aH3p7lqjEIu8q""","""Gang Starr""","""Full Clip""","""full clip""","""gang starr""","""Full Clip""","""Gang Starr""","""[Intro: DJ Premier] Big L, res…"
…,…,…,…,…,…,…,…,…,…
"""<verse_1> C Amin F G C Amin F …","""'indie hip hop' 'pixel'""","""5sixigDZ86eDzCHXbfzrDu""","""PmBata""","""i hate her boyfriend's face""","""i hate her boyfriends face""","""pmbata""","""​i hate her boyfriends face""","""PmBata""","""[Verse 1] She told me That her…"
"""<intro_1> Dmaj7""","""'sad lo-fi' 'sad rap'""","""0ZbghqrVhOtGom63oKxONU""","""Powfu;KMays""","""draw you inside my book (feat.…","""draw you inside my book""","""powfu""","""​draw you inside my book""","""Powfu""","""[Verse 1] I was telling her se…"
"""<intro_1> Amin F C G <verse_1>…","""'sad rap'""","""4kKhmIdFALmwCYrt90oWUW""","""MASN""","""Love Me For Me""","""love me for me""","""masn""","""Love Me For Me""","""MASN""","""[Verse 1] Head in the clouds w…"
"""<intro_1> Amin F C G <verse_1>…","""'sad rap'""","""4kKhmIdFALmwCYrt90oWUW""","""MASN""","""Love Me For Me""","""love me for me""","""masn""","""Love Me For Me""","""MASN""","""[Verse 1] Head in the clouds w…"


In [37]:
final.write_csv("final_dataset.csv")