In [19]:
import pathlib
import polars as pl
import chromadb
import os
import json

In [20]:
# Location of the Spotify songs CSV. The SPOTIFY_SONGS_CSV environment variable
# overrides it so the notebook can run on another machine; when it is unset the
# original local path is used.
file_path = os.environ.get(
    "SPOTIFY_SONGS_CSV",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/spotify/spotify_songs.csv",
)

# Explicit column types passed to the CSV reader.
dtypes = {
    "track_id": pl.Utf8,
    "track_name": pl.Utf8,
    "track_artist": pl.Utf8,
    "track_popularity": pl.Float64,
    "track_album_id": pl.Utf8,
    "track_album_name": pl.Utf8,
    "track_album_release_date": pl.Utf8,  # or pl.Date if in a suitable format
    "playlist_name": pl.Utf8,
    "playlist_id": pl.Utf8,
    "playlist_genre": pl.Utf8,
    "playlist_subgenre": pl.Utf8,
    "danceability": pl.Float64,
    "energy": pl.Float64,
    "key": pl.Int64,
    "loudness": pl.Float64,
    "mode": pl.Int64,
    "speechiness": pl.Float64,
    "acousticness": pl.Float64,
    "instrumentalness": pl.Float64,
    "liveness": pl.Float64,
    "valence": pl.Float64,
    "tempo": pl.Float64,
    "duration_ms": pl.Int64
}

# Columns kept when `select_columns` is left empty.
default_columns = [
    "track_name", "track_artist", "track_album_name", "playlist_genre",
    "playlist_subgenre", "danceability", "energy", "acousticness",
    "instrumentalness", "track_popularity"
]
# Optional override: list column names here to use them instead of the defaults.
select_columns = []

# Only the file read sits inside the try block, so a missing file is the only
# failure reported as FileNotFoundError. CSV parse errors and unknown column
# names surface with their own exception types instead of being mislabelled.
# NOTE(review): newer Polars releases appear to name this parameter
# `schema_overrides`; `dtypes` is kept because it is what this notebook ran
# with -- confirm against the installed version.
try:
    spotify_songs = pl.read_csv(file_path, dtypes=dtypes)
except FileNotFoundError as e:
    # Chain the original exception so its traceback is preserved.
    raise FileNotFoundError(f"Error reading file {file_path}: {e}") from e

selected_columns = select_columns if select_columns else default_columns
spotify_subset = spotify_songs.select(selected_columns)

# Create docs and metadata for ChromaDB: one id per row, the track name as the
# document, and every other selected column as that row's metadata dict.
# NOTE(review): null cells would come through as None in these lists --
# verify whether the downstream ChromaDB calls accept that.
ids = [f"song{i}" for i in range(spotify_subset.height)]
documents = spotify_subset["track_name"].to_list()
metadatas = spotify_subset.drop("track_name").to_dicts()



In [21]:
import polars as pl

# Summarise `spotify_subset` (the Polars DataFrame built in the loading cell):
# total cell count, (rows, columns) shape, and a preview of the first six rows.
n_rows, n_cols = spotify_subset.height, spotify_subset.width
print("Size of DataFrame:", n_rows * n_cols)
print("Shape of DataFrame:", (n_rows, n_cols))
print("First 6 lines of selected columns:")
print(spotify_subset.head(6))


Size of DataFrame: 328330
Shape of DataFrame: (32833, 10)
First 6 lines of selected columns:
shape: (6, 10)
┌────────────┬────────────┬───────────┬───────────┬───┬────────┬───────────┬───────────┬───────────┐
│ track_name ┆ track_arti ┆ track_alb ┆ playlist_ ┆ … ┆ energy ┆ acousticn ┆ instrumen ┆ track_pop │
│ ---        ┆ st         ┆ um_name   ┆ genre     ┆   ┆ ---    ┆ ess       ┆ talness   ┆ ularity   │
│ str        ┆ ---        ┆ ---       ┆ ---       ┆   ┆ f64    ┆ ---       ┆ ---       ┆ ---       │
│            ┆ str        ┆ str       ┆ str       ┆   ┆        ┆ f64       ┆ f64       ┆ f64       │
╞════════════╪════════════╪═══════════╪═══════════╪═══╪════════╪═══════════╪═══════════╪═══════════╡
│ I Don't    ┆ Ed Sheeran ┆ I Don't   ┆ pop       ┆ … ┆ 0.916  ┆ 0.102     ┆ 0.0       ┆ 66.0      │
│ Care (with ┆            ┆ Care      ┆           ┆   ┆        ┆           ┆           ┆           │
│ Justin     ┆            ┆ (with     ┆           ┆   ┆        ┆           ┆        

In [22]:

# Preview the three parallel lists prepared for ChromaDB.

# Print some IDs.
# The loop variable is `song_id` rather than `id`: at notebook top level a loop
# variable stays in the kernel namespace, so `id` would replace the builtin
# id() for every later cell.
print("Some IDs:")
for song_id in ids[:5]:  # Adjust the number to print as many as you like
    print(song_id)

# Print some documents (track names)
print("\nSome Documents (Track Names):")
for document in documents[:5]:  # Adjust the number to print as many as you like
    print(document)

# Print some metadata entries
print("\nSome Metadata Entries:")
for metadata in metadatas[:3]:  # Adjust the number to print as many as you like
    print(metadata)



Some IDs:
song0
song1
song2
song3
song4

Some Documents (Track Names):
I Don't Care (with Justin Bieber) - Loud Luxury Remix
Memories - Dillon Francis Remix
All the Time - Don Diablo Remix
Call You Mine - Keanu Silva Remix
Someone You Loved - Future Humans Remix

Some Metadata Entries:
{'track_artist': 'Ed Sheeran', 'track_album_name': "I Don't Care (with Justin Bieber) [Loud Luxury Remix]", 'playlist_genre': 'pop', 'playlist_subgenre': 'dance pop', 'danceability': 0.748, 'energy': 0.916, 'acousticness': 0.102, 'instrumentalness': 0.0, 'track_popularity': 66.0}
{'track_artist': 'Maroon 5', 'track_album_name': 'Memories (Dillon Francis Remix)', 'playlist_genre': 'pop', 'playlist_subgenre': 'dance pop', 'danceability': 0.726, 'energy': 0.815, 'acousticness': 0.0724, 'instrumentalness': 0.00421, 'track_popularity': 67.0}
{'track_artist': 'Zara Larsson', 'track_album_name': 'All the Time (Don Diablo Remix)', 'playlist_genre': 'pop', 'playlist_subgenre': 'dance pop', 'danceability': 0.675, 