In [1]:
import pathlib
import polars as pl
import chromadb
import os
import json
from openai import OpenAI
from chromadb.utils import embedding_functions

In [2]:
def prepare_spotify_reviews(file_path, select_columns=None):
    """Load the Spotify songs CSV and shape it for a ChromaDB collection.

    Args:
        file_path: Location of the CSV file (``str`` or path-like).
        select_columns: Optional list of column names to keep. When omitted
            (or empty) a default subset of descriptive and audio-feature
            columns is used. ``track_name`` is always included because it
            supplies the document text.

    Returns:
        dict with three parallel lists:
            ``ids``        - ``"song0"``, ``"song1"``, ... one per row,
            ``documents``  - the ``track_name`` of each row,
            ``metadatas``  - one dict per row holding the remaining
                             selected columns.

    Raises:
        FileNotFoundError: for any failure while reading the file or
            selecting the columns (not only a missing file); the original
            exception is chained as ``__cause__``.
    """
    # f-string instead of "+" so pathlib.Path inputs do not raise TypeError.
    print(f"prepare_spotify_reviews:{file_path}")

    # Explicit column types so the CSV is not left to type inference.
    # NOTE(review): newer polars releases rename ``dtypes`` to
    # ``schema_overrides`` - confirm against the installed version.
    dtypes = {
        "track_id": pl.Utf8,
        "track_name": pl.Utf8,
        "track_artist": pl.Utf8,
        "track_popularity": pl.Float64,
        "track_album_id": pl.Utf8,
        "track_album_name": pl.Utf8,
        "track_album_release_date": pl.Utf8,  # or pl.Date if in a suitable format
        "playlist_name": pl.Utf8,
        "playlist_id": pl.Utf8,
        "playlist_genre": pl.Utf8,
        "playlist_subgenre": pl.Utf8,
        "danceability": pl.Float64,
        "energy": pl.Float64,
        "key": pl.Int64,
        "loudness": pl.Float64,
        "mode": pl.Int64,
        "speechiness": pl.Float64,
        "acousticness": pl.Float64,
        "instrumentalness": pl.Float64,
        "liveness": pl.Float64,
        "valence": pl.Float64,
        "tempo": pl.Float64,
        "duration_ms": pl.Int64
    }

    default_columns = [
        "track_name", "track_artist", "track_album_name", "playlist_genre",
        "playlist_subgenre", "danceability", "energy", "acousticness",
        "instrumentalness", "track_popularity"
    ]

    selected_columns = list(select_columns) if select_columns else default_columns
    # track_name becomes the document text below, so it must always be present.
    if "track_name" not in selected_columns:
        selected_columns = ["track_name", *selected_columns]

    try:
        spotify_songs = pl.read_csv(file_path, dtypes=dtypes)
        spotify_subset = spotify_songs.select(selected_columns)
        # Create docs and metadata for ChromaDB
        ids = [f"song{i}" for i in range(spotify_subset.shape[0])]
        documents = spotify_subset["track_name"].to_list()
        metadatas = spotify_subset.drop("track_name").to_dicts()
        return {"ids": ids, "documents": documents, "metadatas": metadatas}

    except Exception as e:
        # Exception type kept for backward compatibility with existing callers.
        raise FileNotFoundError(f"Error reading file {file_path}: {e}") from e


In [3]:
def build_chroma_collection(
    chroma_path: pathlib.Path,
    collection_name: str,
    embedding_func_name: str,
    ids: list[str],
    documents: list[str],
    metadatas: list[dict],
    distance_func_name: str = "cosine",
    batch_size: int = 5000,
):
    """Create (or reuse) a persistent ChromaDB collection and fill it.

    Args:
        chroma_path: Directory of the persistent Chroma store (``str`` or
            ``pathlib.Path``).
        collection_name: Name of the collection to create or reuse.
        embedding_func_name: Model name handed to
            ``SentenceTransformerEmbeddingFunction``.
        ids: Unique id for each document.
        documents: Text of each document; parallel to ``ids``.
        metadatas: Metadata dict of each document; parallel to ``ids``.
        distance_func_name: Value stored under the ``"hnsw:space"``
            collection metadata key.
        batch_size: Maximum number of records sent to Chroma per call.

    Raises:
        ValueError: if ``ids``, ``documents`` and ``metadatas`` differ in
            length, or if ``batch_size`` is not positive.
    """
    if not (len(ids) == len(documents) == len(metadatas)):
        raise ValueError(
            "ids, documents and metadatas must have the same length "
            f"(got {len(ids)}, {len(documents)}, {len(metadatas)})"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # str() so that pathlib.Path values are accepted as well as plain strings.
    chroma_client = chromadb.PersistentClient(path=str(chroma_path))

    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=embedding_func_name
    )

    # get_or_create keeps the function re-runnable: create_collection raises
    # when a collection with this name already exists in the store.
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
        metadata={"hnsw:space": distance_func_name},
    )

    # Write in bounded batches; upsert makes a re-run overwrite records with
    # the same id instead of colliding with them.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.upsert(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
        )

In [4]:
def print_review_summaries(review_summaries):
    """Print a chat-completion response for inspection.

    The text of the first choice's message is printed first, followed by one
    ``name: value`` line for every entry in the object's ``__dict__``.

    Args:
        review_summaries: Response object exposing
            ``choices[0].message.content`` and a ``__dict__``.
    """
    first_choice = review_summaries.choices[0]
    print(first_choice.message.content)

    # Dump the instance attributes, one per line, in their stored order.
    attributes = vars(review_summaries)
    for name in attributes:
        print(f"{name}: {attributes[name]}")

In [5]:
# Directory holding both the source CSV and the persistent Chroma store.
# Set the SPOTIFY_DATA_DIR environment variable to run the notebook on
# another machine; the default keeps the original location.
DATA_DIR = os.environ.get(
    "SPOTIFY_DATA_DIR",
    "/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/spotify/",
)
# Kept as plain str values, matching what the rest of the notebook passes around.
DATA_PATH = os.path.join(DATA_DIR, "spotify_songs.csv")
CHROMA_PATH = DATA_DIR
EMBEDDING_FUNC_NAME = "multi-qa-MiniLM-L6-cos-v1"
COLLECTION_NAME = "spotify_songs"

# Load the CSV into ids / documents / metadatas ...
spotify_reviews_dict = prepare_spotify_reviews(DATA_PATH)

# ... and write them into the Chroma collection.
build_chroma_collection(
    CHROMA_PATH,
    COLLECTION_NAME,
    EMBEDDING_FUNC_NAME,
    spotify_reviews_dict["ids"],
    spotify_reviews_dict["documents"],
    spotify_reviews_dict["metadatas"]
)

prepare_spotify_reviews:/Users/davidkolb/Documents/Code/kolbeuk-data/vectordb/spotify/spotify_songs.csv


In [10]:
# Re-open the persisted store and fetch the collection, supplying the same
# embedding model name that was passed when the collection was built.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME,
)
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [13]:
# Preview the first 10 items in the collection. The result is a dict (keys
# include 'ids' and 'embeddings'), not a list; the embeddings are printed in
# full, so the output is long.
collection.peek()

{'ids': ['song0',
  'song1',
  'song10',
  'song100',
  'song1000',
  'song10000',
  'song10001',
  'song10002',
  'song10003',
  'song10004'],
 'embeddings': [[0.017858337610960007,
   -0.0692504271864891,
   0.10405390709638596,
   -0.006421481724828482,
   -0.031127803027629852,
   -0.016038749366998672,
   0.15745040774345398,
   -0.029592517763376236,
   0.06267502903938293,
   -0.06590820103883743,
   -0.0007069412968121469,
   -0.11096051335334778,
   0.023920807987451553,
   -0.09465713798999786,
   0.02858484908938408,
   0.04006320238113403,
   0.0436570979654789,
   -0.030153045430779457,
   -0.08888403326272964,
   -0.030030256137251854,
   -0.0636616051197052,
   0.025099875405430794,
   0.014995189383625984,
   0.00500771589577198,
   -0.13434995710849762,
   0.05565258488059044,
   -0.017524179071187973,
   0.04690057411789894,
   -0.018440745770931244,
   0.040629468858242035,
   0.04483877494931221,
   0.020999880507588387,
   0.030144449323415756,
   -0.07877636700868

In [18]:
# Semantic search limited to tracks whose playlist_genre metadata is "pop".
# Two query texts are submitted, and up to 10 matches are requested for each.
genre_results = collection.query(
    query_texts=["pop", "rock"],
    where={"playlist_genre": "pop"},
    n_results=10,
    include=["documents", "metadatas"],
)
print(genre_results)

{'ids': [['song451', 'song2241', 'song2168', 'song1663', 'song772', 'song1076', 'song512', 'song3132', 'song1506', 'song3109'], ['song1525', 'song3173', 'song3140', 'song331', 'song1945', 'song3638', 'song3583', 'song2293', 'song5268', 'song5368']], 'distances': None, 'metadatas': [[{'acousticness': 0.0183, 'danceability': 0.566, 'energy': 0.769, 'instrumentalness': 0.0, 'playlist_genre': 'pop', 'playlist_subgenre': 'dance pop', 'track_album_name': 'POP/STARS', 'track_artist': 'K/DA', 'track_popularity': 75.0}, {'acousticness': 0.0183, 'danceability': 0.566, 'energy': 0.769, 'instrumentalness': 0.0, 'playlist_genre': 'pop', 'playlist_subgenre': 'post-teen pop', 'track_album_name': 'POP/STARS', 'track_artist': 'K/DA', 'track_popularity': 75.0}, {'acousticness': 0.0352, 'danceability': 0.714, 'energy': 0.621, 'instrumentalness': 1.63e-05, 'playlist_genre': 'pop', 'playlist_subgenre': 'post-teen pop', 'track_album_name': 'Life In Cartoon Motion', 'track_artist': 'MIKA', 'track_popularity'

'POP/STARS'

In [37]:
# Only tracks whose danceability metadata is above this value are searched.
high_danceability_threshold = 0.8

# Semantic search for two query texts, restricted by the danceability filter.
results = collection.query(
    query_texts=["dance", "energetic"],  # Example query texts
    where={"danceability": {"$gt": high_danceability_threshold}},
    n_results=5,
    include=["documents", "metadatas"],
)

documents = results["documents"]
metadatas = results["metadatas"]

# Both results hold one inner list per query text. Walk them in step and
# flatten everything into (artist, track name) tuples.
artist_track_pairs = [
    (meta.get("track_artist"), title)
    for meta_group, title_group in zip(metadatas, documents)
    for meta, title in zip(meta_group, title_group)
]

print(artist_track_pairs)

[('Lady Gaga', 'Just Dance'), ('Lady Gaga', 'Just Dance'), ('Lady Gaga', 'Just Dance'), ('Franc Moody', 'Dance Moves'), ('Lemonface', 'Dance With You'), ('Rob Stepwart', 'Energy'), ('Deitrick Haddon', 'Power'), ('A$AP Rocky', 'Electric Body'), ('A Tribe Called Quest', 'Electric Relaxation'), ('A Tribe Called Quest', 'Electric Relaxation')]


In [41]:
# Render every (artist, track) tuple as "artist - track".
artist_track_strs = [
    "{} - {}".format(artist, track) for artist, track in artist_track_pairs
]

# Collapse the rendered pairs into a single comma-separated string.
artist_track_joined = ",".join(artist_track_strs)

# reviews_str is the name the prompt-building cells read the text from.
reviews_str = artist_track_joined

print(reviews_str)


Lady Gaga - Just Dance,Lady Gaga - Just Dance,Lady Gaga - Just Dance,Franc Moody - Dance Moves,Lemonface - Dance With You,Rob Stepwart - Energy,Deitrick Haddon - Power,A$AP Rocky - Electric Body,A Tribe Called Quest - Electric Relaxation,A Tribe Called Quest - Electric Relaxation


In [22]:
# Baseline request: the question is sent with only the generic system prompt,
# without any songs retrieved from the collection.
client = OpenAI()

context = "You are a conversational music guide created to provide song recommendations and artist information to users."
question = "What's the key to a highly dancable song?"

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    temperature=0,
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": question},
    ],
)

# Show the answer text followed by the response object's attributes.
print_review_summaries(completion)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The key to a highly danceable song can vary depending on the genre and style of music. However, there are a few elements that generally contribute to a song's danceability:

1. Rhythm: A strong and consistent beat is essential for getting people moving. Catchy rhythms and grooves that make you want to tap your feet or nod your head are often found in danceable songs.

2. Tempo: Generally, faster tempos tend to be more danceable as they provide energy and encourage movement. However, there are also slower tempo songs that can be danceable, especially in genres like ballroom or contemporary dance.

3. Melody: A memorable and catchy melody can make a song more danceable. It should be easy to follow and have a sense of repetition or hooks that make it enjoyable to move to.

4. Groove: A solid groove created by the combination of bass, drums, and other rhythmic elements can make a song irresistibly danceable. Syncopation, funky basslines, and syncopated percussion can enhance the groove and

In [None]:
# OpenAI client used by the chat-completion cells.
client = OpenAI()

# Embedding function built from the configured model name; it is handed to
# get_collection below.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_FUNC_NAME,
)

# Persistent Chroma client and the collection queried by the other cells.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [44]:
context = "You are a conversational music guide created to provide song recommendations and artist information to users."
question = "What's the key to a highly dancable song? "

# Format each pair in artist_track_pairs as "artist - track"
artist_track_strs = [f"{artist} - {track}" for artist, track in artist_track_pairs]

# Join these strings with commas
artist_track_joined = ",".join(artist_track_strs)

reviews_str = artist_track_joined

# context contains no "{}" placeholder, so context.format(reviews_str) would
# return it unchanged and the retrieved songs would never reach the model.
# Append them to the system prompt explicitly instead.
system_prompt = (
    f"{context} Use the following retrieved songs (artist - track) "
    f"as examples when answering: {reviews_str}"
)

good_review_summaries = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
)

# Print the answer text followed by the response object's attributes
print_review_summaries(good_review_summaries)

The key to a highly danceable song lies in a combination of factors. Here are a few elements to consider:

1. Rhythm: A strong, energetic rhythm with a consistent beat is crucial for getting people on the dance floor. This can include a catchy drum pattern or an infectious bassline that compels listeners to move their bodies.

2. Tempo: A moderate to fast tempo often works well for dance music, as it provides the necessary energy to keep dancers engaged. However, slower tempos can also be suitable for certain dance styles or moods.

3. Groove: A groove is the overall feel and flow of the music. Danceable songs often have a strong sense of groove, making it easy for listeners to catch the rhythm and move to it.

4. Catchy Melodies and Hooks: Memorable melodies, hooks, and vocal lines that are easy to sing along to can greatly contribute to a song's danceability. These elements provide something for listeners to connect with and showcase their enthusiasm on the dance floor.

5. Dynamic A

In [None]:
# Use this cell to ask a different question against the collection.

question = """Which of these poor reviews has the worst implications for a band? Explain why."""

# NOTE(review): the metadata written by prepare_spotify_reviews contains
# track_artist, track_album_name, playlist_genre, playlist_subgenre,
# danceability, energy, acousticness, instrumentalness and track_popularity,
# but no "Rating" key, so this filter presumably matches nothing. Confirm the
# intended field (track_popularity?) and threshold before relying on this cell.
poor_reviews = collection.query(
    query_texts=[question],
    n_results=5,
    include=["documents"],
    where={"Rating": {"$lte": 3}},
)

# Only one query text was sent, so the matches are in the first inner list.
reviews_str = ",".join(poor_reviews["documents"][0])

# context contains no "{}" placeholder, so context.format(reviews_str) would
# return it unchanged; append the retrieved documents explicitly instead.
system_prompt = (
    f"{context} Use the following retrieved documents as context "
    f"when answering: {reviews_str}"
)

poor_review_analysis = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
)

# Print the answer text followed by the response object's attributes
print_review_summaries(poor_review_analysis)