# Dataset Prompt Engineering & Description Creation

## Setup

In [156]:
!pip install openai
!pip install retry



In [157]:
import os
import time
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from retry import retry

import pandas as pd

import openai

In [None]:
# Read the API key from the environment (None when OPENAI_API_KEY is unset) and
# hand the same value to the openai client in one step.
openai.api_key = openai_key = os.environ.get("OPENAI_API_KEY")

## Dataset Processing

In [159]:
# Locations of the raw metadata and of the prompt-augmented output file.
input_path, output_path = './data/metadata.csv', './data/metadata_prompts.csv' # UPDATE

# Load the full metadata table that the following cells clean and split.
df = pd.read_csv(filepath_or_buffer=input_path)

#### Dataset Sanitization

In [160]:
def add_space(text):
    """
    Insert a space between the note name and the 5-letter mode suffix of a key.

    The last five characters are treated as the mode ('major' / 'minor') and
    everything before them as the note, so accidentals are preserved
    ('c#minor' -> 'c# minor', 'bbmajor' -> 'bb major'). The original kept only
    the first character of the note and silently dropped sharps and flats.

    Args:
        text: Key string such as 'eminor' or 'c#major'.

    Returns:
        The key with exactly one space before the mode. Input that has no note
        part (five characters or fewer) is returned unchanged.
    """
    # rstrip keeps the function idempotent: 'e minor' stays 'e minor' when the
    # grooming cell is re-run on already-processed data.
    note = text[:-5].rstrip()
    key = text[-5:]
    if not note:
        return text
    return note + ' ' + key

def genre_split(text):
    """
    Return the spaced spelling for the three fused genre names.

    'newage', 'posthardcore' and 'easylistening' are mapped to their two-word
    forms; any other value is returned unchanged.
    """
    fused_to_spaced = (
        ('newage', 'new age'),
        ('posthardcore', 'post hardcore'),
        ('easylistening', 'easy listening'),
    )
    for fused, spaced in fused_to_spaced:
        if text == fused:
            return spaced
    return text

In [161]:
# Data grooming
# Every substitution goes through the .str accessor: a bare Series.replace('_', ' ')
# only matches cells whose entire value is '_', so the original chained .replace
# calls never touched underscores/hyphens inside genre and inst values.
# regex=False makes the literal-substring intent explicit.
df['audio_key'] = df['audio_key'].apply(add_space)
df['pitch_range'] = df['pitch_range'].str.replace('_', ' ', regex=False)
df['genre'] = (
    df['genre']
    .apply(genre_split)
    .str.replace('__', ' ', regex=False)  # double underscore first so it yields a single space
    .str.replace('_', ' ', regex=False)
)
df['inst'] = (
    df['inst']
    .str.replace('_', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
)
df['track_role'] = df['track_role'].str.replace('_', ' ', regex=False)

In [162]:
# Show the first rows of df as a quick visual check of the groomed columns.
df.head()

Unnamed: 0,audio_key,chord_progressions,pitch_range,num_measures,bpm,genre,track_role,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,track_roll,unique_chord_n_note
0,e minor,"[['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'G',...",unknown,6,231,rock,unknown,clarinet,unknown,4/4,66,100,unknown,c527f2542d8533c1a4c95b2217d9918f_15,unknown,"['E', 'Em', 'G', 'A']"
1,a minor,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'A',...",unknown,9,222,rock,unknown,recorder,unknown,4/4,66,100,unknown,c527f2542d8533c1a4c95b2217d9918f_16,unknown,"['B', 'E', 'C', 'A']"
2,d minor,"[['A', 'A', 'A', 'A', 'G', 'G', 'G', 'G', 'A',...",unknown,9,235,rock,unknown,electric bass pick,unknown,4/4,100,127,unknown,c527f2542d8533c1a4c95b2217d9918f_2,unknown,"['E', 'C', 'G', 'A']"
3,a minor,"[['Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'A...",unknown,10,213,rock,unknown,acoustic guitar nylon,unknown,4/4,23,74,unknown,c527f2542d8533c1a4c95b2217d9918f_3,unknown,"['D', 'G', 'F', 'E', 'Am']"
4,a minor,"[['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B',...",unknown,10,211,rock,unknown,electric guitar muted,unknown,4/4,55,127,unknown,c527f2542d8533c1a4c95b2217d9918f_4,unknown,"['B', 'D', 'G', 'F', 'Em']"


In [163]:
# Split main dataset by number of rows (rate limit)

num_rows = 2297 # Update to max rate limit
len_df = len(df)

# One slice per batch of at most num_rows consecutive rows; the last batch may be shorter.
df_list = [df[start:start + num_rows] for start in range(0, len_df, num_rows)]

len(df_list)

7

## Prompt Engineering

Prompt engineering performed directly in ChatGPT

**4K context:**
<br>
    - Input: \$0.0015 per 1K tokens <br>
    - Output: \$0.002 per 1K tokens
<br>

~\$0.000325 per query × ~15K rows ≈ \$4.88

#### **Prompts:**

**PROMPT 1:** 

Pretend you are a musician that wants to have fun jamming to music produced by a set of instructions. Write these instructions in 25 to 50 words, where the output music incorporates the following features: a minor key, mid pitch range, 8 measures, 120 beats/minute, cinematic genre, unknown track role,  bass instrument, standard sample rhythm, 4/4 time signature, minimum velocity of 101, maximum velocity of 102, and unique chords of ['D', 'Dm', 'G', 'C', 'Am']. The unique chords feature represents a list of unique chords. If a feature includes the word 'unknown', then eliminate this feature from your response.

**PROMPT 2:**

Give a realistic but natural instruction between 25 to 50 words in the point-of-view of a hobbyist musician doing a jamming or accompaniment session, which includes the following features: a minor key, mid pitch range, 8 measures, 120 beats/minute, cinematic genre, unknown track role,  bass instrument, standard sample rhythm, 4/4 time signature, minimum velocity of 101, maximum velocity of 102, and unique chords of ['D', 'Dm', 'G', 'C', 'Am']. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.

**PROMPT 3:**

You are a professional musician looking to sharpen musical skills through an accompaniment session. Write a set of instructions between 25 to 50 words long that will generate music based on the following features: a minor key, mid pitch range, 8 measures, 120 beats/minute, cinematic genre, unknown track role,  bass instrument, standard sample rhythm, 4/4 time signature, minimum velocity of 101, maximum velocity of 102, and unique chords of ['D', 'Dm', 'G', 'C', 'Am']. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.

**PROMPT 4:**

Give a user, who wants to try AI for generating music, an example prompt between 25 to 50 words long that will incorporate the following features into a piece of music: a minor key, mid pitch range, 8 measures, 120 beats/minute, cinematic genre, unknown track role,  bass instrument, standard sample rhythm, 4/4 time signature, minimum velocity of 101, maximum velocity of 102, and unique chords of ['D', 'Dm', 'G', 'C', 'Am']. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.

**PROMPT 5:**

Can you organize the following features into a set of instructions between 25 to 50 words long that can be used to prompt an LLM to produce a piece of music: a minor key, mid pitch range, 8 measures, 120 beats/minute, cinematic genre, unknown track role,  bass instrument, standard sample rhythm, 4/4 time signature, minimum velocity of 101, maximum velocity of 102, and unique chords of ['D', 'Dm', 'G', 'C', 'Am']. The unique chords feature represents a list of unique chords. Your response should include music terms, and be experimental and thematic. If a feature includes the word 'unknown', then eliminate this feature from your response.

## ChatGPT Response Generation

In [164]:
# Manually update the dataframe
# Choose which chunk of df_list this run sends to the API; edit the index by hand between runs.
batch_idx = 1
df = df_list[batch_idx]

print(f"Dataframe length: {len(df)}")

Dataframe length: 2297


In [165]:
# Run settings for the generation cell.
# temperature: intended sampling temperature for the chat model.
# sleep_delay: seconds process() sleeps after each request.
temperature, sleep_delay = 0.3, 0.12
# Starts empty; it is passed into process() and then rebound to the Parallel result.
text_list = list()

# define a function that parallelizes the data description creation
def process(idx, row, text_list, sleep_delay):
    """
    Build the prompt for one metadata row, query the chat model and return its reply.

    One of five engineered prompt templates is chosen from the last digit of
    ``idx`` (two digits per template), filled with the row's features and sent
    to the model using the notebook-level ``temperature`` and ``openai_key``.

    Args:
        idx: Integer row label; ``idx % 10`` selects the prompt template.
        row: Mapping or Series providing 'audio_key', 'pitch_range',
            'num_measures', 'bpm', 'genre', 'track_role', 'inst',
            'sample_rhythm', 'time_signature', 'min_velocity', 'max_velocity'
            and 'unique_chord_n_note'.
        text_list: Optional list the reply is also appended to. Kept for
            backward compatibility; pass None to skip the append. When this
            function runs in a separate joblib worker process the list is a
            copy, so callers should rely on the return value instead.
        sleep_delay: Seconds to sleep after the request (rate limiting).

    Returns:
        str: The model's reply for this row. Returning the string (rather than
        the accumulating list, as before) makes ``Parallel(...)`` produce one
        text per input row, in input order.

    Raises:
        ValueError: If ``idx % 10`` is not one of the digits 0-9.
    """
    # Retries any exception up to 5 attempts, waiting 1s and doubling up to 120s.
    @retry(Exception, tries=5, delay=1, backoff=2, max_delay=120)
    def get_completion(prompt, model='gpt-3.5-turbo'):
        # Set the key on every call so it is present wherever this function executes.
        openai.api_key = openai_key

        messages = [{'role': 'user', 'content': prompt}]

        # temperature was configured next to sleep_delay but never reached the API before.
        response = openai.ChatCompletion.create(model=model,
                                                messages=messages,
                                                temperature=temperature,
                                                )

        return response.choices[0].message['content']

    # Update prompts based on prompt engineering
    prompt_templates = {
        1: "Pretend you are a musician that wants to have fun jamming to music produced by a set of instructions. Write these instructions in 25 to 50 words, where the output music incorporates the following features: {} key, {} pitch range, {} measures, {} beats/minute, {} genre, {} track role,  {} instrument, {} sample rhythm, {} time signature, minimum velocity of {}, maximum velocity of {}, and unique chords of {}. The unique chords feature represents a list of unique chords. If a feature includes the word 'unknown', then eliminate this feature from your response.",
        2: "Give a realistic but natural instruction between 25 to 50 words in the point-of-view of a hobbyist musician doing a jamming or accompaniment session, which includes the following features: {} key, {} pitch range, {} measures, {} beats/minute, {} genre, {} track role,  {} instrument, {} sample rhythm, {} time signature, minimum velocity of {}, maximum velocity of {}, and unique chords of {}. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.",
        3: "You are a professional musician looking to sharpen musical skills through an accompaniment session. Write a set of instructions between 25 to 50 words long that will generate music based on the following features: {} key, {} pitch range, {} measures, {} beats/minute, {} genre, {} track role,  {} instrument, {} sample rhythm, {} time signature, minimum velocity of {}, maximum velocity of {}, and unique chords of {}. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.",
        4: "Give a user, who wants to try AI for generating music, an example prompt between 25 to 50 words long that will incorporate the following features into a piece of music: {} key, {} pitch range, {} measures, {} beats/minute, {} genre, {} track role,  {} instrument, {} sample rhythm, {} time signature, minimum velocity of {}, maximum velocity of {}, and unique chords of {}. The unique chords feature represents a list of unique chords. Your response should include music terms. If a feature includes the word 'unknown', then eliminate this feature from your response.",
        5: "Can you organize the following features into a set of instructions between 25 to 50 words long that can be used to prompt an LLM to produce a piece of music: {} key, {} pitch range, {} measures, {} beats/minute, {} genre, {} track role,  {} instrument, {} sample rhythm, {} time signature, minimum velocity of {}, maximum velocity of {}, and unique chords of {}. The unique chords feature represents a list of unique chords. Your response should include music terms, and be experimental and thematic. If a feature includes the word 'unknown', then eliminate this feature from your response.",
    }
    # Last digit of idx -> prompt number; same assignment as the original if-chain.
    prompt_for_digit = {
        4: 1, 9: 1,
        0: 2, 7: 2,
        2: 3, 8: 3,
        1: 4, 6: 4,
        3: 5, 5: 5,
    }
    # Columns in the order the templates' placeholders expect them.
    feature_columns = (
        'audio_key',
        'pitch_range',
        'num_measures',
        'bpm',
        'genre',
        'track_role',
        'inst',
        'sample_rhythm',
        'time_signature',
        'min_velocity',
        'max_velocity',
        'unique_chord_n_note',
    )

    digit = idx % 10
    if digit not in prompt_for_digit:
        # Previously this fell through every branch and failed later on an unbound `prompt`.
        raise ValueError(f"cannot select a prompt for idx={idx!r}: idx % 10 must be an integer 0-9")

    template = prompt_templates[prompt_for_digit[digit]]
    prompt = template.format(*(row[column] for column in feature_columns))

    response = get_completion(prompt)
    if text_list is not None:
        text_list.append(response)
    time.sleep(sleep_delay)
    return response

# Lazily describe one process() call per row; tqdm tracks how many rows have been handed out.
jobs = (
    delayed(process)(idx, row, text_list, sleep_delay)
    for (idx, row) in tqdm(df.iterrows(), total=len(df))
)
# Run the calls on 10 workers and collect process()'s return values in submission order.
text_list = Parallel(n_jobs=10, timeout=10000)(jobs)

  0%|          | 0/2297 [00:00<?, ?it/s]

In [166]:
# Number of results collected by the generation cell.
num_text = len(text_list)
# Keep only as many leading rows of df as there are results.
df_limit = df.head(n=num_text)
print(f"Number of records: {num_text}")

Number of records: 2297


In [None]:
# Store the collected results in a 'text' column, one entry per row of df_limit
# (df_limit was cut to len(text_list) rows, so the lengths line up).
df_limit.loc[:, 'text'] = text_list

In [168]:
# Non-null count per column; a lower number in any column would reveal missing values.
df_limit.count()

audio_key              2297
chord_progressions     2297
pitch_range            2297
num_measures           2297
bpm                    2297
genre                  2297
track_role             2297
inst                   2297
sample_rhythm          2297
time_signature         2297
min_velocity           2297
max_velocity           2297
split_data             2297
id                     2297
track_roll             2297
unique_chord_n_note    2297
text                   2297
dtype: int64

## Dataset Update

In [169]:
# Write this batch to output_path, appending to whatever rows the file already holds.
frame_to_write = df_limit
if os.path.exists(output_path):
    csv_df = pd.read_csv(output_path)
    if not csv_df.empty:
        # Existing rows first, then the new batch.
        concat_df = pd.concat([csv_df, df_limit])
        frame_to_write = concat_df
frame_to_write.to_csv(output_path, index=False)
