# ChatGPT Feature Extraction Evaluation

## Setup

In [1]:
# Imports
import os
import time
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from retry import retry
import pandas as pd
import openai
import json
import pickle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Set openai api key
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Dataset Preprocessing

In [3]:
input_path = './data/midi_test_dataset.csv' # UPDATE
temp_output_path = './data/chatgpt_responses.csv'
output_path = './data/chatgpt_extraction_evaluation.csv' # UPDATE

df = pd.read_csv(input_path)

### Dataset Sanitization

In [4]:
def add_space(text):
    note = text[0]
    key = text[-5:]
    return note + ' ' + key

def genre_split(text):
    if text == 'newage':
        text = 'new age'
    if text == 'posthardcore':
        text = 'post hardcore'
    if text == 'easylistening':
        text = 'easy listening'
    return text

In [5]:
# Data grooming
df['audio_key'] = df['audio_key'].apply(add_space)
df['audio_key'] = df['audio_key'].str.capitalize()
df['pitch_range'] = df['pitch_range'].str.replace('_', ' ')
df['genre'] = df['genre'].apply(genre_split).str.replace('__', ' ')
df['genre'] = df['genre'].str.replace('_', ' ')
df['inst'] = df['inst'].str.replace('_', ' ')
df['inst'] = df['inst'].str.replace('-', ' ')
df['track_role'] = df['track_role'].str.replace('_', ' ')

In [6]:
print(df.shape)
num_requests = len(df)
df.head()

(2301, 18)


Unnamed: 0,text,audio_key,chord_progressions,pitch_range,num_measures,bpm,genre,track_role,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,track_roll,chord,inst_group
0,Create a funky soul jam in the key of C major ...,C major,"[['G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G',...",unknown,31,166,funk soul,unknown,string ensemble 1,unknown,6/8,58,127,unknown,eb331301516cecc3e141c0153737fae0_9,unknown,"['B', 'D', 'G', 'F', 'C', 'E', 'A']",9
1,"Aspiring rockers, get ready to unleash the pow...",E major,"[['F#', 'F#', 'F#', 'F#', 'G#', 'G#', 'G#', 'G...",unknown,10,203,rock,unknown,timpani,unknown,4/4,56,100,unknown,d883c606b87c83f6ae5ccebfda68194b_7,unknown,"['G#', 'B', 'E', 'F#']",6
2,I will create a fun and energetic rock jam in ...,B major,"[['F#', 'F#', 'F#', 'F#', 'B', 'B', 'B', 'B', ...",unknown,9,225,rock,unknown,timpani,unknown,4/4,100,100,unknown,1b3866a75cd3738306e1e8b9f1a2f15e_11,unknown,"['B', 'F#', 'G#', 'C#', 'E']",6
3,Let's start our jam session in the d minor key...,D minor,"[['F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'B',...",unknown,12,178,soundtrack,unknown,timpani,unknown,4/4,24,90,unknown,f94c32517b792973cb9abe2528528f7f_4,unknown,"['B', 'F', 'D', 'A']",6
4,Generate a unique folk melody in E minor key w...,E minor,"[['G', 'G', 'G', 'G', 'B', 'B', 'B', 'B', 'E',...",unknown,12,149,folk,unknown,piccolo,unknown,4/4,104,113,unknown,10086bf6acd992ea3667e25266695593_1,unknown,"['B', 'D', 'F#', 'G', 'E']",5


In [7]:
# Created reduced dataframe to reduce computational cost of iterating over rows of full dataframe.
df_reduced = df[['text','audio_key']].copy(deep=True)
df_reduced.head()

Unnamed: 0,text,audio_key
0,Create a funky soul jam in the key of C major ...,C major
1,"Aspiring rockers, get ready to unleash the pow...",E major
2,I will create a fun and energetic rock jam in ...,B major
3,Let's start our jam session in the d minor key...,D minor
4,Generate a unique folk melody in E minor key w...,E minor


## ChatGPT Response Generation

In [8]:
# Prompt
gpt_prompt = 'From the prompt below, extract the following information: key signature, number of measures, BPM, genre, instrument, time signature, minimum velocity, maximum velocity, and set of unique chords. Format the output as a JSON with the following types: {"key signature": string, "number of measures": integer, "BPM": integer, "genre": string, "instrument": string, "time signature": string, "minimum velocity": integer, "maximum velocity": integer, "set of unique chords": list}. If the information is unknown or not specified use the default value from the following dictionary:\n{"key signature":"C major", "number of measures": 8, "BPM":120, "genre":"rock", "instrument":"acoustic grand piano", "time signature":"4/4", "minimum velocity":10, "maximum velocity":127}\n\n'

In [None]:
sleep_delay = 0.12
text_list = []

# define a function that parallelizes the data description creation
def process(idx, row, text_list, sleep_delay):
    """
        Takes in a dataframe and returns a dataframe with the data description
    """
    @retry(Exception, tries=5, delay=1, backoff=2, max_delay=120)
    def get_completion(prompt, model='gpt-3.5-turbo'):   
        messages = [{'role': 'user', 'content': prompt}]

        response = openai.ChatCompletion.create(model=model, 
                                                messages=messages,
                                            )
        
        return response.choices[0].message['content']

    prompt = gpt_prompt + row['text']
    
    response = get_completion(prompt)
    text_list.append(response)
    time.sleep(sleep_delay)
    return text_list

text_list = Parallel(n_jobs=10, timeout=10000)(delayed(process)(idx, row, text_list, sleep_delay) for (idx, row) in tqdm(df_reduced.iterrows(), total=len(df_reduced)))

### Save responses to pickle file

In [None]:
with open("./data/chatgpt_text_list.pkl", "wb") as f:
    pickle.dump(text_list, f)

## Cleanse / prepocess GPT responses

In [9]:
with open("./data/chatgpt_text_list.pkl", "rb") as f:
    text_list = pickle.load(f)

In [10]:
print(text_list)


[['{\n  "key signature": "C major",\n  "number of measures": 31,\n  "BPM": 166,\n  "genre": "funky soul",\n  "instrument": "string ensemble 1",\n  "time signature": "6/8",\n  "minimum velocity": 58,\n  "maximum velocity": 127,\n  "set of unique chords": ["B", "D", "G", "F", "C", "E", "A"]\n}'], ['{\n  "key signature": "E major",\n  "number of measures": 10,\n  "BPM": 203,\n  "genre": "rock",\n  "instrument": "timpani",\n  "time signature": "4/4",\n  "minimum velocity": 56,\n  "maximum velocity": 100,\n  "set of unique chords": ["G#", "B", "E", "F#"]\n}'], ['{\n  "key signature": "B major",\n  "number of measures": 9,\n  "BPM": 225,\n  "genre": "rock",\n  "instrument": "timpani",\n  "time signature": "4/4",\n  "minimum velocity": 100,\n  "maximum velocity": 100,\n  "set of unique chords": ["B", "F#", "G#", "C#", "E"]\n}'], ['{\n  "key signature": "d minor",\n  "number of measures": 12,\n  "BPM": 178,\n  "genre": "soundtrack",\n  "instrument": "timpani",\n  "time signature": "4/4",\n  "m

In [11]:
## Identify responses with errors / cannot load as JSON
idx = 0
error_dict = {}
good_text_list = []
good_keys = ['key signature', 'number of measures', 'BPM', 'genre', 'instrument', 'time signature', 'minimum velocity', 'maximum velocity', 'set of unique chords']
for response in text_list:
    try:
        temp_response = json.loads(response[0])
        temp_response_keys = list(temp_response.keys())
        if temp_response_keys == good_keys:
            good_text_list.append(json.loads(response[0]))
        else:
            raise Exception
    except:
        error_dict[idx] = response
    idx += 1

# print(error_dict)
error_idx_list = list(error_dict.keys())
print("Index of errors:", error_idx_list)
num_errors = len(error_dict.keys())
print("Number of errors:", num_errors)
gpt_response_error_rate = (len(error_dict.keys())/len(text_list))*100
print("Error rate:", gpt_response_error_rate,"%")
# print(error_dict.values())
# print(good_text_list)
num_good_responses = len(good_text_list)
print("Number of good responses:", num_good_responses)

Index of errors: [339, 948, 970, 1324, 1353, 1509, 1881, 2135, 2203, 2212]
Number of errors: 10
Error rate: 0.43459365493263796 %
Number of good responses: 2291


In [12]:
# Load responses into dataframe for analysis
df_response = pd.DataFrame.from_records(good_text_list)
print(df_response.shape)
df_response

(2291, 9)


Unnamed: 0,key signature,number of measures,BPM,genre,instrument,time signature,minimum velocity,maximum velocity,set of unique chords
0,C major,31,166,funky soul,string ensemble 1,6/8,58,127,"[B, D, G, F, C, E, A]"
1,E major,10,203,rock,timpani,4/4,56,100,"[G#, B, E, F#]"
2,B major,9,225,rock,timpani,4/4,100,100,"[B, F#, G#, C#, E]"
3,d minor,12,178,soundtrack,timpani,4/4,24,90,"[B', F, D, A]"
4,E minor,12,149,folk,piccolo,4/4,104,113,"[B, D, F#, G, E]"
...,...,...,...,...,...,...,...,...,...
2286,d minor,17,110,pop,tango accordion,4/4,95,95,"[D', Dm, F, C#, E, A]"
2287,A minor,8,80,cinematic,timpani,4/4,102,118,"[A, F#m7b5, Em7, Dmaj7, Dm7, Asus4, Am]"
2288,F major,14,150,dance,string ensemble 2,4/4,70,95,"[B, D, G, C, F, A]"
2289,B major,10,205,pop,choir aahs,4/4,72,75,"[B, F#]"


In [13]:
# Data grooming
df_response['key signature'] = df_response['key signature'].str.capitalize()
df_response['genre'] = df_response['genre'].apply(genre_split).str.replace('__', ' ')
df_response['genre'] = df_response['genre'].str.replace('_', ' ')
df_response['instrument'] = df_response['instrument'].str.replace('_', ' ')
df_response['instrument'] = df_response['instrument'].str.replace('-', ' ')
df_response['set of unique chords'] = df_response['set of unique chords'].astype("string")
df_response['set of unique chords'] = df_response['set of unique chords'].str.replace("'", '')
df_response['set of unique chords'] = df_response['set of unique chords'].str.replace('"', '')
df_response['set of unique chords'] = df_response['set of unique chords'].astype("string")

## Calculate Accuracy by column / feature

In [14]:
df_filtered = df.copy(deep=True)
print(df_filtered.shape)

(2301, 18)


In [15]:
print(df_filtered.shape)
df_filtered = df_filtered.drop(index=error_idx_list)
df_filtered = df_filtered.reset_index(drop=True)
print(df_filtered.shape)

(2301, 18)
(2291, 18)


In [16]:
df_filtered['audio_key'] = df_filtered['audio_key'].str.capitalize()
df_filtered['genre'] = df_filtered['genre'].apply(genre_split).str.replace('__', ' ')
df_filtered['genre'] = df_filtered['genre'].str.replace('_', ' ')
df_filtered['inst'] = df_filtered['inst'].str.replace('_', ' ')
df_filtered['inst'] = df_filtered['inst'].str.replace('-', ' ')
df_filtered['chord'] = df_filtered['chord'].astype("string")
df_filtered['chord'] = df_filtered['chord'].str.replace("'", '')
df_filtered['chord'] = df_filtered['chord'].str.replace('"', '')
df_filtered['chord'] = df_filtered['chord'].astype("string")

In [23]:
df_response['key signature'].to_numpy()

array(['C major', 'E major', 'B major', ..., 'F major', 'B major',
       'B minor'], dtype=object)

In [41]:
# acc_key_signature = sum(df_response['key signature'] == df_filtered['audio_key'])/num_requests
# acc_num_measures = sum(df_response['number of measures'] == df_filtered['num_measures'])/num_requests
# acc_bpm = sum(df_response['BPM'] == df_filtered['bpm'])/num_requests
# acc_genre = sum(df_response['genre'] == df_filtered['genre'])/num_requests
# acc_instrument = sum(df_response['instrument'] == df_filtered['inst'])/num_requests
# acc_time_signature = sum(df_response['time signature'] == df_filtered['time_signature'])/num_requests
# acc_min_velocity = sum(df_response['minimum velocity'] == df_filtered['min_velocity'])/num_requests
# acc_max_velocity = sum(df_response['maximum velocity'] == df_filtered['max_velocity'])/num_requests
# acc_unique_chords = sum(df_response['set of unique chords'] == df_filtered['chord'])/num_requests

acc_key_signature = accuracy_score(df_filtered['audio_key'].to_numpy(), df_response['key signature'].to_numpy())
acc_num_measures = accuracy_score(df_filtered['num_measures'].to_numpy(), df_response['number of measures'].to_numpy())
acc_bpm = accuracy_score(df_filtered['bpm'].to_numpy(), df_response['BPM'].to_numpy())
acc_genre = accuracy_score(df_filtered['genre'].to_numpy(), df_response['genre'].to_numpy())
acc_instrument = accuracy_score(df_filtered['inst'].to_numpy(), df_response['instrument'].to_numpy())
acc_time_signature = accuracy_score(df_filtered['time_signature'].to_numpy(), df_response['time signature'].to_numpy())
acc_min_velocity = accuracy_score(df_filtered['min_velocity'].to_numpy(), df_response['minimum velocity'].to_numpy())
acc_max_velocity = accuracy_score(df_filtered['max_velocity'].to_numpy(), df_response['maximum velocity'].to_numpy())
acc_unique_chords = accuracy_score(df_filtered['chord'].to_numpy(), df_response['set of unique chords'].to_numpy())

f1_key_signature = f1_score(df_filtered['audio_key'].to_numpy(), df_response['key signature'].to_numpy(), average='macro')
f1_num_measures = f1_score(df_filtered['num_measures'].to_numpy(), df_response['number of measures'].to_numpy(), average='macro')
f1_bpm = f1_score(df_filtered['bpm'].to_numpy(), df_response['BPM'].to_numpy(), average='macro')
f1_genre = f1_score(df_filtered['genre'].to_numpy(), df_response['genre'].to_numpy(), average='macro')
f1_instrument = f1_score(df_filtered['inst'].to_numpy(), df_response['instrument'].to_numpy(), average='macro')
f1_time_signature = f1_score(df_filtered['time_signature'].to_numpy(), df_response['time signature'].to_numpy(), average='macro')
f1_min_velocity = f1_score(df_filtered['min_velocity'].to_numpy(), df_response['minimum velocity'].to_numpy(), average='macro')
f1_max_velocity = f1_score(df_filtered['max_velocity'].to_numpy(), df_response['maximum velocity'].to_numpy(), average='macro')
f1_unique_chords = f1_score(df_filtered['chord'].to_numpy(), df_response['set of unique chords'].to_numpy(), average='macro')

precision_key_signature = precision_score(df_filtered['audio_key'].to_numpy(), df_response['key signature'].to_numpy(), average='macro')
precision_num_measures = precision_score(df_filtered['num_measures'].to_numpy(), df_response['number of measures'].to_numpy(), average='macro')
precision_bpm = precision_score(df_filtered['bpm'].to_numpy(), df_response['BPM'].to_numpy(), average='macro')
precision_genre = precision_score(df_filtered['genre'].to_numpy(), df_response['genre'].to_numpy(), average='macro')
precision_instrument = precision_score(df_filtered['inst'].to_numpy(), df_response['instrument'].to_numpy(), average='macro')
precision_time_signature = precision_score(df_filtered['time_signature'].to_numpy(), df_response['time signature'].to_numpy(), average='macro')
precision_min_velocity = precision_score(df_filtered['min_velocity'].to_numpy(), df_response['minimum velocity'].to_numpy(), average='macro')
precision_max_velocity = precision_score(df_filtered['max_velocity'].to_numpy(), df_response['maximum velocity'].to_numpy(), average='macro')
precision_unique_chords = precision_score(df_filtered['chord'].to_numpy(), df_response['set of unique chords'].to_numpy(), average='macro')

recall_key_signature = recall_score(df_filtered['audio_key'].to_numpy(), df_response['key signature'].to_numpy(), average='macro')
recall_num_measures = recall_score(df_filtered['num_measures'].to_numpy(), df_response['number of measures'].to_numpy(), average='macro')
recall_bpm = recall_score(df_filtered['bpm'].to_numpy(), df_response['BPM'].to_numpy(), average='macro')
recall_genre = recall_score(df_filtered['genre'].to_numpy(), df_response['genre'].to_numpy(), average='macro')
recall_instrument = recall_score(df_filtered['inst'].to_numpy(), df_response['instrument'].to_numpy(), average='macro')
recall_time_signature = recall_score(df_filtered['time_signature'].to_numpy(), df_response['time signature'].to_numpy(), average='macro')
recall_min_velocity = recall_score(df_filtered['min_velocity'].to_numpy(), df_response['minimum velocity'].to_numpy(), average='macro')
recall_max_velocity = recall_score(df_filtered['max_velocity'].to_numpy(), df_response['maximum velocity'].to_numpy(), average='macro')
recall_unique_chords = recall_score(df_filtered['chord'].to_numpy(), df_response['set of unique chords'].to_numpy(), average='macro')


eval_dict = {'Accuracy':[acc_key_signature, acc_num_measures, acc_bpm, acc_genre, acc_instrument, acc_time_signature, acc_min_velocity, acc_max_velocity, acc_unique_chords],
             'F1':[f1_key_signature, f1_num_measures, f1_bpm, f1_genre, f1_instrument, f1_time_signature, f1_min_velocity, f1_max_velocity, f1_unique_chords],
             'Precision':[precision_key_signature, precision_num_measures, precision_bpm, precision_genre, precision_instrument, precision_time_signature, precision_min_velocity, precision_max_velocity, precision_unique_chords],
             'Recall':[recall_key_signature, recall_num_measures, recall_bpm, recall_genre, recall_instrument, recall_time_signature, recall_min_velocity, recall_max_velocity, recall_unique_chords],}
eval_cols = ['audio_key', 'num_measures', 'bpm', 'genre', 'inst', 'time_signature', 'min_velocity','max_velocity','chord']

df_eval_results = pd.DataFrame.from_dict(eval_dict, orient='index', columns=eval_cols)
df_eval_results

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,audio_key,num_measures,bpm,genre,inst,time_signature,min_velocity,max_velocity,chord
Accuracy,0.903972,0.98254,0.998691,0.788739,0.855958,1.0,0.957224,0.953732,0.973811
F1,0.439802,0.990548,0.998218,0.190048,0.262753,1.0,0.979742,0.969315,0.943311
Precision,0.464445,0.994396,0.999916,0.238253,0.284722,1.0,0.992222,0.988063,0.946314
Recall,0.440204,0.987305,0.997239,0.171356,0.253763,1.0,0.976382,0.956294,0.941871
