# LLM Multi-Task Experimentation

## Setup

In [2]:
import os
from io import StringIO
import boto3
import pandas as pd
import numpy as np

import torch
from datasets import DatasetDict, Dataset
from grouphug.dataset_collection import DatasetCollection
from grouphug import (
    AutoMultiTaskModel, 
    ClassificationHeadConfig, 
    DatasetFormatter, 
    LMHeadConfig, 
)
from transformers import AutoTokenizer

from sklearn.utils import shuffle
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score, 
    precision_score,
    confusion_matrix,
    classification_report
)

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Set random seed
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7f61fc27b190>

In [4]:
# CUDA
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    devices = torch.cuda.device_count()
    gpu = torch.cuda.get_device_name()
    device = torch.device("cuda")
    print(f"Devices: {devices}")
else:
    print("No GPUs available.")
    device = torch.device("cpu")

Devices: 1


## Dataset Preparation

### S3 Data Load

In [5]:
access_key_id = os.environ.get("S3_KEY_ID")
secret_access_key = os.environ.get("S3_ACCESS_KEY")


s3 = boto3.client("s3",
    region_name="us-west-2", 
    aws_access_key_id=access_key_id, 
    aws_secret_access_key=secret_access_key,
    )

In [6]:
bucket_name = "orchestrate-bucket"
objects = s3.list_objects_v2(Bucket=bucket_name)
files = objects.get("Contents")

In [7]:
# file = "master_midi_meta_final.csv" # UPDATE FILE NAME
file = "master_midi_meta_final_inst_sampled.csv"

object = s3.get_object(Bucket=bucket_name, Key=file)
data = object["Body"].read().decode("utf-8")

df_orig = pd.read_csv(StringIO(data))

print(f"DataFrame size: {len(df_orig)}")
df_orig.head()

DataFrame size: 15340


Unnamed: 0,audio_key,chord_progressions,pitch_range,num_measures,bpm,genre,track_role,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,track_roll,unique_chord_n_note,text,inst_group
0,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid low,8,80,new age,accompaniment,acoustic piano,standard,4/4,23,30,train,commu00002,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose a tranquil and soothing New Age piano ...,0
1,a minor,"[['Am', 'Am', 'Am', 'Am', 'Em', 'Em', 'Em', 'E...",mid low,4,60,cinematic,pad,acoustic piano,standard,4/4,21,22,train,commu00005,,"['Em', 'F', 'Am']","[""Let's create some cinematic magic! Set your ...",0
2,a minor,"[['Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'A...",mid low,8,120,cinematic,pad,acoustic piano,standard,4/4,95,96,train,commu00016,,"['C', 'F', 'G', 'Am']","[""Create a 8-measure piece in the cinematic ge...",0
3,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid high,8,80,new age,main melody,acoustic piano,standard,4/4,23,30,train,commu00024,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose an experimental new age piece in C maj...,0
4,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm7...",low,8,50,new age,bass,acoustic piano,standard,4/4,71,72,train,commu00025,,"['Dm7', 'A#', 'C']","[""Let's groove in the C major key with a low p...",0


In [8]:
# Shuffle dataset
df = shuffle(df_orig)
df.reset_index(drop=True, inplace=True)

### Data Preprocessing

In [9]:
def remove_char(text):
    if text.startswith('["') or text.startswith("['"):
        text = text[2:]
    if text.endswith('"]') or text.endswith("']"):
        text = text[:-2]
    return text

In [10]:
df['audio_key'] = df['audio_key'].str.replace(' ', '')
df['pitch_range'] = df['pitch_range'].str.replace(' ', '_')
df['genre'] = df['genre'].str.replace(' ', '_')
df['inst'] = df['inst'].str.replace(' ', '_')
df['track_role'] = df['track_role'].str.replace(' ', '_')
df['text'] = df['text'].apply(remove_char)
df = df.rename(columns={'unique_chord_n_note': 'chord'})

In [11]:
def dataset_process(dataset, split, feature):
    process_dataset = Dataset.from_pandas(dataset)
    process_dataset = DatasetDict({split: process_dataset})

    excl_columns = ["text", feature]
    columns = [col for col in process_dataset[split].column_names if col not in excl_columns]

    process_dataset[split] = process_dataset[split].remove_columns(columns)
    process_dataset[split] = process_dataset[split].rename_column(feature, f"label_{feature}")

    return process_dataset

In [12]:
# ADD COLUMNS TO EXCLUDE
excl_col = [
    "chord_progressions", 
    "split_data", 
    "id", 
    "track_roll", 
    "pitch_range", 
    "track_role", 
    "sample_rhythm", 
    "time_signature",
    "track_role",
    "inst_group"
]

for col in df.columns:
    if col in excl_col:
        df = df.drop(col, axis=1)

In [13]:
dataset_dict = {}
col_list = []

for col in df.columns:
    if col != "text":
        dataset = dataset_process(df, "train", col)
        dataset_dict[col] = dataset

        col_list.append(df[col].name)
    else:
        # Save separate "text" column for use in Evaluation
        inputs = pd.DataFrame(df["text"])

dataset_dict

{'audio_key': DatasetDict({
     train: Dataset({
         features: ['label_audio_key', 'text'],
         num_rows: 15340
     })
 }),
 'num_measures': DatasetDict({
     train: Dataset({
         features: ['label_num_measures', 'text'],
         num_rows: 15340
     })
 }),
 'bpm': DatasetDict({
     train: Dataset({
         features: ['label_bpm', 'text'],
         num_rows: 15340
     })
 }),
 'genre': DatasetDict({
     train: Dataset({
         features: ['label_genre', 'text'],
         num_rows: 15340
     })
 }),
 'inst': DatasetDict({
     train: Dataset({
         features: ['label_inst', 'text'],
         num_rows: 15340
     })
 }),
 'min_velocity': DatasetDict({
     train: Dataset({
         features: ['label_min_velocity', 'text'],
         num_rows: 15340
     })
 }),
 'max_velocity': DatasetDict({
     train: Dataset({
         features: ['label_max_velocity', 'text'],
         num_rows: 15340
     })
 }),
 'chord': DatasetDict({
     train: Dataset({
         featu

### Encoding & Tokenization

In [14]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

#### Data Encoding

In [15]:
formatter = (
    DatasetFormatter().tokenize("text", truncation=True, padding=True)
    .encode("label_audio_key")
    # .encode("label_pitch_range")
    .encode("label_num_measures")
    .encode("label_bpm")
    .encode("label_genre")
    # .encode("label_track_role")
    .encode("label_inst")
    # .encode("label_sample_rhythm")
    # .encode("label_time_signature")
    .encode("label_min_velocity")
    .encode("label_max_velocity")
    .encode("label_chord")
)

data = formatter.apply(
        {
            "audio_key": dataset_dict["audio_key"], 
            # "pitch_range": dataset_dict["pitch_range"],
            "num_measures": dataset_dict["num_measures"],
            "bpm": dataset_dict["bpm"],
            "genre": dataset_dict["genre"],            
            # "track_role": dataset_dict["track_role"],
            "inst": dataset_dict["inst"],
            # "sample_rhythm": dataset_dict["sample_rhythm"],
            # "time_signature": dataset_dict["time_signature"],
            "min_velocity": dataset_dict["min_velocity"],
            "max_velocity": dataset_dict["max_velocity"],
            "chord": dataset_dict["chord"],
        }, 
    tokenizer=tokenizer
)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6264.28 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6653.22 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6727.15 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6767.24 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6758.22 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6772.55 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6752.38 examples/s]
Map: 100%|███

#### Encoding Key Creation

Used to convert encoded labels and predicted labels back to original values.

In [17]:
encoding_key = {}

for col in col_list:
    col_key = {}
    label_name = f"label_{col}"
    for key, value in zip(data[col]["train"][label_name], dataset_dict[col]["train"][label_name]):
        col_key[key] = value
        encoding_key[col] = col_key

### Create Test Dataset

Test dataset must be created after encoding is performed. Otherwise, encoding will not capture all possible values.

In [18]:
# Combine original data with encoded data in a list
master_data = [dataset_dict, data]

In [19]:
total_size = len(df_orig) # Length of original dataset
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)

In [20]:
data_dict = {}
master_split_data = []

for dataset in master_data:
    for col in col_list:
        data_train = dataset[col]["train"].select(i for i in range(train_size))
        data_val = dataset[col]["train"].select(i for i in range(train_size, train_size + val_size))
        data_test = dataset[col]["train"].select(i for i in range(train_size + val_size, total_size))

        split_data = DatasetDict({
            "train": data_train,
            "validation": data_val,
            "test": data_test
        })

        data_dict[col] = split_data

    final_dataset = DatasetCollection(data_dict)
    master_split_data.append(final_dataset)

orig_data = master_split_data[0]
encode_data = master_split_data[1]



In [21]:
# Use same logic to build test set on text inputs to prepare for evaluation
test_inputs = inputs["text"][train_size + val_size:total_size]

## Model Experimentation

Based on this paper: https://arxiv.org/pdf/1905.05583.pdf

Code is here: https://github.com/chatdesk/grouphug/blob/master/examples/from-readme.ipynb

### Model Configuration

In [22]:
head_configs = (
    [
        LMHeadConfig(weight=0.1),
        ClassificationHeadConfig.from_data(data, "label_audio_key", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_pitch_range", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_num_measures", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_bpm", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_genre", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_track_role", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_inst", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_sample_rhythm", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_time_signature", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_min_velocity", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_max_velocity", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_chord", classifier_hidden_size=20, weight=1),
    ]
)

### Experimentation

In [24]:
# Load model
model_load = 'bert-base-uncased' # Or other checkpoint model

model = AutoMultiTaskModel.from_pretrained(model_load, head_configs, formatter=formatter, tokenizer=tokenizer)
# model.to(device)

Some weights of BertMultiTaskModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['other_heads.label_bpm.head.1.bias', 'other_heads.label_bpm.head.1.weight', 'other_heads.label_audio_key.head.4.weight', 'other_heads.label_chord.head.4.bias', 'other_heads.label_genre.head.1.weight', 'other_heads.label_inst.head.4.bias', 'other_heads.label_max_velocity.head.1.weight', 'other_heads.label_max_velocity.head.4.bias', 'other_heads.label_num_measures.head.1.bias', 'other_heads.label_min_velocity.head.4.bias', 'other_heads.label_num_measures.head.1.weight', 'other_heads.label_audio_key.head.4.bias', 'other_heads.label_inst.head.1.bias', 'other_heads.label_num_measures.head.4.weight', 'other_heads.label_num_measures.head.4.bias', 'other_heads.label_bpm.head.4.weight', 'other_heads.label_chord.head.1.weight', 'other_heads.label_max_velocity.head.1.bias', 'other_heads.label_bpm.head.4.bias', 'other_heads.label_genre.head.1.bias', 'other_heads.label_

In [33]:
df_test = pd.DataFrame()

# Add test inputs to testing dictionary
df_test["text"] = test_inputs

for col in col_list:
    df_test[f"orig_label_{col}"] = master_split_data[0][col]["test"][f"label_{col}"]
    df_test[f"enc_label_{col}"] = master_split_data[1][col]["test"][f"label_{col}"]

In [26]:
results = model.predict(pd.DataFrame(df_test["text"]))

In [27]:
df_results = pd.DataFrame()

for col in col_list:
    col_check = f"label_{col}_predicted_id"
    for predict_col in results.columns:
        if predict_col.startswith(col_check):
            df_results[predict_col] = results[predict_col]

df_eval = pd.concat([df_test, df_results], axis=1)

In [28]:
f1_dict = {}
compare_dict = {}
report_dict = {}

for col in col_list:
    encoded = f"enc_label_{col}"
    predicted = f"label_{col}_predicted_id"
    
    accuracy = accuracy_score(df_eval[encoded], df_eval[predicted])
    f1 = f1_score(df_eval[encoded], df_eval[predicted], average="macro")
    precision = precision_score(df_eval[encoded], df_eval[predicted], average="macro")
    recall = recall_score(df_eval[encoded], df_eval[predicted], average="macro")

    compare_dict[col] = {
        "Accuracy": accuracy,
        "F1": f1,
        "Precision": precision,
        "Recall": recall,
    }

    f1_dict[col] = f1
    
    print(f"{col} Classification Report")
    print(classification_report(df_eval[encoded], df_eval[predicted]))

df_final = pd.DataFrame(compare_dict)

audio_key Classification Report
              precision    recall  f1-score   support

           0       0.07      0.02      0.04       172
           1       0.00      0.00      0.00       242
           2       0.07      0.01      0.01       126
           3       0.00      0.00      0.00        92
           5       0.00      0.00      0.00         1
           6       0.18      0.02      0.04       408
           7       0.05      0.25      0.08        83
           8       0.00      0.00      0.00       175
           9       0.00      0.00      0.00       137
          10       0.00      0.00      0.00       165
          11       0.00      0.00      0.00       153
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00       169
          15       0.05      0.84      0.09        92
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00       182
          18       0.03      0.01      0.01      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [29]:
df_final

Unnamed: 0,audio_key,num_measures,bpm,genre,inst,min_velocity,max_velocity,chord
Accuracy,0.049544,0.003911,0.003911,0.0,0.049109,0.006953,0.007823,0.0004345937
F1,0.016029,0.000952,0.000134,0.0,0.013694,0.001361,0.000849,1.83122e-06
Precision,0.026071,0.000587,0.000357,0.0,0.010725,0.008922,0.001864,9.167735e-07
Recall,0.067969,0.010264,0.003704,0.0,0.036745,0.009353,0.009684,0.0007215007


In [26]:
keys_list = []

for key in encoding_key:
    key_dict = encoding_key[key]
    
    flatten_dict = {"key": list(key_dict.keys()), "value": list(key_dict.values())}
    
    # index = len(encoding_key[key])
    df_key = pd.DataFrame(flatten_dict)
    
    keys_list.append(df_key)

#### F1 Score by Class

In [90]:
f1_dict

{'audio_key': array([0.79532164, 0.98964803, 0.96850394, 0.98360656, 0.        ,
        0.99266504, 1.        , 0.98863636, 1.        , 0.96676737,
        0.99342105, 0.        , 0.83483483, 0.96808511, 0.        ,
        0.98913043, 0.97536946]),
 'num_measures': array([0.49315068, 0.07142857, 0.        , 0.        , 0.        ,
        0.        , 0.97354497, 0.49419055, 0.        , 0.        ,
        0.        , 0.70588235, 0.        , 0.38785047, 0.        ,
        0.95967742, 0.        , 0.        , 0.        , 0.37630662,
        0.        , 0.        , 0.        , 0.        , 0.43243243,
        0.5106383 , 0.4       , 0.42105263, 0.        , 0.04166667,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33333333, 0.        ,
        0.        , 0.        , 0.        , 0.       

#### Confusion Matrix

Modify columns so that categories with large numbers of classes are removed.

In [60]:
def plot_confusion_matrix(confusion, class_names, column):
    plt.figure(figsize=(7, 5))

    sns.set(font_scale=1.2)
    sns.heatmap(confusion, annot=True, fmt='.2%', cmap='Blues', cbar=False, xticklabels=class_names, yticklabels=class_names)
    
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix (%): {column}')
    
    plt.show()

In [None]:
for col in col_list:
    original = f"orig_label_{col}"
    encoded = f"enc_label_{col}"
    predicted = f"label_{col}_predicted_id"

    confusion = confusion_matrix(df_eval[encoded], df_eval[predicted])

    zero_row_sums = np.where(confusion.sum(axis=1) == 0)
    confusion[zero_row_sums] = 1

    conf_matrix_perc = confusion.astype('float') / confusion.sum(axis=1)[:, np.newaxis]

    class_names = list(df_test[original].unique())
    plot_confusion_matrix(conf_matrix_perc, class_names, col)

## Inference

In [None]:
results = model.predict({"text":"I love playing to jazz music at 4/4 time signature. Can you give me a piece of music \
                        that is 1-127 velocity with bass? The pitch should be mid with riff. I also want 125 bpm in 8 measures \
                        in a minor key and chords of ['C','A','B']"})