# LLM Multi-Task Fine-Tuning

## Setup

In [None]:
!pip install -r requirements.txt

In [1]:
import os
from io import StringIO
import boto3
import pandas as pd
import evaluate
import numpy as np

import torch
from datasets import DatasetDict, Dataset
from grouphug.config import IGNORE_INDEX
from grouphug.dataset_collection import DatasetCollection
from grouphug import (
    AutoMultiTaskModel, 
    ClassificationHeadConfig, 
    ClassificationHead,
    DatasetFormatter, 
    LMHeadConfig, 
    MultiTaskTrainer,
)
from transformers import (
    AutoTokenizer, 
    TrainingArguments,
)

from sklearn.utils import shuffle
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score, 
    precision_score,
    confusion_matrix
)

import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set random seed
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7fa6cfe91190>

In [3]:
# CUDA
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    devices = torch.cuda.device_count()
    gpu = torch.cuda.get_device_name()
    device = torch.device("cuda")
    print(f"Devices: {devices}")
else:
    print("No GPUs available.")
    device = torch.device("cpu")

Devices: 1


## Dataset Preparation

### S3 Data Load

In [4]:
access_key_id = os.environ.get("S3_KEY_ID")
secret_access_key = os.environ.get("S3_ACCESS_KEY")


s3 = boto3.client("s3",
    region_name="us-west-2", 
    aws_access_key_id=access_key_id, 
    aws_secret_access_key=secret_access_key,
    )

In [5]:
bucket_name = "orchestrate-bucket"
objects = s3.list_objects_v2(Bucket=bucket_name)
files = objects.get("Contents")

In [6]:
file = "master_midi_meta_final.csv" # UPDATE FILE NAME

object = s3.get_object(Bucket=bucket_name, Key=file)
data = object["Body"].read().decode("utf-8")

df_orig = pd.read_csv(StringIO(data))

print(f"DataFrame size: {len(df_orig)}")
df_orig.head()

DataFrame size: 15340


Unnamed: 0,audio_key,chord_progressions,pitch_range,num_measures,bpm,genre,track_role,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,track_roll,unique_chord_n_note,text,inst_group
0,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid low,8,80,new age,accompaniment,acoustic piano,standard,4/4,23,30,train,commu00002,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose a tranquil and soothing New Age piano ...,0
1,a minor,"[['Am', 'Am', 'Am', 'Am', 'Em', 'Em', 'Em', 'E...",mid low,4,60,cinematic,pad,acoustic piano,standard,4/4,21,22,train,commu00005,,"['Em', 'F', 'Am']","[""Let's create some cinematic magic! Set your ...",0
2,a minor,"[['Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'A...",mid low,8,120,cinematic,pad,acoustic piano,standard,4/4,95,96,train,commu00016,,"['C', 'F', 'G', 'Am']","[""Create a 8-measure piece in the cinematic ge...",0
3,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid high,8,80,new age,main melody,acoustic piano,standard,4/4,23,30,train,commu00024,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose an experimental new age piece in C maj...,0
4,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm7...",low,8,50,new age,bass,acoustic piano,standard,4/4,71,72,train,commu00025,,"['Dm7', 'A#', 'C']","[""Let's groove in the C major key with a low p...",0


In [7]:
# Shuffle dataset
df = shuffle(df_orig)
df.reset_index(drop=True, inplace=True)

### Data Preprocessing

In [8]:
def remove_char(text):
    if text.startswith('["') or text.startswith("['"):
        text = text[2:]
    if text.endswith('"]') or text.endswith("']"):
        text = text[:-2]
    return text

In [48]:
df['audio_key'] = df['audio_key'].str.replace(' ', '')
df['pitch_range'] = df['pitch_range'].str.replace(' ', '_')
df['genre'] = df['genre'].str.replace(' ', '_')
df['inst'] = df['inst'].str.replace(' ', '_')
df['track_role'] = df['track_role'].str.replace(' ', '_')
df['text'] = df['text'].apply(remove_char)
df = df.rename(columns={'unique_chord_n_note': 'chord'})

In [10]:
def dataset_process(dataset, split, feature):
    process_dataset = Dataset.from_pandas(dataset)
    process_dataset = DatasetDict({split: process_dataset})

    excl_columns = ["text", feature]
    columns = [col for col in process_dataset[split].column_names if col not in excl_columns]

    process_dataset[split] = process_dataset[split].remove_columns(columns)
    process_dataset[split] = process_dataset[split].rename_column(feature, f"label_{feature}")

    return process_dataset

In [11]:
# ADD COLUMNS TO EXCLUDE
excl_col = [
    "chord_progressions", 
    "split_data", 
    "id", 
    "track_roll", 
    "pitch_range", 
    "track_role", 
    "sample_rhythm", 
    "time_signature",
    "track_role",
    "inst_group"
]

for col in df.columns:
    if col in excl_col:
        df = df.drop(col, axis=1)

In [12]:
dataset_dict = {}
col_list = []

for col in df.columns:
    if col != "text":
        dataset = dataset_process(df, "train", col)
        dataset_dict[col] = dataset

        col_list.append(df[col].name)
    else:
        # Save separate "text" column for use in Evaluation
        inputs = pd.DataFrame(df["text"])

dataset_dict

{'audio_key': DatasetDict({
     train: Dataset({
         features: ['label_audio_key', 'text'],
         num_rows: 15340
     })
 }),
 'num_measures': DatasetDict({
     train: Dataset({
         features: ['label_num_measures', 'text'],
         num_rows: 15340
     })
 }),
 'bpm': DatasetDict({
     train: Dataset({
         features: ['label_bpm', 'text'],
         num_rows: 15340
     })
 }),
 'genre': DatasetDict({
     train: Dataset({
         features: ['label_genre', 'text'],
         num_rows: 15340
     })
 }),
 'inst': DatasetDict({
     train: Dataset({
         features: ['label_inst', 'text'],
         num_rows: 15340
     })
 }),
 'min_velocity': DatasetDict({
     train: Dataset({
         features: ['label_min_velocity', 'text'],
         num_rows: 15340
     })
 }),
 'max_velocity': DatasetDict({
     train: Dataset({
         features: ['label_max_velocity', 'text'],
         num_rows: 15340
     })
 }),
 'chord': DatasetDict({
     train: Dataset({
         featu

### Encoding & Tokenization

In [13]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# max_seq_len = 128

#### Data Encoding

In [14]:
formatter = (
    DatasetFormatter().tokenize("text", truncation=True, padding=True)
    .encode("label_audio_key")
    # .encode("label_pitch_range")
    .encode("label_num_measures")
    .encode("label_bpm")
    .encode("label_genre")
    # .encode("label_track_role")
    .encode("label_inst")
    # .encode("label_sample_rhythm")
    # .encode("label_time_signature")
    .encode("label_min_velocity")
    .encode("label_max_velocity")
    .encode("label_chord")
)

data = formatter.apply(
        {
            "audio_key": dataset_dict["audio_key"], 
            # "pitch_range": dataset_dict["pitch_range"],
            "num_measures": dataset_dict["num_measures"],
            "bpm": dataset_dict["bpm"],
            "genre": dataset_dict["genre"],            
            # "track_role": dataset_dict["track_role"],
            "inst": dataset_dict["inst"],
            # "sample_rhythm": dataset_dict["sample_rhythm"],
            # "time_signature": dataset_dict["time_signature"],
            "min_velocity": dataset_dict["min_velocity"],
            "max_velocity": dataset_dict["max_velocity"],
            "chord": dataset_dict["chord"],
        }, 
    tokenizer=tokenizer
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 5770.46 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6257.83 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6306.52 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6199.78 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6365.80 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00, 6293.70 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:02<00:00

#### Encoding Key Creation

Used to convert encoded labels and predicted labels back to original values.

In [15]:
encoding_key = {}

for col in col_list:
    col_key = {}
    label_name = f"label_{col}"
    for key, value in zip(data[col]["train"][label_name], dataset_dict[col]["train"][label_name]):
        col_key[key] = value
        encoding_key[col] = col_key

### Create Test Dataset

Test dataset must be created after encoding is performed. Otherwise, encoding will not capture all possible values.

In [16]:
# Combine original data with encoded data in a list
master_data = [dataset_dict, data]

In [17]:
total_size = len(df_orig) # Length of original dataset
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)

In [18]:
data_dict = {}
master_split_data = []

for dataset in master_data:
    for col in col_list:
        data_train = dataset[col]["train"].select(i for i in range(train_size))
        data_val = dataset[col]["train"].select(i for i in range(train_size, train_size + val_size))
        data_test = dataset[col]["train"].select(i for i in range(train_size + val_size, total_size))

        split_data = DatasetDict({
            "train": data_train,
            "validation": data_val,
            "test": data_test
        })

        data_dict[col] = split_data

    final_dataset = DatasetCollection(data_dict)
    master_split_data.append(final_dataset)

orig_data = master_split_data[0]
encode_data = master_split_data[1]



In [19]:
# Use same logic to build test set on text inputs to prepare for evaluation
test_inputs = inputs["text"][train_size + val_size:total_size]

## Modeling & Experimentation

Based on this paper: https://arxiv.org/pdf/1905.05583.pdf

Library Example is here: https://github.com/chatdesk/grouphug/blob/master/examples/from-readme.ipynb

### Model Configuration

In [20]:
head_configs = (
    [
        LMHeadConfig(weight=0.1),
        ClassificationHeadConfig.from_data(data, "label_audio_key", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_pitch_range", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_num_measures", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_bpm", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_genre", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_track_role", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_inst", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_sample_rhythm", classifier_hidden_size=20, weight=1),
        # ClassificationHeadConfig.from_data(data, "label_time_signature", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_min_velocity", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_max_velocity", classifier_hidden_size=20, weight=1),
        ClassificationHeadConfig.from_data(data, "label_chord", classifier_hidden_size=20, weight=1),
    ]
)

In [21]:
model = AutoMultiTaskModel.from_pretrained(model_name, head_configs, formatter=formatter, tokenizer=tokenizer)
model.to(device)

Some weights of BertMultiTaskModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['other_heads.label_genre.head.1.bias', 'other_heads.label_inst.head.1.bias', 'other_heads.label_max_velocity.head.4.weight', 'other_heads.label_chord.head.4.bias', 'other_heads.label_audio_key.head.4.bias', 'other_heads.label_min_velocity.head.4.bias', 'other_heads.label_audio_key.head.1.weight', 'other_heads.label_chord.head.4.weight', 'other_heads.label_bpm.head.1.weight', 'other_heads.label_min_velocity.head.4.weight', 'other_heads.label_max_velocity.head.4.bias', 'other_heads.label_inst.head.1.weight', 'other_heads.label_inst.head.4.bias', 'other_heads.label_audio_key.head.4.weight', 'other_heads.label_num_measures.head.4.bias', 'other_heads.label_min_velocity.head.1.weight', 'other_heads.label_max_velocity.head.1.bias', 'other_heads.label_chord.head.1.bias', 'other_heads.label_genre.head.4.weight', 'other_heads.label_audio_key.head.1.bias', 'other_head

### Training & Evaluation

In [24]:
output_dir = "./models"

ideal_steps = 25000

batch_size = 32
epochs = round(ideal_steps / (train_size / batch_size))
lr = 1e-4
weight_decay = 1e-3
eval_steps = 2500
save_steps = 5000
warmup_steps = ideal_steps * 0.05
save_limit = 5

In [25]:
training_arg = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    warmup_steps=warmup_steps,
    save_steps=save_steps,
    save_total_limit=save_limit
)

In [26]:
# Taken straight from GroupHug utils

# metrics = {k: evaluate.load(k) for k in ["accuracy", "f1", "recall", "precision", "matthews_correlation"]}
metrics = {k: evaluate.load(k) for k in ["accuracy", "f1"]}

def compute_classification_metrics(eval_preds, dataset_name, heads):
    all_logits, all_labels = eval_preds
    if not isinstance(all_logits, tuple):
        all_logits = (all_logits,)
        all_labels = (all_labels,)
    results = {}

    for logits, labels, hc in zip(all_logits, all_labels, heads):
        labels_1d = labels.ravel()
        mask = labels_1d != hc.ignore_index
        labels_1d = labels_1d[mask]
        if hc.problem_type == ClassificationHead.MULTI:
            predictions = logits > 0
            predictions_1d = predictions.ravel()[mask]
            exact_match = ((predictions == labels) | (labels == IGNORE_INDEX)).all(axis=-1)
            results[f"{hc.name}_subset_accuracy"] = exact_match.sum() / len(exact_match)
        else:
            predictions_1d = np.argmax(logits, axis=-1).ravel()[mask]
        for k, f in metrics.items():
            try:
                kwargs = {"average": "weighted"} if k in ["f1", "recall", "precision"] else {}
                for mk, mv in f.compute(predictions=predictions_1d, references=labels_1d, **kwargs).items():
                    results[f"{hc.name}_{mk}"] = mv
            except Exception as e:
                print(f"metric {k} on dataset {dataset_name} head {hc.name} failed: {e}")
    return results

In [27]:
trainer = MultiTaskTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arg,
    train_data=encode_data[:, "train"],
    # eval_data=encode_data[:, "validation"],
    eval_data=encode_data[["inst"], "validation"], # Update for evaluation on single column
    # eval_heads={col: [f"label_{col}"] for col in col_list}, # Use this for all columns (takes a LONG time)
    eval_heads={"inst": ["label_inst"]}, # Update for evaluation on single column
    compute_metrics=compute_classification_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mhsiungc[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Label Inst Accuracy,Label Inst F1
2500,4.6429,3.172759,0.20339,0.079111
5000,4.4134,2.688296,0.692742,0.602873
7500,4.1341,2.332824,0.735767,0.65653
10000,4.1749,2.042091,0.800522,0.744059
12500,3.9738,1.793485,0.832681,0.781334
15000,4.0235,1.605153,0.837897,0.79078
17500,3.869,1.434526,0.860495,0.813169
20000,3.7448,1.292288,0.862668,0.816573
22500,3.7478,1.182613,0.926119,0.904688
25000,3.5776,1.058163,0.931334,0.909693




TrainOutput(global_step=201600, training_loss=2.9375342848944284, metrics={'train_runtime': 116580.3881, 'train_samples_per_second': 55.265, 'train_steps_per_second': 1.729, 'total_flos': 1.0581876399966295e+18, 'train_loss': 2.9375342848944284, 'epoch': 75.0})

In [82]:
# Save the model locally
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

### Evaluation

In [30]:
df_test = pd.DataFrame()

# Add test inputs to testing dictionary
df_test["text"] = test_inputs

for col in col_list:
    df_test[f"orig_label_{col}"] = master_split_data[0][col]["test"][f"label_{col}"]
    df_test[f"enc_label_{col}"] = master_split_data[1][col]["test"][f"label_{col}"]

In [31]:
results = model.predict(pd.DataFrame(df_test["text"]))

In [54]:
df_results = pd.DataFrame()

for col in col_list:
    col_check = f"label_{col}_predicted_id"
    for predict_col in results.columns:
        if predict_col.startswith(col_check):
            df_results[predict_col] = results[predict_col]

df_eval = pd.concat([df_test, df_results], axis=1)

In [70]:
f1_dict = {}
compare_dict = {}

for col in col_list:
    encoded = f"enc_label_{col}"
    predicted = f"label_{col}_predicted_id"
    
    accuracy = accuracy_score(df_eval[encoded], df_eval[predicted])
    f1 = f1_score(df_eval[encoded], df_eval[predicted], average=None)
    weight_f1 = f1_score(df_eval[encoded], df_eval[predicted], average="weighted")
    micro_f1 = f1_score(df_eval[encoded], df_eval[predicted], average="micro")
    macro_f1 = f1_score(df_eval[encoded], df_eval[predicted], average="macro")
    
    precision = precision_score(df_eval[encoded], df_eval[predicted], average=None)
    micro_precision = precision_score(df_eval[encoded], df_eval[predicted], average="micro")
    macro_precision = precision_score(df_eval[encoded], df_eval[predicted], average="macro")

    recall = recall_score(df_eval[encoded], df_eval[predicted], average=None)
    micro_recall = recall_score(df_eval[encoded], df_eval[predicted], average="micro")
    macro_recall = recall_score(df_eval[encoded], df_eval[predicted], average="macro")

    compare_dict[col] = {
        "Accuracy": accuracy,
        "F1": f1,
        "WeightedF1": weight_f1,
        "MicroF1": micro_f1,
        "MacroF1": macro_f1,
        "MicroPrecision": micro_precision,
        "MacroPrecision": macro_precision,
        "MicroRecall": micro_recall,
        "MacroRecall": macro_recall,
    }

    f1_dict[col] = f1

df_final = pd.DataFrame(compare_dict)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [59]:
df_final

Unnamed: 0,audio_key,num_measures,bpm,genre,inst,min_velocity,max_velocity,chord
Accuracy,0.921773,0.360713,0.151239,0.790091,0.98957,0.435897,0.40678,0.025641
F1,"[0.7080103359173127, 0.9896049896049897, 0.960...","[0.5179856115107914, 0.27272727272727276, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.6666666666666666, ...","[0.0, 0.0, 0.15384615384615385, 0.769230769230...","[0.9887640449438202, 0.991869918699187, 1.0, 1...","[0.7027027027027027, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.33333333333333337, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
WeightedF1,0.920343,0.247096,0.067258,0.74728,0.988874,0.352791,0.313386,0.002072
MicroF1,0.921773,0.360713,0.151239,0.790091,0.98957,0.435897,0.40678,0.025641
MacroF1,0.755542,0.105382,0.024471,0.356605,0.942252,0.172793,0.077532,0.000058
MicroPrecision,0.921773,0.360713,0.151239,0.790091,0.98957,0.435897,0.40678,0.025641
MacroPrecision,0.758364,0.101479,0.025132,0.354288,0.940579,0.183608,0.069664,0.00003
MicroRecall,0.921773,0.360713,0.151239,0.790091,0.98957,0.435897,0.40678,0.025641
MacroRecall,0.756145,0.145756,0.05853,0.431729,0.946678,0.235521,0.132967,0.000712


#### F1 Score by Class

In [71]:
f1_dict

{'audio_key': array([0.70801034, 0.98960499, 0.9609375 , 0.90625   , 0.        ,
        0.9902439 , 1.        , 0.98575499, 0.99275362, 0.90625   ,
        0.94915254, 0.        , 0.62837838, 0.9039548 , 0.        ,
        0.95263158, 0.97029703]),
 'num_measures': array([0.51798561, 0.27272727, 0.        , 0.        , 0.        ,
        0.        , 0.94736842, 0.49039201, 0.03773585, 0.00921659,
        0.        , 0.6031746 , 0.03539823, 0.38967136, 0.        ,
        0.88793103, 0.        , 0.        , 0.        , 0.39852399,
        0.        , 0.        , 0.        , 0.17391304, 0.54054054,
        0.23529412, 0.3902439 , 0.32      , 0.        , 0.03846154,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.       

#### Confusion Matrix

Modify columns so that categories with large numbers of classes are removed.

In [60]:
def plot_confusion_matrix(confusion, class_names, column):
    plt.figure(figsize=(7, 5))

    sns.set(font_scale=1.2)
    sns.heatmap(confusion, annot=True, fmt='.2%', cmap='Blues', cbar=False, xticklabels=class_names, yticklabels=class_names)
    
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix (%): {column}')
    
    plt.show()

In [None]:
for col in col_list:
    original = f"orig_label_{col}"
    encoded = f"enc_label_{col}"
    predicted = f"label_{col}_predicted_id"

    confusion = confusion_matrix(df_eval[encoded], df_eval[predicted])

    zero_row_sums = np.where(confusion.sum(axis=1) == 0)
    confusion[zero_row_sums] = 1

    conf_matrix_perc = confusion.astype('float') / confusion.sum(axis=1)[:, np.newaxis]

    class_names = list(df_test[original].unique())
    plot_confusion_matrix(conf_matrix_perc, class_names, col)

## Save Model to Hub

In [34]:
version = "test"

#### S3

In [None]:
model_path = f"models/bert_finetune_{version}" # Update to local path of model file

s3_object_key = f"/bert_finetune_{version}"
s3.upload_file(model_path, bucket_name, s3_object_key)

print(f"Model '{model_name}' saved to S3 bucket '{bucket_name}'")

## Inference

In [None]:
results = model.predict({"text":"I love playing to jazz music at 4/4 time signature. Can you give me a piece of music \
                        that is 1-127 velocity with bass? The pitch should be mid with riff. I also want 125 bpm in 8 measures \
                        in a minor key and chords of ['C','A','B']"})