# LLM Single Task Fine-Tuning

## Setup

In [1]:
import os
from io import StringIO
import boto3
import pandas as pd
import numpy as np

import torch

from datasets import DatasetDict, Dataset
from huggingface_hub import notebook_login
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer, 
    TrainingArguments,
    Trainer
)

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    recall_score, 
    precision_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set random seed
random_seed = 42
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7fd03389f190>

In [3]:
# CUDA 
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    devices = torch.cuda.device_count()
    gpu = torch.cuda.get_device_name()
    device = torch.device("cuda")
    print(f"Devices: {devices}")
else:
    print("No GPUs available.")
    device = torch.device("cpu")

Devices: 1


## Dataset Preparation

### S3 Data Load

In [4]:
access_key_id = os.environ.get("S3_KEY_ID")
secret_access_key = os.environ.get("S3_ACCESS_KEY")


s3 = boto3.client("s3",
    region_name="us-west-2", 
    aws_access_key_id=access_key_id, 
    aws_secret_access_key=secret_access_key,
    )

In [5]:
bucket_name = "orchestrate-bucket"
objects = s3.list_objects_v2(Bucket=bucket_name)
files = objects.get("Contents")

In [479]:
file = "master_midi_meta_final.csv" # UPDATE FILE NAME

object = s3.get_object(Bucket=bucket_name, Key=file)
data = object["Body"].read().decode("utf-8")

df_orig = pd.read_csv(StringIO(data))

print(f"DataFrame size: {len(df_orig)}")
df_orig.head()

DataFrame size: 15340


Unnamed: 0,audio_key,chord_progressions,pitch_range,num_measures,bpm,genre,track_role,inst,sample_rhythm,time_signature,min_velocity,max_velocity,split_data,id,track_roll,unique_chord_n_note,text,inst_group
0,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid low,8,80,new age,accompaniment,acoustic piano,standard,4/4,23,30,train,commu00002,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose a tranquil and soothing New Age piano ...,0
1,a minor,"[['Am', 'Am', 'Am', 'Am', 'Em', 'Em', 'Em', 'E...",mid low,4,60,cinematic,pad,acoustic piano,standard,4/4,21,22,train,commu00005,,"['Em', 'F', 'Am']","[""Let's create some cinematic magic! Set your ...",0
2,a minor,"[['Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'Am', 'A...",mid low,8,120,cinematic,pad,acoustic piano,standard,4/4,95,96,train,commu00016,,"['C', 'F', 'G', 'Am']","[""Create a 8-measure piece in the cinematic ge...",0
3,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm'...",mid high,8,80,new age,main melody,acoustic piano,standard,4/4,23,30,train,commu00024,,"['Dm', 'G7', 'G', 'F', 'C', 'Am']",Compose an experimental new age piece in C maj...,0
4,c major,"[['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'Dm7...",low,8,50,new age,bass,acoustic piano,standard,4/4,71,72,train,commu00025,,"['Dm7', 'A#', 'C']","[""Let's groove in the C major key with a low p...",0


In [480]:
# Shuffle dataset
df = shuffle(df_orig)
df.reset_index(drop=True, inplace=True)

### Data Preprocessing

In [481]:
def remove_char(text):
    if text.startswith('["') or text.startswith("['"):
        text = text[2:]
    if text.endswith('"]') or text.endswith("']"):
        text = text[:-2]
    return text

In [482]:
df['audio_key'] = df['audio_key'].str.replace(' ', '')
df['pitch_range'] = df['pitch_range'].str.replace(' ', '_')
df['genre'] = df['genre'].str.replace(' ', '_')
df['inst'] = df['inst'].str.replace(' ', '_')
df['track_role'] = df['track_role'].str.replace(' ', '_')
df['text'] = df['text'].apply(remove_char)
df = df.rename(columns={'unique_chord_n_note': 'chord'})

In [484]:
# Update columns to input and single output column
col = "chord"

incl_col = [
    col,
    "text"
    ]

df = df[incl_col].rename(columns={col: "targets"})

In [485]:
encoder = LabelEncoder()
df["labels"] = encoder.fit_transform(df["targets"])

In [486]:
# Encoding key for lookups
encoding_key = dict(zip(df["labels"], df["targets"]))

In [487]:
df = df.drop(columns=["targets"])

In [488]:
def dataset_process(dataset, split):
    process_dataset = Dataset.from_pandas(dataset)
    process_dataset = DatasetDict({split: process_dataset})
    
    return process_dataset

In [489]:
dataset = dataset_process(df, "train")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 15340
    })
})

### Tokenization

In [490]:
model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
max_seq_len = 128

In [491]:
def tokenize(input):
    tokenized_data = tokenizer(
        input["text"],
        max_length=max_seq_len, 
        padding="max_length", 
        truncation=True)

    return tokenized_data

In [492]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 15340
    })
})

In [493]:
data_tokenized = dataset.map(tokenize, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 15340/15340 [00:01<00:00, 13345.31 examples/s]


In [494]:
total_size = len(df_orig) # Length of original dataset
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)

In [495]:
data_train = data_tokenized["train"].select(i for i in range(train_size))
data_val = data_tokenized["train"].select(i for i in range(train_size, train_size + val_size))
data_test = data_tokenized["train"].select(i for i in range(train_size + val_size, total_size))

split_data = DatasetDict({
    "train": data_train,
    "validation": data_val,
    "test": data_test
})

In [496]:
split_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10738
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2301
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2301
    })
})

## Modeling & Experimentation

Training framework based on this paper: https://arxiv.org/pdf/1905.05583.pdf

### Model Configuration

In [497]:
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(df["labels"].unique()),
    output_attentions = False,
    output_hidden_states = False,
    )

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training & Evaluation

In [501]:
output_dir = "./models"

ideal_steps = 25000

batch_size = 32
epochs = round(ideal_steps / (train_size / batch_size))
lr = 1e-4
weight_decay = 1e-3
eval_steps = 2500
save_steps = 5000
warmup_steps = ideal_steps * 0.05
save_limit = 5

In [502]:
training_arg = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    warmup_steps=warmup_steps,
    save_steps=save_steps,
    save_total_limit=save_limit
)

In [503]:
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [504]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arg,
    train_dataset=split_data["train"],
    eval_dataset=split_data["validation"],
    compute_metrics=compute_metrics
)

In [505]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
2500,4.0664,4.643269,0.391134,0.32383,0.391134,0.339762
5000,1.5577,3.577746,0.64711,0.616442,0.64711,0.625065
7500,0.6178,3.431146,0.729248,0.715448,0.729248,0.718587
10000,0.2383,3.481973,0.751412,0.740055,0.751412,0.742126
12500,0.0821,3.582961,0.756628,0.747611,0.756628,0.749119
15000,0.032,3.686373,0.756628,0.748515,0.756628,0.749714
17500,0.0143,3.792565,0.755758,0.747074,0.755758,0.748411
20000,0.0075,3.854899,0.757062,0.748539,0.757062,0.749927
22500,0.005,3.901595,0.757497,0.748369,0.757497,0.749982
25000,0.0038,3.919466,0.757497,0.747633,0.757497,0.749639


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=25200, training_loss=0.981145543531766, metrics={'train_runtime': 4835.7984, 'train_samples_per_second': 166.539, 'train_steps_per_second': 5.211, 'total_flos': 5.51896218671616e+16, 'train_loss': 0.981145543531766, 'epoch': 75.0})

In [506]:
# Save the model locally
model.save_pretrained(f"single_models/{col}")
tokenizer.save_pretrained(f"single_models/{col}")

('single_models/chord/tokenizer_config.json',
 'single_models/chord/special_tokens_map.json',
 'single_models/chord/vocab.txt',
 'single_models/chord/added_tokens.json',
 'single_models/chord/tokenizer.json')

### Evaluation

In [535]:
model_load = './models/checkpoint-25000'

model = BertForSequenceClassification.from_pretrained(
    model_load,
    num_labels=len(df["labels"].unique()),
    output_attentions = False,
    output_hidden_states = False,
    )

tokenizer = AutoTokenizer.from_pretrained(model_load)

In [536]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
)

In [537]:
results = trainer.predict(split_data["test"])

In [538]:
predicted_labels = results.predictions.argmax(axis=1)
true_labels = results.label_ids

In [539]:
compare_dict = {}

accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average="macro")
precision = precision_score(true_labels, predicted_labels, average="macro")
recall = recall_score(true_labels, predicted_labels, average="macro")

compare_dict[col] = {
    "Accuracy": accuracy,
    "F1": f1,
    "Precision": precision,
    "Recall": recall,
}

print(f"{col} Classification Report")
print(classification_report(true_labels, predicted_labels))

df_final = pd.DataFrame(compare_dict)

chord Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         1
          10       1.00      1.00      1.00         1
          11       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         1
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         1
          26       1.00      1.00      1.00         1
          27       0.00      0.00      0.00         0
          28       0.00      0.00      0.00         1
          29       1.00      1.00      1.00         1
          30       0.00      0.00      0.00         0
          31       1.00      1.00      1.00         1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [540]:
df_final

Unnamed: 0,chord
Accuracy,0.76445
F1,0.456694
Precision,0.452799
Recall,0.466477


#### Confusion Matrix

Modify columns so that categories with large numbers of classes are removed.

In [37]:
def plot_confusion_matrix(confusion, class_names, column):
    plt.figure(figsize=(7, 5))

    sns.set(font_scale=1.2)
    sns.heatmap(confusion, annot=True, fmt='.2%', cmap='Blues', cbar=False, xticklabels=class_names, yticklabels=class_names)
    
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix (%): {column}')
    
    plt.show()

In [45]:
classes = []

for label in true_labels:
    orig_label = encoding_key[label]
    classes.append(orig_label)

In [None]:
confusion = confusion_matrix(true_labels, predicted_labels)

zero_row_sums = np.where(confusion.sum(axis=1) == 0)
confusion[zero_row_sums] = 1

conf_matrix_perc = confusion.astype('float') / confusion.sum(axis=1)[:, np.newaxis]

plot_confusion_matrix(conf_matrix_perc, classes, col)

## Save Model to Hub

In [34]:
version = "test"

#### S3

In [None]:
model_path = f"models/bert_finetune_{version}" # Update to local path of model file

s3_object_key = f"/bert_finetune_{version}"
s3.upload_file(model_path, bucket_name, s3_object_key)

print(f"Model '{model_name}' saved to S3 bucket '{bucket_name}'")

#### HuggingFace

In [None]:
notebook_login()

In [None]:
model.push_to_hub(f"hsiungc/bert_finetune_{version}", use_auth_token=True)

## Inference

In [None]:
results = model.predict({"text":"I love playing to jazz music at 4/4 time signature. Can you give me a piece of music \
                        that is 1-127 velocity with bass? The pitch should be mid with riff. I also want 125 bpm in 8 measures \
                        in a minor key and chords of ['C','A','B']"})