In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm

from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import os
import random

  from .autonotebook import tqdm as notebook_tqdm


# Data Preparation

In [2]:
data = 'data/new/cleaned.csv'
df = pd.read_csv(data)
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head(10)

Unnamed: 0,index,judul_lagu,bar_number,bar,pitch_pattern,birama,panjang_note,ritme,kunci
0,1,The Enchanted Valley,1,G3-A (Bcd=e),flat,2/4,1/16,Very slow,Gm
1,1,The Enchanted Valley,2,f4 (g2dB),flat,2/4,1/16,Very slow,Gm
2,1,The Enchanted Valley,3,({d}c3-B) G2-E2,flat,2/4,1/16,Very slow,Gm
3,1,The Enchanted Valley,4,F4 (D2=E^F),flat,2/4,1/16,Very slow,Gm
4,1,The Enchanted Valley,5,G3-A (Bcd=e),flat,2/4,1/16,Very slow,Gm
5,1,The Enchanted Valley,6,f4 d2-f2,flat,2/4,1/16,Very slow,Gm
6,1,The Enchanted Valley,7,(g2a2 b2).g2,flat,2/4,1/16,Very slow,Gm
7,1,The Enchanted Valley,8,{b}(a2g2 f2).d2,down,2/4,1/16,Very slow,Gm
8,1,The Enchanted Valley,9,(d2{ed}c2) B2B2,flat,2/4,1/16,Very slow,Gm
9,1,The Enchanted Valley,10,(A2G2 {AG}F2).D2,flat,2/4,1/16,Very slow,Gm


### Encoding Fitur

# Transformers By GPT-2

In [None]:
torch.cuda.is_available()

In [3]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import os
from torch.cuda.amp import autocast, GradScaler
import random

# Fungsi untuk memproses kolom 'bar'
def process_abc(abc_notation, tokenizer):
    tokens = tokenize_abc(abc_notation)
    processed = [tokenizer.get(token, tokenizer['[UNK]']) for token in tokens]
    original_length = len(processed)
    return processed, original_length

# Fungsi untuk membuat tokenizer berdasarkan kolom 'bar'
def create_tokenizer(df):
    tokenizer = {'[PAD]': 0, '[UNK]': 1}  # Menambahkan token PAD dan UNK
    token_id = 2
    for notation in df['bar']:
        if pd.notnull(notation):
            tokens = tokenize_abc(notation)
            for token in tokens:
                if token not in tokenizer:
                    tokenizer[token] = token_id
                    token_id += 1
    return tokenizer

# Fungsi untuk mengubah notasi ABC menjadi token
def tokenize_abc(abc_notation):
    return list(abc_notation)

# Label encoder untuk kolom lainnya
encoder = LabelEncoder()
pickedCol = ['bar', 'pitch_pattern', 'birama', 'panjang_note', 'ritme', 'kunci']

# Membuat tokenizer untuk kolom 'bar'
tokenizer = create_tokenizer(df)
encodedDf = pd.DataFrame({})

# Membuat peta balik dari token_id ke notasi ABC
reverse_tokenizer = {v: k for k, v in tokenizer.items()}

# Memproses kolom 'bar' dengan tokenizer
encodedDf['bar_encoded'] = df['bar'].apply(lambda x: process_abc(x, tokenizer)[0] if pd.notnull(x) else [0])
encodedDf['bar_length'] = df['bar'].apply(lambda x: process_abc(x, tokenizer)[1] if pd.notnull(x) else 0)

# Encode kolom lainnya
for col in pickedCol:
    if col != 'bar':
        encodedDf[col] = encoder.fit_transform(df[col])

# Simpan DataFrame yang sudah diproses
final_df = encodedDf.copy()

# Konversi list of integers menjadi string agar kompatibel dengan GPT-2
final_df['bar_encoded_str'] = final_df['bar_encoded'].apply(lambda x: ' '.join(map(str, x)))

In [4]:
# Membuat direktori untuk menyimpan checkpoint jika belum ada
checkpoint_dir = './gpt2_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Tokenizer dan Model GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'unk_token': '[UNK]'})

# Menginisialisasi ulang model agar mengenali token tambahan
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Fungsi untuk menambahkan konteks ke teks input
def add_context(row):
    context = f"Pitch pattern: {row['pitch_pattern']}, Birama: {row['birama']}, Kunci: {row['kunci']}."
    return f"{context} Sequence: {row['bar_encoded_str']}"

# Menerapkan fungsi untuk membuat kolom teks input lengkap
final_df['input_text'] = final_df.apply(add_context, axis=1)

final_df.head()



Unnamed: 0,bar_encoded,bar_length,pitch_pattern,birama,panjang_note,ritme,kunci,bar_encoded_str,input_text
0,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",12,1,2,0,31,33,2 3 4 5 6 7 8 9 10 11 12 13,"Pitch pattern: 1, Birama: 2, Kunci: 33. Sequen..."
1,"[14, 15, 6, 7, 16, 17, 10, 8, 13]",9,1,2,0,31,33,14 15 6 7 16 17 10 8 13,"Pitch pattern: 1, Birama: 2, Kunci: 33. Sequen..."
2,"[7, 18, 10, 19, 9, 3, 4, 8, 13, 6, 2, 17, 4, 2...",15,1,2,0,31,33,7 18 10 19 9 3 4 8 13 6 2 17 4 20 17,"Pitch pattern: 1, Birama: 2, Kunci: 33. Sequen..."
3,"[21, 15, 6, 7, 22, 17, 11, 20, 23, 21, 13]",11,1,2,0,31,33,21 15 6 7 22 17 11 20 23 21 13,"Pitch pattern: 1, Birama: 2, Kunci: 33. Sequen..."
4,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]",12,1,2,0,31,33,2 3 4 5 6 7 8 9 10 11 12 13,"Pitch pattern: 1, Birama: 2, Kunci: 33. Sequen..."


In [5]:
final_df.input_text.iloc[0]

'Pitch pattern: 1, Birama: 2, Kunci: 33. Sequence: 2 3 4 5 6 7 8 9 10 11 12 13'

In [19]:
final_df.birama.unique()

array([2, 6, 8, 4, 9, 5, 7, 0, 1, 3])

In [20]:
final_df.kunci.unique()

array([33, 13, 29,  4, 12, 11,  0, 25,  5, 19,  3, 26, 18,  7,  8, 34, 30,
        1,  2, 31, 21, 23, 15, 16,  9, 27, 17, 24, 10, 32, 14, 28, 35, 20,
       36, 22,  6])

In [21]:
final_df.pitch_pattern.unique()

array([1, 0, 2])

In [None]:

# Tokenize the input sequences individually (tanpa padding)
input_ids = [tokenizer(text, return_tensors='pt')['input_ids'].squeeze(0) for text in final_df['input_text'].tolist()]
attention_masks = [torch.ones_like(ids) for ids in input_ids]  # Semua tokens dianggap penting, jadi masking 1

# Create a simple custom dataset without padding
class MusicDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx], self.input_ids[idx]

dataset = MusicDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)  # batch_size=1 untuk menghindari padding
print(f"Total batches in dataloader: {len(dataloader)}")

# Training configuration
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1
total_steps = len(dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Cek apakah CUDA tersedia
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


scaler = GradScaler()
# Training loop
model.train()


for epoch in range(epochs):
    epoch_loss = 0.0
    epoch_iterator = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False, dynamic_ncols=True)

    for step, batch in enumerate(dataloader):
        input_ids_batch, attention_mask_batch, labels_batch = [x.to(device) for x in batch]
        
        optimizer.zero_grad()

        with autocast():
            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        epoch_loss += loss.item()

        # Update tqdm manually
        epoch_iterator.update(1)
        epoch_iterator.set_postfix(loss=loss.item())

    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader):.4f}')

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch+1}.pth'))

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

In [11]:
from transformers import LogitsProcessor
from transformers import GPT2Tokenizer, GPT2LMHeadModel

class CustomLogitsProcessor(LogitsProcessor):
    def __init__(self, valid_tokens):
        self.valid_tokens = valid_tokens
    
    def __call__(self, input_ids, scores):
        mask = torch.ones(scores.shape[-1], dtype=torch.bool, device=scores.device)
        mask[self.valid_tokens] = False
        scores[..., mask] = -float("inf")
        return scores

# List of valid token IDs that correspond to your ABC notation tokens
# Pastikan hanya angka valid yang disertakan
# List of valid token IDs that correspond to your ABC notation tokens
valid_token_ids = []
for token, token_id in tokenizer.get_vocab().items():
    try:
        if token.isdigit() and 0 <= int(token) <= 100:
            valid_token_ids.append(token_id)
    except ValueError:
        # Skip tokens that can't be converted to integer
        continue


# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# Pastikan model dan tokenizer menggunakan perangkat yang tepat (CPU/GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# List of valid token IDs that correspond to your ABC notation tokens
valid_token_ids = [token_id for token, token_id in tokenizer.get_vocab().items() if token.isdigit()]

# Create the logits processor
logits_processor = CustomLogitsProcessor(valid_token_ids)


In [24]:
# Fungsi untuk memfilter data berdasarkan pitch_pattern, birama, dan kunci
def filter_data(df, pitch_pattern, birama, kunci):
    return df[(df['pitch_pattern'] == pitch_pattern) & 
              (df['birama'] == birama) & 
              (df['kunci'] == kunci)]

# Fungsi untuk menggabungkan bar dari data yang difilter
def combine_bars(filtered_df):
    all_bars = filtered_df['bar_encoded'].tolist()
    random.shuffle(all_bars)
    
    combined_bar = []
    for bar in all_bars:
        combined_bar.extend(bar)
    
    return combined_bar

# Fungsi untuk menambahkan padding jika diperlukan
def pad_sequence(sequence, max_length, pad_value=tokenizer.pad_token_id):
    if len(sequence) < max_length:
        sequence.extend([pad_value] * (max_length - len(sequence)))
    return sequence


# Fungsi untuk menghasilkan musik dengan menggunakan logits processor
def generate_music_with_processor(model, tokenizer, pitch_pattern, birama, kunci, filtered_df, logits_processor, max_length=1024):
    context = f"Pitch pattern: {pitch_pattern}, Birama: {birama}, Kunci: {kunci}. Sequence: "
    combined_bar = combine_bars(filtered_df)
    
    min_sequence_length = 10  # Contoh nilai minimum

    if len(combined_bar) < min_sequence_length:
        raise ValueError(f"Sequence terlalu pendek setelah filterisasi, panjang minimal adalah {min_sequence_length} tokens.")

    combined_bar_str = ' '.join(map(str, combined_bar))
    
    # Pastikan panjang sequence tidak melebihi max_length
    combined_bar_tokens = combined_bar_str.split()
    if len(combined_bar_tokens) > max_length:
        combined_bar_tokens = combined_bar_tokens[:max_length]
    
    input_text = context + ' '.join(combined_bar_tokens)
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=1024)

    inputs = {key: value.to(device) for key, value in inputs.items()}
    print(f"Input IDs size: {inputs['input_ids'].size(1)}")

    # Menghitung max_new_tokens dengan memastikan tidak melebihi 1024
    remaining_tokens = 1024 - inputs['input_ids'].size(1)
    max_new_tokens = min(remaining_tokens, 50)
    # Pastikan max_new_tokens lebih dari 0
    if max_new_tokens > 0:
        outputs = model.generate(
            inputs['input_ids'], 
            max_new_tokens=max_new_tokens,  # Ini memastikan max_new_tokens valid
            num_return_sequences=1, 
            pad_token_id=tokenizer.pad_token_id, 
            bad_words_ids=[[tokenizer.pad_token_id]],
            repetition_penalty=2.5,
            no_repeat_ngram_size=3,
            do_sample=False,  # Nonaktifkan sampling
            logits_processor=[logits_processor] if logits_processor else None
        )

        generated_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_sequence
    else:
        # Mengembalikan sequence input jika tidak ada ruang untuk token baru
        print("Sequence input terlalu panjang, tidak ada token baru yang dihasilkan.")
        return combined_bar_str  # Mengembalikan sequence asli


# Fungsi untuk membersihkan hasil yang dihasilkan
def clean_generated_sequence(sequence):
    # Filter out non-numeric tokens or invalid sequences
    filtered_sequence = [token for token in sequence.split() if token.isdigit() and 0 <= int(token) <= 100]
    return ' '.join(filtered_sequence)

# Fungsi untuk mengembalikan angka-angka menjadi notasi ABC
def decode_to_abc(sequence, reverse_tokenizer):
    tokens = sequence.split()  # Memisahkan urutan angka ke dalam daftar
    abc_notation = []
    
    for token in tokens:
        decoded_token = reverse_tokenizer[int(token)]
        if abc_notation and (decoded_token.isdigit() or decoded_token in ['/', '^', '=', '-', '<', '>']):
            # Jika token adalah angka atau simbol yang berkaitan dengan not sebelumnya, gabungkan tanpa spasi
            abc_notation[-1] += decoded_token
        else:
            # Jika token adalah not atau karakter baru, tambahkan sebagai elemen baru
            abc_notation.append(decoded_token)

    return ''.join(abc_notation)  # Gabungkan semua tanpa spasi tambahan


# Inputan dari pengguna
pitch_pattern_input = 1
birama_input = 2
kunci_input = 33

# Filter data sesuai dengan input
filtered_df = filter_data(final_df, pitch_pattern_input, birama_input, kunci_input)

# Setelah mendapatkan hasil dari model dan membersihkannya
if not filtered_df.empty:
    generated_music = generate_music_with_processor(model, tokenizer, pitch_pattern_input, birama_input, kunci_input, filtered_df, logits_processor)
    
    # Membersihkan hasil yang dihasilkan
    cleaned_music = clean_generated_sequence(generated_music)
    
    # Mengonversi urutan angka ke notasi ABC
    decoded_music = decode_to_abc(cleaned_music, reverse_tokenizer)  # Gunakan reverse_tokenizer di sini
    
    print("Decoded Music Sequence (ABC Notation):")
    print(decoded_music)
else:
    print("Tidak ditemukan data yang sesuai dengan input yang diberikan.")

Input IDs size: 1024
Sequence input terlalu panjang, tidak ada token baru yang dihasilkan.
Decoded Music Sequence (ABC Notation):
G>-A B-<cA^GAB c2(dc)(G>A) (BG/E/)G2G2 G2zTd>A (c/A/F/A/)b2(.B2.B2)b2"D"d/4^c/4d/4e/4 d/4=c/4B/4A/4]\
"Gm"G/4^F/4G/4A/4 "D"B/4A/4B/4c/4B>-G A>-B(~F>G) (AB/c/)](g^fga) g2d2(bgfd) cA.F2(.B>.B) (A{BA}G)BABc d2(d=e)B>B (A{BA}G)F>G A(B/A/)f4 f2z2"Gm"c/4B/4A/4B/4 GB2(AG) (BAG^F)\
"Gm"G/4^F/4G/4A/4 B/2G/2B2(AB) c2(Bc)f4 d2-f2G2G2 G3Gd>-=e f-<dF2D2 F2GAd>-=e f-<d"Gm"G/2G/2 G\G3"D"D/2^F/2 A/2c/2c2d=e f2c2f4 (g2dB)(dcBA) (cBAG)BABc d2(cB)(~F>G) (AB/c/)G4 G2G2G2 G2z2d2g2 gfed(cB)(BA) (GB)(AG)B2d2 dcBAc>-A (F3/2G/4A/4)AFAB c2(dc)"D"^F/2A/2 A/4G/4F/2% Nottingham Music Database
"Gm"d/4^c/4d/4e/4 "D"d/4=c/4B/4A/4"Gm"G/2B/2 B/4A/4G/2(d2{ed}c2) B2B2(g2a2 b2).g2BABc dedcB2AB cBAG(bagf) (gd)d>cD(f=efg) f2(dc)(GABc) (d2{ed}c>A)">"f4 e2(fe)(d2g2) g2d2F4 (D2=E^F)G3 GG>-A B-<cd>-B G>>D({d}c3-B) G2-E2cBAG BAGF(d2g2) g2d2(A2G2 {AG}F2).D2A-<d (3(cAG)G2GA B2AGB2d2 (gfdc)c>-A (F3/2G/4A