In [2]:
import pandas as pd
from collections import Counter
from fractions import Fraction
import re
import random

In [3]:
# Define a function to parse the given data
def parse_abc_notation(data):
    entries = data.strip().split("\n\n")  # Split entries by double newlines
    songs = []
    
    for entry in entries:
        lines = entry.strip().split("\n")
        song = {}
        abc_notation = []
        
        for line in lines:
            if line.startswith("T:"):
                song["judul_lagu (T)"] = song.get("judul_lagu (T)", "") + " " + line[2:].strip()
            elif line.startswith("X:"):
                song["indeks_lagu (X)"] = song.get("indeks_lagu (X)", "") + " " + line[2:].strip()
            elif line.startswith("N:"):
                song["ritme (N)"] = song.get("ritme (N)", "") + " " + line[2:].strip()
            elif line.startswith("M:"):
                song['birama (M)'] = song.get("birama (M)", "") + " " + line[2:].strip()
            elif line.startswith("L:"):
                song['panjang_note (L)'] = song.get("panjang_note (L)", "") + " " + line[2:].strip()
            elif line.startswith("B:"):
                song['sumber (B)'] = song.get("sumber (B)", "") + " " + line[2:].strip()
            elif line.startswith("K:"):
                song['kunci (K)'] = song.get("kunci (K)", "") + " " + line[2:].strip()
            elif ":" not in line:
                abc_notation.append(line)
        
        song["abc_notation"] = "\n".join(abc_notation)
        songs.append(song)
    
    return pd.DataFrame(songs)

In [4]:
dataPath = 'data/new/dataabc.txt'

with open(dataPath, 'r') as file:
    data = file.read()

songDf = parse_abc_notation(data)

In [5]:
# Define a function to extract the tempo descriptors
def extract_tempo(ritme):
    if isinstance(ritme, str):
        match = re.search(r'"(.*?)"', ritme)
        return match.group(1) if match else ritme
    return ritme

# Apply the function to the 'ritme (N)' column
songDf['ritme (N)'] = songDf['ritme (N)'].apply(extract_tempo)

# Data Cleaning

## Hitung Nilai Null

## Isi Nilai Null

### Kolom Indeks Lagu

In [6]:
for i, row in songDf.iterrows():
    if pd.isna(row['indeks_lagu (X)']) or row['indeks_lagu (X)'] == '':
        songDf.at[i, 'indeks_lagu (X)'] = f"{i + 1}"

### Kolom Judul Lagu

In [7]:
for i, row in songDf.iterrows():
    if pd.isna(row['judul_lagu (T)']) or row['judul_lagu (T)'] == '':
        songDf.at[i, 'judul_lagu (T)'] = f"Untitled {i + 1}"

### Kolom Birama

In [8]:
for i, row in songDf.iterrows():
    if pd.isna(row['birama (M)']) or row['birama (M)'] == '':
        songDf.at[i, 'birama (M)'] = "4/4"

### Kolom Panjang Note

In [9]:
def calculate_note_length(abc_notation):
    note_lengths = []
    notes = re.findall(r'[a-gA-G][,\']*\d*', abc_notation)
    
    for note in notes:
        length_match = re.search(r'\d+', note)
        if length_match:
            length = int(length_match.group())
        else:
            length = 1  # Default length if not specified
        note_lengths.append(length)
    
    if note_lengths:
        average_length = sum(note_lengths) / len(note_lengths)
        return f'1/{average_length}'
    else:
        return '1/8'  # Default note length if no notes are found


songDf['panjang_note (L)'] = songDf.apply(
    lambda row: row['panjang_note (L)'] if pd.notna(row['panjang_note (L)']) else calculate_note_length(row['abc_notation']),
    axis=1
)

### Kolom Sumber

In [10]:
for i, row in songDf.iterrows():
    if pd.isna(row['sumber (B)']) or row['sumber (B)'] == '':
        songDf.at[i, 'sumber (B)'] = "Unknown"

### Kolom Ritme

In [11]:
for i, row in songDf.iterrows():
    if pd.isna(row['ritme (N)']) or row['ritme (N)'] == '':
        songDf.at[i, 'ritme (N)'] = "Unknown"

### Kolom Kunci

In [12]:
for i, row in songDf.iterrows():
    if pd.isna(row['kunci (K)']) or row['kunci (K)'] == '':
        songDf.at[i, 'kunci (K)'] = "C"

## Fungsi Buat Pack Notation

In [13]:
def get_notation(df, index):
    row = df.iloc[index]
    indeksLagu = 'X:' + row['indeks_lagu (X)']
    judulLagu = 'T:' + row['judul_lagu (T)']
    birama = 'M: ' + row['birama (M)']
    panjangNote = 'L: ' + row['panjang_note (L)']
    sumber = 'B:' + row['sumber (B)']
    ritme = 'N: ' + row['ritme (N)']
    kunci = 'K:' + row['kunci (K)']
    notasiAbc = row['abc_notation']
    
    notasi =  (indeksLagu + '\n' + judulLagu + '\n' + birama + '\n' 
                + panjangNote + '\n' + sumber + '\n' + ritme + '\n'
                + kunci + '\n' + notasiAbc)
    
    return notasi

In [14]:
notasi = get_notation(songDf, 0)
print(notasi)

X: 1
T: The Enchanted Valley
M:  2/4
L:  1/16
B: "O'Neill's 1"
N: Very slow
K: Gm
G3-A (Bcd=e) | f4 (g2dB) | ({d}c3-B) G2-E2 | F4 (D2=E^F) |
G3-A (Bcd=e) | f4 d2-f2 | (g2a2 b2).g2 | {b}(a2g2 f2).d2 |
(d2{ed}c2) B2B2 | (A2G2 {AG}F2).D2 | (GABc) (d2{ed}c>A) | G2G2 G2z ||
G | B2c2 (dcAB) | G2G2 G3G | B2d2 (gfdc) | d2g2 (g3ga) |
(bagf) (gd)d>c | (B2AG) F-D.D2 | (GABc) d2d2 | (bgfd) cA.F2 |
G2A2 (B2{cB}AG) | A3-G F2-D2 | (GABc) (d2{ed}c>A) | G2G2 G2z2 ||


In [15]:
songDf.head()

Unnamed: 0,indeks_lagu (X),judul_lagu (T),birama (M),panjang_note (L),sumber (B),ritme (N),kunci (K),abc_notation
0,1,The Enchanted Valley,2/4,1/16,"""O'Neill's 1""",Very slow,Gm,G3-A (Bcd=e) | f4 (g2dB) | ({d}c3-B) G2-E2 | F...
1,2,Fare You Well,2/4,1/16,"""O'Neill's 2""",Slow,D,f-g | a3-b g3-a | f4 e3-d | d3-c A3-B | c4 d3-...
2,3,The Little Heathy Hill,C,1/8,"""O'Neill's 3""",Moderate,Gm,B/2-c/2 | d2 d>-c B2 A-B | (GBAG) F2 D-F | (G>...
3,4,The Little Girl of my Heart,4/4,1/8,"""O'Neill's 4""",Slow,D,F-G | A-dd>-c d2 cd | e-fgg (f2 d>e) | f-d (c/...
4,5,The Fun at Donnybrook,6/8,1/8,"""O'Neill's 5""",Cheerfully,Gm,B/2-A/2 | G>FD C>D^F | G3z2B/2-c/2 | B>cd cAG ...


# EDA

In [16]:
# Cek banyak data
songDf.shape[0]

3865

## Agregasi Kolom dan Perbaikan

### Birama (M)

In [17]:
# birama
biramaM =songDf.groupby('birama (M)').count()
biramaM

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),panjang_note (L),sumber (B),ritme (N),kunci (K),abc_notation
birama (M),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12/8,2,2,2,2,2,2,2
2/2,7,7,7,7,7,7,7
2/4,278,278,278,278,278,278,278
2/4 6/8,1,1,1,1,1,1,1
3/2,3,3,3,3,3,3,3
3/4,310,310,310,310,310,310,310
3/4 3/4 3/8 3/4 3/8 3/4,1,1,1,1,1,1,1
3/8,9,9,9,9,9,9,9
4/4,561,561,561,561,561,561,561
4/4 2/4 4/4,2,2,2,2,2,2,2


In [18]:
def clean_time_signature(birama_series):
    # Calculate the overall frequency of time signatures in the whole DataFrame
    overall_counts = Counter()
    
    for entry in birama_series:
        if pd.notnull(entry):
            time_signatures = entry.split()
            for ts in time_signatures:
                if ts.replace("/", "").isdigit():  # Ensure it's a valid time signature
                    overall_counts[ts] += 1

    # Find the most common time signature overall
    overall_most_common = overall_counts.most_common(1)[0][0] if overall_counts else '4/4'

    def process_entry(entry):
        if pd.isnull(entry):
            return '4/4'
        
        # Split the entry into possible time signatures
        time_signatures = entry.split()
        
        # Filter out invalid time signatures (e.g., containing non-numeric or non-slash characters)
        valid_time_signatures = [ts for ts in time_signatures if ts.replace("/", "").isdigit()]
        
        # Check for special cases like 'C' and other abstract entries
        if 'C' in time_signatures or not valid_time_signatures:
            return '4/4'

        # Count the frequency of each time signature in this specific entry
        local_counts = Counter(valid_time_signatures)

        # Find the most common time signature in this entry
        most_common_local = local_counts.most_common()

        # Check if there's a clear most frequent time signature
        if len(most_common_local) > 1 and most_common_local[0][1] == most_common_local[1][1]:
            # If there's a tie, choose the most common overall
            for ts, count in most_common_local:
                if overall_counts[ts] == max(overall_counts.values()):
                    return ts
            return overall_most_common
        else:
            return most_common_local[0][0]

    # Apply the function to each entry in the series
    return birama_series.apply(process_entry)

songDf['birama (M)'] = clean_time_signature(songDf['birama (M)'])

In [19]:
# birama
biramaM =songDf.groupby('birama (M)').count()
biramaM

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),panjang_note (L),sumber (B),ritme (N),kunci (K),abc_notation
birama (M),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12/8,2,2,2,2,2,2,2
2/2,7,7,7,7,7,7,7
2/4,278,278,278,278,278,278,278
3/2,3,3,3,3,3,3,3
3/4,311,311,311,311,311,311,311
3/8,9,9,9,9,9,9,9
4/4,1861,1861,1861,1861,1861,1861,1861
6/4,3,3,3,3,3,3,3
6/8,1274,1274,1274,1274,1274,1274,1274
9/8,117,117,117,117,117,117,117


### Panjang Note (L)

In [20]:
panjangL = songDf.groupby('panjang_note (L)').count()
panjangL

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),birama (M),sumber (B),ritme (N),kunci (K),abc_notation
panjang_note (L),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1/16,174,174,174,174,174,174,174
1/16 1/8 1/16 1/8 1/16 1/8,1,1,1,1,1,1,1
1/4,599,599,599,599,599,599,599
1/4 1/4 1/4,1,1,1,1,1,1,1
1/4 1/8,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...
1/2.2459016393442623,1,1,1,1,1,1,1
1/2.26,1,1,1,1,1,1,1
1/2.3065693430656933,1,1,1,1,1,1,1
1/2.5595238095238093,1,1,1,1,1,1,1


In [21]:
# Define a mapping of note lengths to their standard fractions
standard_notes = {
    '1/16': 1/16,
    '1/8': 1/8,
    '3/16': 3/16,
    '1/4': 1/4,
    '3/8': 3/8,
    '1/2': 1/2,
    '3/4': 3/4,
    '1': 1,  # Whole note
    '3/2': 3/2,
    '2': 2   # Double whole note
}

# Function to convert a fractional string to a float
def convert_to_float(fraction_str):
    try:
        return float(Fraction(fraction_str))
    except ValueError:
        return None

# Function to map each entry to the closest standard note length
def condense_note_lengths(note_lengths):
    # Calculate standard note float values
    standard_note_values = {k: v for k, v in standard_notes.items()}

    def map_to_standard(note):
        # Split note by spaces if it contains multiple
        components = note.split()
        
        # If it's multiple notes, return the mode or average
        if len(components) > 1:
            valid_fractions = [convert_to_float(n) for n in components if convert_to_float(n) is not None]
            if not valid_fractions:
                return '1/4'
            average_value = sum(valid_fractions) / len(valid_fractions)
            return min(standard_note_values, key=lambda k: abs(standard_note_values[k] - average_value))

        # Single note processing
        float_value = convert_to_float(note)
        if float_value is None:
            return '1/4'
        
        # Find the closest standard note
        closest_note = min(standard_note_values, key=lambda k: abs(standard_note_values[k] - float_value))
        return closest_note

    return [map_to_standard(note.strip()) for note in note_lengths]

songDf['panjang_note (L)'] = condense_note_lengths(songDf['panjang_note (L)'])

In [22]:
panjangL = songDf.groupby('panjang_note (L)').count()
panjangL

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),birama (M),sumber (B),ritme (N),kunci (K),abc_notation
panjang_note (L),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1/16,175,175,175,175,175,175,175
1/4,1111,1111,1111,1111,1111,1111,1111
1/8,2576,2576,2576,2576,2576,2576,2576
3/16,3,3,3,3,3,3,3


### Ritme (N)

In [23]:
ritmeN = songDf.groupby('ritme (N)').count()
ritmeN

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),birama (M),panjang_note (L),sumber (B),kunci (K),abc_notation
ritme (N),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,6,6,6,6,6,6,6
The 2nd part has a repeat at the end but not at the beginning.,1,1,1,1,1,1,1
"""collected by Ennis",1,1,1,1,1,1,1
"""collected by F. O'Neill",2,2,2,2,2,2,2
"""collected by J. O'Neill",4,4,4,4,4,4,4
...,...,...,...,...,...,...,...
moderate,2,2,2,2,2,2,2
setting 1,1,1,1,1,1,1,1
setting 2,1,1,1,1,1,1,1
transcribed by Fielding,1,1,1,1,1,1,1


In [24]:
valid_rhythms = ['Very slow', 'Slow', 'Moderate', 'Cheerfully', 'Tenderly', 'Playfully', 
                 'With feeling', 'With expression', 'With spirit', 'Cheerful', 'Slow with expression', 
                 'Gracefully', 'Plaintively', 'Slow and tenderly', 'Slow, with expression', 
                 'Slow and mournful', 'Mournful', 'Gaily', 'Slow and with feeling', 'Slow with feeling', 
                 'Animated', 'Slow and distinctly', 'Spirited', 'Plaintive', 'With Animation', 'Boldly', 
                 'Gheerfully', 'Lively', 'Playful', 'Brilliant', 'Rather slow', 'Distinctly', 'Graceful', 
                 'Slow and plaintive', 'Very slow and plaintive', 'With spirit and feeling', 'Quick and spirit', 'With force']

# Filter and clean the dataframe
def clean_rhythm(row):
    if row['ritme (N)'] in valid_rhythms:
        return row['ritme (N)']
    else:
        similar_rhythm = songDf[(songDf['birama (M)'] == row['birama (M)']) & (songDf['panjang_note (L)'] == row['panjang_note (L)']) & (songDf['ritme (N)'].isin(valid_rhythms))]
        if not similar_rhythm.empty:
            return similar_rhythm.iloc[0]['ritme (N)']
        else:
            return 'Unknown'

songDf['ritme (N)'] = songDf.apply(clean_rhythm, axis=1)

In [25]:
ritmeN = songDf.groupby('ritme (N)').count()
ritmeN

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),birama (M),panjang_note (L),sumber (B),kunci (K),abc_notation
ritme (N),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Animated,20,20,20,20,20,20,20
Boldly,3,3,3,3,3,3,3
Brilliant,1,1,1,1,1,1,1
Cheerful,7,7,7,7,7,7,7
Cheerfully,670,670,670,670,670,670,670
Distinctly,3,3,3,3,3,3,3
Gaily,18,18,18,18,18,18,18
Gheerfully,1,1,1,1,1,1,1
Graceful,3,3,3,3,3,3,3
Gracefully,5,5,5,5,5,5,5


### Kunci (K)

In [26]:
kunciK = songDf['kunci (K)'].unique()
kunciK

array([' Gm', ' D', ' G', ' D Dm', ' Am', ' Cm', ' C', ' A', ' Em',
       ' Amix', ' Dmix', ' Ador', ' F', ' Dm', 'C', ' Bb', ' Bm', ' Gmix',
       ' GDor', ' ADor', ' AMix', ' Aphr', ' Gdor', ' Dphr', ' Eb',
       ' DMix', ' Ddor', ' Dmix Bb', ' Dmix Dmixm', ' Bn', ' F#m',
       ' Dlyd', ' Edor', ' D G', ' Bphr', ' D A', ' Glyd', ' G D',
       ' DDor', ' Fmix', ' G D G', ' B Minor', ' D Major', ' G Major',
       ' D Mixolydian', ' A Mixolydian', ' E Dorian', ' C Major',
       ' A Dorian', ' A Major', ' E Minor', ' G Dorian',
       ' D Major D Mixolydian', ' F Major', ' E Major', ' B Dorian',
       ' D Major G Major', ' A Dorian A Major', ' G Major D Major',
       ' G Mixolydian', ' D Minor', ' D Major D Minor D Major',
       ' E Dorian A Dorian', ' G Major A Dorian G Major',
       ' E Minor G Major', ' A Dorian C Major', ' D Dorian',
       ' D Dorian D Mixolydian', ' D Mixolydian D Major',
       ' G Major D Major G Major', ' D Major A Major D Major',
       ' G Dorian G 

In [27]:
# Dictionary to map various notations to a single notation
key_map = {
    'G Major': 'G',
    'D Major': 'D',
    'A Major': 'A',
    'E Major': 'E',
    'C Major': 'C',
    'F Major': 'F',
    'B Minor': 'Bm',
    'A Minor': 'Am',
    'E Minor': 'Em',
    'D Minor': 'Dm',
    'G Mixolydian': 'Gmix',
    'D Mixolydian': 'Dmix',
    'A Mixolydian': 'Amix',
    'D Dorian': 'Ddor',
    'E Dorian': 'Edor',
    'A Dorian': 'Ador',
    'G Dorian': 'Gdor',
    'F Mixolydian': 'Fmix'
}

# Function to normalize the key names
def normalize_key(key):
    key = key.strip()
    return key_map.get(key, key)

# Function to handle multiple keys and return the most common one
def get_most_common_key(keys_list):
    normalized_keys_list = [normalize_key(key) for key in keys_list]
    counter = Counter(normalized_keys_list)
    most_common_keys = counter.most_common()
    
    if len(most_common_keys) > 1 and most_common_keys[0][1] == most_common_keys[1][1]:
        return None  # Tie situation
    return most_common_keys[0][0]

# Clean the 'kunci (K)' column
def clean_keys(row):
    keys_list = row['kunci (K)'].split()
    most_common_key = get_most_common_key(keys_list)
    
    if most_common_key:
        return most_common_key
    
    # Handle tie by considering the overall dataset
    relevant_rows = songDf[(songDf['birama (M)'] == row['birama (M)']) & 
                       (songDf['panjang_note (L)'] == row['panjang_note (L)']) & 
                       (songDf['ritme (N)'] == row['ritme (N)'])]
    
    all_keys = ' '.join(relevant_rows['kunci (K)'].tolist()).split()
    overall_counter = Counter(all_keys)
    overall_most_common = overall_counter.most_common(1)
    
    if overall_most_common:
        return normalize_key(overall_most_common[0][0])
    
    return normalize_key(keys_list[0])

songDf['kunci (K)'] = songDf.apply(clean_keys, axis=1)

In [28]:
kunciK = songDf.groupby('kunci (K)').count()
kunciK

Unnamed: 0_level_0,indeks_lagu (X),judul_lagu (T),birama (M),panjang_note (L),sumber (B),ritme (N),abc_notation
kunci (K),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,351,351,351,351,351,351,351
ADor,2,2,2,2,2,2,2
AMix,3,3,3,3,3,3,3
Ador,4,4,4,4,4,4,4
Am,123,123,123,123,123,123,123
Amix,8,8,8,8,8,8,8
Aphr,1,1,1,1,1,1,1
B,1,1,1,1,1,1,1
Bb,30,30,30,30,30,30,30
Bm,11,11,11,11,11,11,11


In [29]:
songDf['kunci (K)'].unique()

array(['Gm', 'D', 'G', 'Am', 'Cm', 'C', 'A', 'Em', 'Amix', 'Dmix', 'Ador',
       'F', 'Dm', 'Bb', 'Bm', 'Gmix', 'GDor', 'ADor', 'AMix', 'Aphr',
       'Gdor', 'Dphr', 'Eb', 'DMix', 'Ddor', 'Bn', 'F#m', 'Dlyd', 'Edor',
       'Bphr', 'Glyd', 'DDor', 'Fmix', 'Major', 'Dorian', 'Minor', 'E',
       'B'], dtype=object)

In [30]:
songDf.to_csv('data/new/dataabc.csv')

# Data Preparation

## Labeling Pitch dalam Bar Nada

In [31]:
# Fungsi untuk memisahkan setiap bar, termasuk yang ditutup dengan tanda ||
def split_bars(abc_notation):
    bars = re.split(r'\|\||\|', abc_notation)
    return bars

# Fungsi untuk mengevaluasi pola nada dalam satu bar
def evaluate_pitch_pattern(bar):
    # Menghapus whitespace dan tanda yang tidak relevan
    bar = re.sub(r'\s+', '', bar)
    
    # Ekstrak notasi nada
    notes = re.findall(r'[A-Ga-g][,\']*\d*', bar)
    
    if not notes:
        return 'flat'
    
    # Convert notes to relative pitch values (A=0, B=1, C=2, ..., G=6)
    pitch_values = []
    for note in notes:
        base_pitch = note[0].upper()
        base_value = 'CDEFGAB'.index(base_pitch)
        pitch_values.append(base_value)
    
    if len(pitch_values) < 2:
        return 'flat'
    
    # Evaluate pitch pattern
    increasing = decreasing = flat = False
    for i in range(1, len(pitch_values)):
        if pitch_values[i] > pitch_values[i-1]:
            increasing = True
        elif pitch_values[i] < pitch_values[i-1]:
            decreasing = True
        else:
            flat = True
    
    if increasing and not decreasing:
        return 'up'
    elif decreasing and not increasing:
        return 'down'
    else:
        return 'flat'

# Fungsi untuk melabeli setiap bar dengan pola nada
def label_bars_with_pitch_pattern(abc_notation):
    bars = split_bars(abc_notation)
    labels = [evaluate_pitch_pattern(bar) for bar in bars]
    return bars, labels

# Membagi ABC notation menjadi bars dan melabelinya
songDf['bars_labels'] = songDf['abc_notation'].apply(label_bars_with_pitch_pattern)

## Rekonstruksi Dataset

In [32]:
# Fungsi untuk merapikan dataframe agar lebih mudah dibaca
def expand_bars_and_labels(df):
    rows = []
    for _, row in df.iterrows():
        bars, labels = row['bars_labels']
        for i, (bar, label) in enumerate(zip(bars, labels)):
            rows.append({
                'index': row['indeks_lagu (X)'].strip(),
                'judul_lagu': row['judul_lagu (T)'].strip(),
                'bar_number': i + 1,
                'bar': bar.strip(),
                'pitch_pattern': label,
                'birama': row['birama (M)'],
                'panjang_note': row['panjang_note (L)'],
                'ritme': row['ritme (N)'],
                'kunci': row['kunci (K)']
            })
    return pd.DataFrame(rows)

# Mengembangkan dataframe dengan setiap bar dan labelnya
expandedDf = expand_bars_and_labels(songDf)

## Deteksi dan Hapus String Kosong

In [33]:
# Fungsi untuk mendeteksi string kosong dan menghapus baris yang mengandungnya
def hapus_baris_kosong(df):
    # Deteksi nilai kosong (string kosong) dalam DataFrame
    empty_mask = df == ''

    # Menentukan baris yang memiliki setidaknya satu nilai kosong
    rows_with_empty_values = empty_mask.any(axis=1)

    # Menghapus baris yang memiliki nilai kosong
    df_cleaned = df[~rows_with_empty_values].reset_index(drop=True)

    return df_cleaned

cleanedDf = hapus_baris_kosong(expandedDf)

In [34]:
cleanedDf.head(10)

Unnamed: 0,index,judul_lagu,bar_number,bar,pitch_pattern,birama,panjang_note,ritme,kunci
0,1,The Enchanted Valley,1,G3-A (Bcd=e),flat,2/4,1/16,Very slow,Gm
1,1,The Enchanted Valley,2,f4 (g2dB),flat,2/4,1/16,Very slow,Gm
2,1,The Enchanted Valley,3,({d}c3-B) G2-E2,flat,2/4,1/16,Very slow,Gm
3,1,The Enchanted Valley,4,F4 (D2=E^F),flat,2/4,1/16,Very slow,Gm
4,1,The Enchanted Valley,5,G3-A (Bcd=e),flat,2/4,1/16,Very slow,Gm
5,1,The Enchanted Valley,6,f4 d2-f2,flat,2/4,1/16,Very slow,Gm
6,1,The Enchanted Valley,7,(g2a2 b2).g2,flat,2/4,1/16,Very slow,Gm
7,1,The Enchanted Valley,8,{b}(a2g2 f2).d2,down,2/4,1/16,Very slow,Gm
8,1,The Enchanted Valley,9,(d2{ed}c2) B2B2,flat,2/4,1/16,Very slow,Gm
9,1,The Enchanted Valley,10,(A2G2 {AG}F2).D2,flat,2/4,1/16,Very slow,Gm


In [35]:
# Fungsi untuk mendeteksi string kosong dalam setiap baris dan kolom
def deteksi_kosong(df):
    # Deteksi nilai kosong (string kosong) dalam DataFrame
    empty_mask = df == ''

    # Menampilkan baris dan kolom yang mengandung nilai kosong
    for row_idx, row in enumerate(empty_mask.values):
        for col_idx, is_empty in enumerate(row):
            if is_empty:
                print(f"Nilai kosong ditemukan pada baris {row_idx + 1}, kolom '{df.columns[col_idx]}'")
                # Memanggil fungsi deteksi_kosong
print("\nDeteksi nilai kosong:")
deteksi_kosong(cleanedDf)


Deteksi nilai kosong:


In [36]:
cleanedDf.to_csv('data/new/cleaned.csv')

# Implementasi Pretrained Model

In [37]:
# Ensure that the dataset is sorted by song and bar number for coherence
df = cleanedDf.sort_values(by=['judul_lagu', 'bar_number'])

# Function to filter dataframe by rhythm and key
def filter_df(df, rhythms, keys):
    filtered_df = df[df['ritme'].isin(rhythms) | df['kunci'].isin(keys) ]
    return filtered_df

# Prepare a structured input string by concatenating bar information with their labels
def prepare_input(df, max_bars=100):
    input_data = []
    for index, row in df.iterrows():
        # Create a compressed string that combines the bar with its characteristics
        bar_info = f"Bar: {row['bar']} | Pitch: {row['pitch_pattern']} | Key: {row['kunci']} | Rhythm: {row['ritme']}"
        input_data.append(bar_info)
    
    # If the input data exceeds max_bars, randomly sample max_bars from the data
    if len(input_data) > max_bars:
        input_data = random.sample(input_data, max_bars)
    
    return '\n'.join(input_data)

# Define desired rhythms and keys for filtering
desired_rhythms = ['Slow with expression', 'With force']
desired_keys = ['C']

# Filter dataframe
filtered_df = filter_df(df, desired_rhythms, desired_keys)

# Prepare input data from the filtered dataset with a limit of 100 bars
input_data = prepare_input(filtered_df, max_bars=100)

print(input_data)

Bar: "G7"GG AB | Pitch: up | Key: C | Rhythm: Moderate
Bar: g2 A e2 A | Pitch: flat | Key: C | Rhythm: Cheerfully
Bar: "C"GEC "F"F2E | Pitch: flat | Key: C | Rhythm: Lively
Bar: "D"f2f f3 | Pitch: up | Key: C | Rhythm: Lively
Bar: "C"c2 -c/2B/2c | Pitch: flat | Key: C | Rhythm: Moderate
Bar: fd2f | Pitch: flat | Key: C | Rhythm: Moderate
Bar: "C/(pl"c2 c2 | Pitch: flat | Key: C | Rhythm: Moderate
Bar: \
"Dm"dd/2e/2 "Dm7/c"f/2e/2f/2g/2 | Pitch: flat | Key: C | Rhythm: Moderate
Bar: "Dm"d3G | Pitch: up | Key: C | Rhythm: Moderate
Bar: "F"A2f ffA | Pitch: flat | Key: C | Rhythm: Lively
Bar: "C"g2c B2c | Pitch: flat | Key: C | Rhythm: Lively
Bar: e>dc (c<g)e | Pitch: flat | Key: C | Rhythm: Cheerfully
Bar: c>AG>F EC C | Pitch: flat | Key: C | Rhythm: Moderate
Bar: \
"C"BA "D"AB/2A/2 | Pitch: flat | Key: C | Rhythm: Moderate
Bar: G | Pitch: flat | Key: C | Rhythm: Cheerfully
Bar: (B/c/) \ | Pitch: down | Key: G | Rhythm: With force
Bar: "G"G^FG d^cd | Pitch: flat | Key: C | Rhythm: Lively
B

In [38]:
from openai import OpenAI

client = OpenAI(
  base_url = "https://integrate.api.nvidia.com/v1",
  api_key = "nan"
)

formatNote = "Bar: BAG Bcd | Pitch Pattern: flat | Key: Gm | Rhythm: Spirited"
instructionNote = "Randomly shuffle the bars to create a new piece, maintaining the same overall rhythm and pitch pattern"


# Create a function to generate new music based on labeled input
def generate_music_with_labels(input_data, temperature=0.5, max_tokens=2048):
    completion = client.chat.completions.create(
        model="meta/llama3-8b-instruct",
        messages=[{"role": "user", 
    "content": f'Can you generate result from {instructionNote}? notation based on this data {input_data} with format like this example {formatNote}'}],
        temperature=temperature,
        top_p=1,
        max_tokens=max_tokens,
        stream=True
    )

    generated_music = ""

    for chunk in completion:
        if chunk.choices[0].delta.content is not None:
            generated_music += chunk.choices[0].delta.content

    return generated_music

# Generate new music
new_music = generate_music_with_labels(input_data)
print(new_music)

I can generate a new piece by randomly shuffling the bars while maintaining the same overall rhythm and pitch pattern. Here's the result:

**New Piece**

Bar: "G7"GG AB | Pitch Pattern: up | Key: C | Rhythm: Moderate
Bar: cBAB G2(ed) | Pitch Pattern: flat | Key: C | Rhythm: Lively
Bar: "Dm"dd/2e/2 "Dm7/c"f/2e/2f/2g/2 | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: (cA)(GE) G2 c>d | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: "C"G3/2A/2 GF | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: e6 e2 | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: "F"A2f ffA | Pitch Pattern: flat | Key: C | Rhythm: Lively
Bar: GEG GEG | Pitch Pattern: flat | Key: C | Rhythm: Cheerfully
Bar: "C"e/2c3/2 "F"ag | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: afd dBG | Pitch Pattern: flat | Key: C | Rhythm: Cheerfully
Bar: "C"GEC "F"F2E | Pitch Pattern: flat | Key: C | Rhythm: Lively
Bar: "D"AB/2A/2 | Pitch Pattern: flat | Key: C | Rhythm: Moderate
Bar: "G"B/2B/2[g/2B/2]B/2 B/2[a

In [39]:
def extract_bar_notes(text):
    # Define the regex pattern to match the notes in the "Bar" section
    pattern = r'Bar:\s*([^|]+)\s*\|'
    matches = re.findall(pattern, text)
    return ' | \n'.join(matches) if matches else None

newMusic = extract_bar_notes(new_music)
print(newMusic)

"G7"GG AB  | 
cBAB G2(ed)  | 
"Dm"dd/2e/2 "Dm7/c"f/2e/2f/2g/2  | 
(cA)(GE) G2 c>d  | 
"C"G3/2A/2 GF  | 
e6 e2  | 
"F"A2f ffA  | 
GEG GEG  | 
"C"e/2c3/2 "F"ag  | 
afd dBG  | 
"C"GEC "F"F2E  | 
"D"AB/2A/2  | 
"G"B/2B/2[g/2B/2]B/2 B/2[a/2B/2]B/2B/2  | 
(ea)({b}a^g) (ea)({b}ag)  | 
"C"cc de  | 
"Bb"d3^d  | 
E2G c2e  | 
"C"g2g gfe  | 
G2E A2E  | 
"C"e/2"F"g(3a/4g/4f/4  | 
"G7"b3 a2f  | 
e>cc c2  | 
"C"BA "D"AB/2A/2  | 
(G2 G)  | 
"Dm"[F2d2][Ge]  | 
"C"ede G2G 


In [40]:
from openai import OpenAI