In [3]:
import pandas as pd
import math
import ast
from tqdm.auto import tqdm

In [24]:
# Pitch class (0 = C, 1 = C#/Db, ... 11 = B) for every note spelling accepted
# in the default notation.  Row i of the table below lists the spellings
# that map to pitch class i.
# '\u0421' is the Cyrillic capital letter Es, which is visually identical to
# the Latin 'C'; it is listed so that note names typed with it still resolve.
note2number = {
    spelling: pitch_class
    for pitch_class, spellings in enumerate((
        ('B#', 'C', '\u0421'),          # 0
        ('C#', '\u0421#', 'Db'),        # 1
        ('D',),                         # 2
        ('D#', 'Eb'),                   # 3
        ('E', 'Fb'),                    # 4
        ('E#', 'F'),                    # 5
        ('F#', 'Gb'),                   # 6
        ('G',),                         # 7
        ('G#', 'Ab'),                   # 8
        ('A',),                         # 9
        ('A#', 'Bb'),                   # 10
        ('B', 'Cb', '\u0421b'),         # 11
    ))
    for spelling in spellings
}

# Pitch-class table used when a song is re-read with alt=True (the caller
# switches to it after meeting a chord that starts with 'H').  It differs
# from note2number in that 'B' is pitch class 10, 'B#' is 11, and 'H' (11)
# is accepted.  Row i of the table lists the spellings of pitch class i.
# '\u0421' is the Cyrillic capital letter Es, a look-alike of the Latin 'C'.
note2number_alt = {
    spelling: pitch_class
    for pitch_class, spellings in enumerate((
        ('C', '\u0421'),                    # 0
        ('C#', '\u0421#', 'Db'),            # 1
        ('D',),                             # 2
        ('D#', 'Eb'),                       # 3
        ('E', 'Fb'),                        # 4
        ('E#', 'F'),                        # 5
        ('F#', 'Gb'),                       # 6
        ('G',),                             # 7
        ('G#', 'Ab'),                       # 8
        ('A',),                             # 9
        ('A#', 'Bb', 'B'),                  # 10
        ('B#', 'H', 'Cb', '\u0421b'),       # 11
    ))
    for spelling in spellings
}

def key2number(key):
    """Split a key name into its tonic pitch class and its mode.

    The key is classified purely by its length:

    * more than two characters (e.g. ``C#m``): minor, tonic is the first
      two characters;
    * exactly two characters: major if the whole string is a known note
      name (e.g. ``Db``), otherwise minor with a one-character tonic
      (e.g. ``Dm``);
    * anything shorter (e.g. ``C``): major, tonic is the whole string.

    Returns:
        ``(tonic, tonality)`` where ``tonic`` is the ``note2number`` value
        and ``tonality`` is ``'maj'`` or ``'min'``; ``(-1, -1)`` when the
        tonic is not a key of ``note2number``.
    """
    size = len(key)
    if size > 2:
        note_name, mode = key[:2], 'min'
    elif size == 2 and key not in note2number:
        note_name, mode = key[:1], 'min'
    else:
        note_name, mode = key, 'maj'

    try:
        return note2number[note_name], mode
    except KeyError:
        # Unknown note name: callers treat a tonic of -1 as "skip this song".
        return -1, -1

def _leading_note_length(text, table):
    """Return the prefix length (2, then 1) of ``text`` found in ``table``.

    Returns 0 when neither prefix is a known note name.  For a
    one-character ``text`` the two-character probe already matches, so 2 is
    returned; slicing with it still yields the same single character.
    """
    for length in (2, 1):
        if text[:length] in table:
            return length
    return 0


def chord2number(chord, tonic, alt=False):
    """Rewrite a chord so its notes are semitone offsets from ``tonic``.

    The root note name is replaced by ``"<offset>-"`` and, if the text after
    the last ``/`` starts with a note name, that bass note is replaced by
    its offset as well.  Everything else in the chord is copied unchanged,
    e.g. ``chord2number('Am/G', 0)`` gives ``'9-m/7'``.

    Args:
        chord: Chord symbol whose first one or two characters name the root.
        tonic: Pitch class the offsets are measured from.  May be a float
            (the caller derives it from a capo column); the offset is
            truncated to an int before formatting.
        alt: Look note names up in ``note2number_alt`` instead of
            ``note2number``.

    Returns:
        The converted chord string, or ``-1`` when the chord does not start
        with a known note name.
    """
    table = note2number_alt if alt else note2number

    root_length = _leading_note_length(chord, table)
    if root_length == 0:
        return -1  # unknown root: caller skips the song

    # Modulo keeps the offset in 0..11 even when tonic itself lies outside
    # that range; a single "+ 12" correction does not.
    root_relative = (table[chord[:root_length]] - tonic) % 12
    converted = f'{int(root_relative)}-{chord[root_length:]}'

    # Only the last slash is considered.  It can never be at index 0 because
    # the string now starts with the root offset.
    slash_index = converted.rfind('/')
    if slash_index == -1:
        return converted

    bass_start = slash_index + 1
    bass_length = _leading_note_length(converted[bass_start:], table)
    if bass_length == 0:
        # Text after the slash is not a note name (e.g. "6/9"): leave as is.
        return converted

    bass_end = bass_start + bass_length
    bass_relative = (table[converted[bass_start:bass_end]] - tonic) % 12
    return converted[:bass_start] + str(int(bass_relative)) + converted[bass_end:]

def convert_chords(chords, tonic, alt=False):
    """Convert every chord of a song and join the results with single spaces.

    Args:
        chords: Iterable of lines, each an iterable of chord strings.  Empty
            strings are ignored.
        tonic: Pitch class passed through to ``chord2number``.
        alt: Passed through to ``chord2number``; when false, a chord whose
            first character is ``'H'`` aborts the conversion so the caller
            can retry with ``alt=True``.

    Returns:
        The space-separated converted chords (``''`` when there are none),
        ``0`` if an ``'H'`` chord was met while ``alt`` is false, or ``-1``
        if ``chord2number`` rejected a chord.
    """
    converted = []
    for line in chords:
        for chord in line:
            if chord == '':
                continue
            if not alt and chord[:1] == 'H':
                return 0
            converted_chord = chord2number(chord, tonic, alt)
            if converted_chord == -1:
                return -1
            converted.append(converted_chord)
    # Joining the collected tokens puts separators only between chords.  A
    # position-based "not the last entry" check leaves a trailing space when
    # the final entry is '' or the final line is empty.
    return ' '.join(converted)

False

In [25]:
import os

decades = ['1950', '1960', '1970', '1980', '1990', '2000']
num_elements = []  # one (decade, relative, absolute, skipped) tuple per file

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs('./converted', exist_ok=True)

for decade in decades:
    print(f'--- PROCESSING FILE: {decade} ---')
    csv = pd.read_csv(f'./{decade}.csv')
    converted_list = []  # [title, mode flag, chords] for songs with a key
    absolute_list = []   # [title, chords] for songs without a key
    num_relative = 0
    num_absolute = 0
    num_skipped = 0
    for i, (row_label, row) in enumerate(tqdm(csv.iterrows(), total=len(csv))):
        # A key that is a float is treated as missing; such songs are
        # converted against a tonic of 0 and stored in the absolute list.
        absolute = isinstance(row.key, float)
        if absolute:
            tonic = 0
        else:
            tonic, tonality = key2number(row.key)
            if tonic == -1:
                print(f'key error. skipping song: {i} {row.title}')
                num_skipped += 1
                continue
            # Shift the tonic down by the capo position.  Modulo keeps the
            # result in 0..11 for any capo value, which a single "+ 12"
            # correction does not.
            if not math.isnan(row.capo):
                tonic = (tonic - row.capo) % 12

        # The chords column holds a Python list literal.
        chords = ast.literal_eval(row.chords)
        converted_chords = convert_chords(chords, tonic)

        # 0 means a chord starting with 'H' was found: retry with the
        # alternative note table.
        if converted_chords == 0:
            converted_chords = convert_chords(chords, tonic, alt=True)
        # -1 means a chord root could not be recognised.
        if converted_chords == -1:
            print(f'invalid notation. skipping song: {i}')
            num_skipped += 1
            continue

        # Append the converted song to its list.
        if absolute:
            absolute_list.append([row.title, converted_chords])
            num_absolute += 1
        else:
            # Mode flag: 0 for major, 1 for minor.
            converted_list.append([row.title, 0 if tonality == 'maj' else 1, converted_chords])
            num_relative += 1

    num_elements.append((decade, num_relative, num_absolute, num_skipped))

    absolute_df = pd.DataFrame(absolute_list)
    converted_df = pd.DataFrame(converted_list)
    absolute_df.to_csv(f'./converted/{decade}_converted_absolute.csv')
    converted_df.to_csv(f'./converted/{decade}_converted_relative.csv')

print(f'Converted {len(decades)} files.')
print('Decade / Relative / Absolute / Skipped')
for decade in num_elements:
    print(decade)

--- PROCESSING FILE: 1950 ---


  0%|          | 0/4991 [00:00<?, ?it/s]

invalid notation. skipping song: 1258
invalid notation. skipping song: 1550
invalid notation. skipping song: 1699
invalid notation. skipping song: 1945
invalid notation. skipping song: 2108
invalid notation. skipping song: 3350
invalid notation. skipping song: 3821
invalid notation. skipping song: 3980
invalid notation. skipping song: 4199
invalid notation. skipping song: 4656
invalid notation. skipping song: 4678
invalid notation. skipping song: 4684
invalid notation. skipping song: 4685
invalid notation. skipping song: 4752
invalid notation. skipping song: 4753
invalid notation. skipping song: 4754
invalid notation. skipping song: 4757
--- PROCESSING FILE: 1960 ---


  0%|          | 0/5000 [00:00<?, ?it/s]

invalid notation. skipping song: 2522
invalid notation. skipping song: 3330
invalid notation. skipping song: 4189
--- PROCESSING FILE: 1970 ---


  0%|          | 0/5000 [00:00<?, ?it/s]

invalid notation. skipping song: 3891
invalid notation. skipping song: 4469
--- PROCESSING FILE: 1980 ---


  0%|          | 0/5000 [00:00<?, ?it/s]

invalid notation. skipping song: 4328
invalid notation. skipping song: 4993
--- PROCESSING FILE: 1990 ---


  0%|          | 0/5000 [00:00<?, ?it/s]

invalid notation. skipping song: 1870
invalid notation. skipping song: 1954
invalid notation. skipping song: 2525
invalid notation. skipping song: 2526
invalid notation. skipping song: 2809
invalid notation. skipping song: 3358
--- PROCESSING FILE: 2000 ---


  0%|          | 0/5000 [00:00<?, ?it/s]

Converted 6 files.
Decade / Relative / Absolute / Skipped
('1950', 1230, 3744, 17)
('1960', 922, 4075, 3)
('1970', 876, 4122, 2)
('1980', 882, 4116, 2)
('1990', 797, 4197, 6)
('2000', 748, 4252, 0)
