<a href="https://colab.research.google.com/github/jessica-aaao/ChordsExtractor/blob/main/ChordExtractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Install the third-party packages this notebook depends on. These are shell
# commands (IPython `!` syntax); `-q` keeps pip quiet, `-U` upgrades to the latest release.
!python3 -m pip install -q -U "yt-dlp[default]"
!pip install -q -U openai-whisper
!pip install -q -U demucs
!pip install -q -U pychord

In [2]:
import requests
import json
import pandas as pd
import os
import re
import unicodedata
from difflib import SequenceMatcher
from pychord import Chord
from collections import Counter

In [3]:
from google.colab import drive
from IPython.display import display
from bs4 import BeautifulSoup

In [None]:
# Clone the two external repositories used later in the notebook:
# `lyrics-sync` (entered by get_timestamps) and the ISMIR2019 chord
# recognizer (entered by extract_chords_from). Both land in the current directory.
!git clone  https://github.com/mikezzb/lyrics-sync.git
!git clone https://github.com/filipecalegario/ISMIR2019-Large-Vocabulary-Chord-Recognition.git

In [None]:
# Mount Google Drive under /content/drive; get_urls reads its songs CSV from there.
drive.mount('/content/drive')

# Common

In [6]:
class SongUrls:
    """Plain record bundling a song's name with its audio, lyrics and chords URLs."""

    def __init__(self, name, audio, lyrics, chords):
        # Values are stored as-is on public attributes; the getters below expose them.
        self.name, self.audio = name, audio
        self.lyrics, self.chords = lyrics, chords

    def get_name(self):
        """Return the song name given at construction."""
        return self.name

    def get_audio_url(self):
        """Return the audio URL given at construction."""
        return self.audio

    def get_lyrics_url(self):
        """Return the lyrics URL given at construction."""
        return self.lyrics

    def get_chords_url(self):
        """Return the chords URL given at construction."""
        return self.chords

In [7]:
def get_urls(file_path='/content/drive/My Drive/TCC/CodeData/songs.csv'):
    """
    Load the CSV file holding the songs' information.

    Args:
        file_path (str): Location of the CSV file. Defaults to the project's
            file on the mounted Google Drive, so existing no-argument calls
            behave as before.

    Returns:
        pandas.DataFrame: The parsed CSV contents, one row per song.
    """
    print('Fetching urls...\n\n')

    songs = pd.read_csv(file_path)

    print('Urls fetched!\n\n')

    return songs

In [8]:
def get_songs_from_csv():
    """
    Build one SongUrls per row of the songs CSV.

    The 'Song Name' column is slugified so it can be used as a file name; the
    'Audio URL', 'Lyrics URL' and 'Chords URL' columns are passed through unchanged.

    Returns:
        list: SongUrls objects in CSV row order.
    """
    songs = get_urls()

    print('Creating SongUrls...\n\n')

    song_urls = [
        SongUrls(
            slugify(row['Song Name']),
            row['Audio URL'],
            row['Lyrics URL'],
            row['Chords URL'],
        )
        for _, row in songs.iterrows()
    ]

    print('SongUrls created!\n\n')

    return song_urls


In [9]:
def slugify(raw_song_name):
    """
    Turn a song title into a string that is safe to use as a file name.

    The title is lower-cased, accents are stripped, every run of characters
    other than a-z / 0-9 becomes a single underscore, and leading/trailing
    underscores are removed.
    """
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII round-trip below drops only the marks.
    decomposed = unicodedata.normalize('NFKD', raw_song_name.lower())
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    return re.sub(r'[^a-z0-9]+', '_', ascii_only).strip('_')

In [10]:
def save_to_file(data, folder_path, song_name):
    """
    Save text to `<folder_path>/<song_name>.txt`, creating the folder if needed.

    Args:
        data (str): Text to write.
        folder_path (str): Destination folder; created (with parents) when missing.
        song_name (str): File name without extension.

    Returns:
        str: Path of the written file.
    """
    path = f"{folder_path}/{song_name}.txt"

    os.makedirs(folder_path, exist_ok=True)

    # Explicit UTF-8: the files written here are read back with
    # encoding='utf-8' elsewhere in the notebook, and lyrics contain accents.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(data)

    print(f'Saved as {path}')

    return path

# Sound

In [11]:
def extract_sound_recording(youtube_url, song_name):
    """
    Download a YouTube video's audio track as WAV using yt-dlp.

    The command is run with an argument list rather than through a shell
    line, so characters such as `&` or `?` in the URL are passed to yt-dlp
    verbatim instead of being interpreted by the shell.

    Args:
        youtube_url (str): URL of the video.
        song_name (str): Name used for the output file.

    Returns:
        str: `/content/audios/<song_name>.wav`. The path is returned even when
            yt-dlp reports a failure; a message is printed in that case.

    Raises:
        FileNotFoundError: If the `yt-dlp` executable is not on PATH.
    """
    # Local import keeps this cell self-contained.
    import subprocess

    print(f'Extracting audio from {youtube_url}...\n\n')

    cookies_path = '/content/cookies.txt'
    output_path = f"/content/audios/{song_name}.wav"

    command = [
        "yt-dlp", youtube_url,
        "--audio-format", "wav",
        "--cookies", cookies_path,
        "-x",
        "-o", output_path,
        "-q",
    ]
    completed = subprocess.run(command, capture_output=True, text=True, check=False)

    if completed.returncode != 0:
        # Surface the failure instead of announcing a file that may not exist.
        print(f'yt-dlp exited with status {completed.returncode}; {output_path} may be missing.')
        if completed.stderr:
            print(completed.stderr)
    else:
        print(f'Audio saved as {output_path}!\n\n')

    return output_path


# Chords

In [12]:
def extract_chords_from(song_paths, song_names):
    """
    Extract timestamped chords from each audio file.

    Runs the `chord_recognition.py` script from the cloned
    ISMIR2019-Large-Vocabulary-Chord-Recognition directory once per song and
    collects the `.lab` output paths.

    Args:
        song_paths: Audio file paths, paired positionally with `song_names`.
        song_names: Names used for the output files.

    Returns:
        list: One `/content/chords/<song_name>.lab` path per song, in input order.
            The paths are built here; whether the script actually produced
            each file is not checked.
    """

    output_folder = "/content/chords"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    output_paths = []

    # The directory name is relative, so this must run from the directory that
    # contains the cloned repository. Its requirements are installed on every call.
    %cd ISMIR2019-Large-Vocabulary-Chord-Recognition
    !pip install -q -r requirements.txt

    for song_path, song_name in zip(song_paths, song_names):
        print(f"Extracting chords from {song_path}...\n\n")
        output_path = f"{output_folder}/{song_name}.lab"
        # Paths are interpolated into the shell line unquoted.
        !python chord_recognition.py {song_path} {output_path}
        print(f"Chords extracted to {output_path}!\n\n")

        output_paths.append(output_path)

    # Step back out of the repository directory.
    %cd ..

    return output_paths

# Lyrics

## Lyrics Extraction from Webpage

In [13]:
def extract_lyrics_from_html(html):
    """
    Extract the lyrics text from a parsed lyrics page.

    Args:
        html: Parsed page object exposing `find` (e.g. a BeautifulSoup
            document). Note that `<br>` tags inside the lyrics paragraphs are
            replaced in place with newline characters.

    Returns:
        str: The text of every `<p>` inside `<div class="lyric-original">`,
            each paragraph followed by a newline.

    Raises:
        ValueError: If the page has no `<div class="lyric-original">`.
    """

    print('Fetching lyrics...!\n\n')

    lyrics_tag = html.find('div', class_='lyric-original')
    if lyrics_tag is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("Lyrics container <div class='lyric-original'> not found in the page.")

    paragraphs = []
    for p in lyrics_tag.find_all('p'):
        # Line breaks inside a paragraph separate verses; keep them as newlines.
        for br in p.find_all('br'):
            br.replace_with('\n')
        paragraphs.append(p.get_text() + "\n")

    print('Lyrics fetched!\n\n')

    return "".join(paragraphs)


In [14]:
def get_lyrics_from_webpage(lyric_urls, song_names, timeout=30):
    """
    Download each lyrics page, extract the lyrics and save them to disk.

    Args:
        lyric_urls: Lyrics page URLs, paired positionally with `song_names`.
        song_names: Names used for the saved files.
        timeout (float): Seconds to wait for each HTTP response. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        list: Paths of the saved lyrics files under `/content/lyrics`.

    NOTE: a page that does not answer with HTTP 200 is reported and skipped,
    so the returned list can be shorter than `song_names`. Callers that zip
    the result with other per-song lists must account for that.
    """

    lyrics_paths = []
    folder_path = "/content/lyrics"

    for lyric_url, song_name in zip(lyric_urls, song_names):
        print(f'Fetching webpage {lyric_url}...\n\n')
        response = requests.get(lyric_url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to fetch {song_name} webpage. Status code: {response.status_code}\n\n")
            continue

        print('Webpage fetched!\n\n')
        html_content = BeautifulSoup(response.content, 'html.parser')

        lyrics = extract_lyrics_from_html(html_content)
        lyrics_paths.append(save_to_file(lyrics, folder_path, song_name))

    return lyrics_paths

## Lyrics Sync to Audio

In [15]:
def create_output_folder():
    """
    Create the result folders used by lyrics-sync.

    Ensures `/content/lyrics-sync/output` and its `vocals`, `words` and `lrc`
    sub-folders exist; folders that are already present are left untouched.
    """
    output_folder = "/content/lyrics-sync/output"
    required_folders = (
        output_folder,
        output_folder + "/vocals",
        output_folder + "/words",
        output_folder + "/lrc",
    )

    for folder in required_folders:
        if not os.path.exists(folder):
            os.makedirs(folder)

In [16]:
def get_timestamps(audio_paths, lyrics_paths, song_names):
    """
    Obtém os timestamps das palavras da letra
    """
    print("Installing conda...")
    !wget -c https://repo.continuum.io/archive/Anaconda3-2024.10-1-Linux-x86_64.sh
    !chmod +x Anaconda3-2024.10-1-Linux-x86_64.sh
    !bash ./Anaconda3-2024.10-1-Linux-x86_64.sh -b -f -p /usr/local
    print("Conda installed!")

    print("Installing lsync...")
    %cd lyrics-sync/
    !conda env update -f environment.yml
    !source activate lsync

    print("Lsync installed!")
    from lsync import LyricsSync

    print("Extracting timestamps...")

    lsync = LyricsSync()

    timestamps_paths = []

    for lyrics_path, audio_path, song_name in zip(lyrics_paths, audio_paths, song_names):
        words, lrc = lsync.sync(audio_path, lyrics_path)
        words_path = f"/content/lyrics-sync/output/words/{song_name}.csv"

        timestamps_paths.append(words_path)

    print("Timestamps extracted!")

    return timestamps_paths
    %cd ..

In [17]:
def get_synced_lyrics(lyric_urls, audio_paths, song_names):
    """
    Fetch the lyrics and synchronise them with the audio.

    Returns:
        tuple: `(lyrics_paths, timestamps_paths)` — the saved lyrics files and
            the word-timestamp files produced for them.
    """

    print("Getting synced lyrics...\n\n")

    # Step 1: scrape and save the lyrics; step 2: align them with the audio.
    lyrics_paths = get_lyrics_from_webpage(lyric_urls, song_names)
    timestamps_paths = get_timestamps(audio_paths, lyrics_paths, song_names)

    print("Synced lyrics ready!\n\n")

    return lyrics_paths, timestamps_paths

# Chord Sheets

## Chords Parsing

In [18]:
def translate_chord(chord):
    """
    Rewrite a chord label from the recognition model into the simpler
    spelling commonly used in chord sheets.

    The substitutions are applied in order, each on the result of the
    previous one. Note that removing "maj" turns "maj7" into plain "7".
    """
    substitutions = (
        (":", ""),
        ("min", "m"),
        ("maj", ""),
        ("hdim7", "m7(b5)"),
        ("hdim", "m7(b5)"),
        ("sus4(b7)", "7sus4"),
    )

    for old, new in substitutions:
        chord = chord.replace(old, new)

    return chord

In [19]:
def parse_chords(chords_path):
    """
    Read a `.lab` file and return its chords, simplified, as a list of dicts.

    Each non-blank line must hold three whitespace-separated fields:
    start time, end time and chord label. Lines labelled 'N' (no chord) and
    blank lines are skipped.

    Args:
        chords_path (str): Path to the `.lab` file.

    Returns:
        list: Dicts with keys "start" (float), "end" (float) and "chord"
            (label after `translate_chord`), in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            a time cannot be parsed as a float.
    """
    print(f"Parsing chords from {chords_path}...")

    chords = []
    with open(chords_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # A trailing or stray blank line used to crash the unpack below.
                continue

            start, end, chord = fields
            if chord == 'N':
                continue

            chords.append({
                "start": float(start),
                "end": float(end),
                "chord": translate_chord(chord)
            })

    print("Chords parsed!\n\n")

    return chords

## Lyrics Parsing

In [20]:
def parse_lyrics(timestamped_lyrics_path, lyrics_path):
    """
    Assign each timestamped word to a line of the scraped lyrics.

    The timestamp CSV (columns "label", "start", "end") is consumed in order:
    for every line of the lyrics file, as many CSV rows as that line has
    whitespace-separated words are taken and tagged with the line's index.

    Returns:
        list: Dicts with keys "word", "start", "end" and "line".
    """
    print(f"Parsing lyrics from {lyrics_path}...")

    word_table = pd.read_csv(timestamped_lyrics_path)
    timestamped_per_line = []
    cursor = 0  # position of the first CSV row not yet assigned to a line

    with open(lyrics_path, 'r', encoding='utf-8') as file:
        for line_number, text in enumerate(file):
            word_count = len(text.strip().split())

            rows_for_line = word_table.iloc[cursor:cursor + word_count]
            cursor += word_count

            for _, row in rows_for_line.iterrows():
                start, end, label = row["start"], row["end"], row["label"]
                timestamped_per_line.append({
                    "word": label,
                    "start": start,
                    "end": end,
                    "line": line_number
                })

    print('Lyrics Parsed!\n\n')

    return timestamped_per_line

## General

In [21]:
def align_chord_over_word(word_info, chord_info, line_start):
    """
    Compute where a chord should sit above a word.

    The chord's start time is mapped linearly onto the word's characters: a
    chord starting at the word's start lands on the first character, one
    starting at the word's end lands on the last.

    Args:
        word_info (dict): Needs "word", "start" and "end".
        chord_info (dict): Needs "chord" and "chord_start".
        line_start (int): Offset of the word within its line.

    Returns:
        tuple: `(chord, chord_position)` where `chord_position` is
            `line_start` plus the character index inside the word.
    """
    word_start = word_info["start"]
    word_end = word_info["end"]
    chord = chord_info["chord"]
    chord_start = chord_info["chord_start"]

    word_duration = word_end - word_start
    if word_duration != 0:
        ratio = (chord_start - word_start) / word_duration
    else:
        # A zero-length word has no interior to interpolate over; pin the
        # chord to its first character instead of dividing by zero.
        ratio = 0.0

    word_len = len(word_info["word"])
    word_index = int(round(ratio * (word_len - 1)))
    # Clamp to the word's characters (and to 0 for an empty word).
    word_index = max(0, min(word_index, word_len - 1))

    return (chord, line_start + word_index)

In [22]:
def overlay_chords_on_transcribed(lyrics_per_line, chords):
    """
    Attach chords to lyric words based on the chords' timestamps.

    Words and chords are both walked once, in order. For each word, the
    pending chords are consumed until one starts after the word ends:

    * a chord starting inside the word is attached to the word;
    * a chord starting between the previous word and this one, or one that
      started earlier but is still sounding when the word starts, is emitted
      as a placeholder entry (word " ") placed before the word;
    * any other chord (already finished before the word) is dropped.

    Args:
        lyrics_per_line (list): Dicts with "word", "start", "end" and "line".
        chords (list): Dicts with "start", "end" and "chord", sorted by time.
            The list is not modified.

    Returns:
        list: Dicts with "word", "start", "end", "chords" (list of
            {"chord", "chord_start"}) and "line", in output order.
    """
    print("Overlaying chords on lyrics...")

    result = []
    previous_end = None
    # Index of the first chord not yet consumed. A cursor replaces the former
    # `chords.pop(0)`, which mutated the caller's list and cost O(n) per chord.
    next_chord = 0

    for word_info in lyrics_per_line:
        start = word_info["start"]
        end = word_info["end"]
        word = word_info["word"]
        line_number = word_info["line"]

        associated_chords = []

        if start is not None and end is not None:
            while next_chord < len(chords):
                chord_info = chords[next_chord]
                chord_start = chord_info["start"]
                chord_end = chord_info["end"]
                chord_entry = {
                    "chord": chord_info["chord"],
                    "chord_start": chord_start
                }

                if chord_start > end:
                    # Belongs to a later word; leave it pending.
                    break

                if start <= chord_start <= end:
                    # Starts inside the word.
                    associated_chords.append(chord_entry)
                elif (
                    (previous_end is not None and previous_end < chord_start < start)
                    or chord_end >= start
                ):
                    # Starts in the gap before the word, or is still sounding
                    # when the word begins.
                    result.append({
                        "word": " ",
                        "start": start,
                        "end": end,
                        "chords": [chord_entry],
                        "line": line_number
                    })

                next_chord += 1

        result.append({
            "word": word,
            "start": start,
            "end": end,
            "chords": associated_chords,
            "line": line_number
        })

        previous_end = end

    print("Chords overlayed!\n\n")

    return result

In [44]:
def format_transcribed_with_chords(lyrics_with_chords):
    """
    Render the chord sheet: a chord line above each lyrics line.

    Args:
        lyrics_with_chords (list): Dicts with "word", "chords" and "line" (and
            the timing keys needed by `align_chord_over_word`), in output
            order. Entries whose word is " " are chord-only placeholders.

    Returns:
        str: For every lyrics line, the chord line, the lyrics line and (between
            lines) an empty line, joined with newlines. Empty input yields "".
    """
    if not lyrics_with_chords:
        # Nothing to format; previously this raised IndexError below.
        return ""

    formatted_output = []
    chord_line = []
    lyrics_line = []

    current_line = lyrics_with_chords[0]["line"]
    line_start = 0

    for word_info in lyrics_with_chords:
        if word_info["line"] != current_line:
            # A new lyrics line begins: flush the finished one.
            current_line = word_info["line"]
            formatted_output.append("".join(chord_line))
            formatted_output.append(" ".join(lyrics_line))
            formatted_output.append("")

            chord_line = []
            lyrics_line = []
            line_start = 0

        word = word_info["word"]
        chords = word_info["chords"]

        if chords:
            for chord_info in chords:
                if word == " ":
                    # Placeholder entry: the chord goes straight onto the chord line.
                    chord_line.append(chord_info["chord"])
                    line_start += 1
                else:
                    chord, chord_position = align_chord_over_word(word_info, chord_info, line_start)

                    # NOTE: `chord_line` is a list of fragments, so its length
                    # counts entries (a chord name is one entry), not characters.
                    while len(chord_line) < chord_position:
                        chord_line.append(" ")

                    # Keep adjacent chord names from running together.
                    if chord_line and chord_line[-1] != " ":
                        chord_line.append(" ")

                    chord_line.append(chord)

            line_start += len(word)
        else:
            # No chord over this word: pad the chord line by the word plus one space.
            chord_line.extend([" "] * (len(word) + 1))
            line_start += len(word)

        lyrics_line.append(word)

    # Flush whatever remains of the last line.
    if chord_line or lyrics_line:
        formatted_output.append("".join(chord_line))
        formatted_output.append(" ".join(lyrics_line))

    return "\n".join(formatted_output)

In [24]:
def save_chords_sheet(data, song_name, folder_path="/content/chord_sheets"):
    """
    Save a rendered chord sheet to `<folder_path>/<song_name>.txt`.

    Args:
        data (str): Chord sheet text.
        song_name (str): File name without extension.
        folder_path (str): Destination folder, created when missing. Defaults
            to the notebook's `/content/chord_sheets`.

    Returns:
        str: Path of the written file.
    """
    print(f"Saving chord sheet to {song_name}.txt...")

    path = f"{folder_path}/{song_name}.txt"

    os.makedirs(folder_path, exist_ok=True)

    # Explicit UTF-8: the sheet is later read back with encoding='utf-8'
    # and contains accented lyrics.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(data)

    print(f'Saved as {path}!')

    return path

In [47]:
def format_song_with_chords(timestamped_lyrics_paths, lyrics_paths, chords_paths, song_names):
    """
    Produce a traditional chord sheet (chords above lyrics) for every song.

    The four lists are paired positionally. For each song the lyrics and
    chords are parsed, overlaid, formatted and saved.

    Returns:
        list: Paths of the saved chord sheets, in input order.
    """
    formatted_chords = []
    per_song_inputs = zip(timestamped_lyrics_paths, lyrics_paths, chords_paths, song_names)

    for timestamped_lyrics_path, lyrics_path, chords_path, song_name in per_song_inputs:
        lyrics_with_chords = overlay_chords_on_transcribed(
            parse_lyrics(timestamped_lyrics_path, lyrics_path),
            parse_chords(chords_path),
        )

        sheet_text = format_transcribed_with_chords(lyrics_with_chords)
        formatted_chords.append(save_chords_sheet(sheet_text, song_name))

    return formatted_chords



---



# Evaluation

## Cifra Club Webscraping

In [26]:
def has_chords(text):
    """
    Tell whether a line contains at least one chord-like token.

    The check is purely lexical: any standalone token starting with an
    uppercase A-G that fits the chord pattern counts, so a one-letter word
    such as "A" is also reported as a chord.

    Args:
        text (str): Line of text to inspect.

    Returns:
        bool: True if the line contains a chord-like token, False otherwise.
    """
    # Root note, optional accidental, optional quality, optional digits,
    # optional parenthesised alteration.
    chords_pattern = r'\b([A-G][#b]?(m|maj|min|M|add|sus|dim|aug)?[0-9]*(\([^)]+\))?|B7|E7|A7)\b'

    return bool(re.search(chords_pattern, text.strip()))

In [27]:
def clean_cifraclub_chords(chord_sheets):
    """
    Strip tablature and unwanted sections from a Cifra Club chord sheet and
    collapse runs of blank lines.

    Processing, in order:
    * bracketed markers such as "[Intro]" and "Parte X de Y" labels are removed;
    * tablature lines (starting with E|, B|, G|, D|, A| or e|) are dropped,
      together with the last kept line when that line is not blank;
    * on a blank line, a preceding chord-looking line is dropped, and the
      blank line is kept only if the previous line was not already blank;
    * of two consecutive chord-looking lines only the later one is kept;
    * a trailing chord-looking line is dropped.

    Args:
        chord_sheets (str): Raw chord sheet text.

    Returns:
        str: The cleaned text.
    """
    # Remove markers like [Tab - Intro], [Intro], etc.
    chord_sheets = re.sub(r'\[.*?\]\n?', '', chord_sheets)
    # Remove "Parte 1 de 3"-style labels.
    chord_sheets = re.sub(r'Parte \d+ de \d+\n?', '', chord_sheets)

    lines = chord_sheets.split('\n')

    keep_lines = []
    previous_line_is_empty = False

    for line in lines:
        if re.match(r'^[EBGDAe]\|', line):  # tablature line
            if keep_lines and keep_lines[-1].split():
                keep_lines.pop()
                previous_line_is_empty = True
        else:
            if not line.strip():  # blank line
                if keep_lines and has_chords(keep_lines[-1]):
                    keep_lines.pop()
                    previous_line_is_empty = True
                if not previous_line_is_empty:
                    keep_lines.append(line)
                    previous_line_is_empty = True
            else:
                # Two chord-looking lines in a row: the earlier one is dropped.
                if keep_lines and has_chords(keep_lines[-1]) and has_chords(line):
                    keep_lines.pop()

                keep_lines.append(line)
                previous_line_is_empty = False

    # Guarded: nothing may have been kept at all (e.g. a tablature-only
    # sheet), in which case indexing [-1] would raise IndexError.
    if keep_lines and has_chords(keep_lines[-1]):
        keep_lines.pop()

    return '\n'.join(keep_lines)

In [28]:
def get_chord_sheet_from_webpage(chords_url, song_name, timeout=30):
    """
    Download a chord sheet page, extract and clean the sheet, and save it.

    Args:
        chords_url (str): URL of the chord sheet page.
        song_name (str): Name used for the saved file.
        timeout (float): Seconds to wait for the HTTP response. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        str or None: Path of the saved file under `/content/chords_evaluation`,
            or None when the page could not be fetched or has no `<pre>` element.
    """

    print(f'Fetching webpage {chords_url}...\n\n')
    response = requests.get(chords_url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}\n\n")
        return None

    print('Webpage fetched!\n\n')

    html_content = BeautifulSoup(response.content, 'html.parser')

    # The sheet is taken from the first <pre> element of the page.
    cifra_element = html_content.find('pre')

    if not cifra_element:
        print("Cifra não encontrada na página.")
        return None

    chords = clean_cifraclub_chords(cifra_element.get_text())
    folder_path = "/content/chords_evaluation"

    return save_to_file(chords, folder_path, song_name)

## Evaluation

In [29]:
# Chords with inversions and variations - Brazilian notation (no "maj").
# Keys use a scale degree for the bass ("/3" = third, "/5" = fifth); values
# list the concrete spellings treated as equivalent.
chord_inversions = {
    # Major chords with the third in the bass (/3)
    'C/3': ['C/E', 'C6/E', 'Cadd9/E'],     # E is the third of C
    'D/3': ['D/F#', 'D6/F#', 'D7/F#'],     # F# is the third of D
    'E/3': ['E/G#', 'E6/G#', 'Eadd9/G#'],  # G# is the third of E
    'F/3': ['F/A', 'F6/A', 'F7/A'],        # A is the third of F
    'G/3': ['G/B', 'G6/B', 'G7/B', 'G4/B'],        # B is the third of G
    'A/3': ['A/C#', 'A6/C#', 'A7/C#'],     # C# is the third of A
    'B/3': ['B/D#', 'B6/D#', 'B7/D#'],     # D# is the third of B

    # Minor chords with the third in the bass (/3)
    'Cm/3': ['Cm/Eb', 'Cm6/Eb', 'Cm7/Eb'],  # Eb is the third of Cm
    'Dm/3': ['Dm/F', 'Dm6/F', 'Dm7/F'],     # F is the third of Dm
    'Em/3': ['Em/G', 'Em6/G', 'Em7/G'],     # G is the third of Em
    'Fm/3': ['Fm/Ab', 'Fm6/Ab', 'Fm7/Ab'],  # Ab is the third of Fm
    'Gm/3': ['Gm/Bb', 'Gm6/Bb', 'Gm7/Bb'],  # Bb is the third of Gm
    'Am/3': ['Am/C', 'Am6/C', 'Am7/C'],     # C is the third of Am
    'Bm/3': ['Bm/D', 'Bm6/D', 'Bm7/D'],     # D is the third of Bm

    # Major chords with the fifth in the bass (/5)
    'C/5': ['C/G', 'C7/G', 'C6/G'],      # G is the fifth of C
    'D/5': ['D/A', 'D7/A', 'D6/A'],      # A is the fifth of D
    'E/5': ['E/B', 'E7/B', 'E6/B'],      # B is the fifth of E
    'F/5': ['F/C', 'F7/C', 'F6/C'],      # C is the fifth of F
    'G/5': ['G/D', 'G7/D', 'G6/D'],      # D is the fifth of G
    'A/5': ['A/E', 'A7/E', 'A6/E'],      # E is the fifth of A
    'B/5': ['B/F#', 'B7/F#', 'B6/F#'],   # F# is the fifth of B

    # Minor chords with the fifth in the bass (/5)
    'Cm/5': ['Cm/G', 'Cm7/G', 'Cm6/G'],    # G is the fifth of Cm
    'Dm/5': ['Dm/A', 'Dm7/A', 'Dm6/A'],    # A is the fifth of Dm
    'Em/5': ['Em/B', 'Em7/B', 'Em6/B'],    # B is the fifth of Em
    'Fm/5': ['Fm/C', 'Fm7/C', 'Fm6/C'],    # C is the fifth of Fm
    'Gm/5': ['Gm/D', 'Gm7/D', 'Gm6/D'],    # D is the fifth of Gm
    'Am/5': ['Am/E', 'Am7/E', 'Am6/E'],    # E is the fifth of Am
    'Bm/5': ['Bm/F#', 'Bm7/F#', 'Bm6/F#'], # F# is the fifth of Bm

    # Dominant seventh chords with the third in the bass
    'C7/3': ['C7/E', 'C9/E'],
    'D7/3': ['D7/F#', 'D9/F#'],
    'E7/3': ['E7/G#', 'E9/G#'],
    'F7/3': ['F7/A', 'F9/A'],
    'G7/3': ['G7/B', 'G9/B'],
    'A7/3': ['A7/C#', 'A9/C#'],
    'B7/3': ['B7/D#', 'B9/D#'],

    # Minor seventh chords with the third in the bass
    'Cm7/3': ['Cm7/Eb', 'Cm9/Eb'],
    'Dm7/3': ['Dm7/F', 'Dm9/F'],
    'Em7/3': ['Em7/G', 'Em9/G'],
    'Fm7/3': ['Fm7/Ab', 'Fm9/Ab'],
    'Gm7/3': ['Gm7/Bb', 'Gm9/Bb'],
    'Am7/3': ['Am7/C', 'Am9/C'],
    'Bm7/3': ['Bm7/D', 'Bm9/D'],

    # Dominant seventh chords with the fifth in the bass
    'C7/5': ['C7/G', 'C9/G'],
    'D7/5': ['D7/A', 'D9/A'],
    'E7/5': ['E7/B', 'E9/B'],
    'F7/5': ['F7/C', 'F9/C'],
    'G7/5': ['G7/D', 'G9/D'],
    'A7/5': ['A7/E', 'A9/E'],
    'B7/5': ['B7/F#', 'B9/F#'],

    # Minor seventh chords with the fifth in the bass
    'Cm7/5': ['Cm7/G', 'Cm9/G'],
    'Dm7/5': ['Dm7/A', 'Dm9/A'],
    'Em7/5': ['Em7/B', 'Em9/B'],
    'Fm7/5': ['Fm7/C', 'Fm9/C'],
    'Gm7/5': ['Gm7/D', 'Gm9/D'],
    'Am7/5': ['Am7/E', 'Am9/E'],
    'Bm7/5': ['Bm7/F#', 'Bm9/F#'],
}


def get_chord_equivalents(chord):
    """
    Return the list of spellings considered equivalent to `chord`.

    Args:
        chord (str): Chord in degree notation, e.g. "C/3".

    Returns:
        list: The entries of `chord_inversions` for `chord`, or `[chord]` when
            the chord is not in the table. A list is always returned so
            callers can iterate the result; returning the bare string made
            them iterate over its characters ("F#/3" -> "F", "#", "/", "3").
    """
    return chord_inversions.get(chord, [chord])

In [30]:
def update_chords_with_inversions(generated_chord_sequence, validation_chord_sequence):
    """
    Replace generated chords ending in '/3' by an equivalent spelling that
    appears in the validation sequence.

    For each '/3' chord, its equivalents are tried in order and the first one
    present anywhere in the validation sequence replaces it. Chords without a
    match, and chords not ending in '/3', are kept unchanged. Every input
    chord yields exactly one output chord.

    Args:
        generated_chord_sequence (list): Lines of `(chord, position)` tuples.
        validation_chord_sequence (list): Lines of `(chord, position)` tuples.

    Returns:
        list: Same shape as `generated_chord_sequence`, with replacements applied.
    """
    # One set lookup per candidate. The previous nested scan only broke out of
    # its innermost loop, so a chord was appended once per validation line
    # that contained the equivalent.
    validation_chords = {
        validation_chord
        for validation_line in validation_chord_sequence
        for validation_chord, _ in validation_line
    }

    updated_sequence = []

    for generated_line in generated_chord_sequence:
        updated_line = []
        for chord, position in generated_line:
            replacement = chord

            if chord.endswith('/3'):
                equivalents = get_chord_equivalents(chord)
                if isinstance(equivalents, str):
                    # Never iterate a bare chord name character by character.
                    equivalents = [equivalents]

                replacement = next(
                    (equivalent for equivalent in equivalents if equivalent in validation_chords),
                    chord,
                )

            updated_line.append((replacement, position))

        updated_sequence.append(updated_line)

    return updated_sequence


In [31]:
def extract_chords_with_positions(chord_sheet):
    """
    Extract chord-like tokens and their character offsets from a chord sheet.

    Every line is scanned, lyrics included, and the pattern has no word
    boundaries, so any uppercase A-G in the text starts a match.

    Args:
        chord_sheet (str): Full sheet text.

    Returns:
        list: One list per line that has at least one match, each holding
            `(chord, offset)` tuples where `offset` is the match position
            counted from the start of the whole sheet.
    """
    # Bass note: `\w+` alone stops before '#', which cut "D/F#" down to "D/F";
    # the optional trailing '#' keeps sharp bass notes intact.
    chord_pattern = re.compile(r'([A-G][#b]?(maj7?|min7?|dim|aug|sus\d?|m|M)?\d*(\/\w+#?)?(add\d*)?)')

    extracted_chords = []

    # Offset of the current line's first character within the whole sheet.
    absolute_index = 0

    for line in chord_sheet.split('\n'):
        matches = list(chord_pattern.finditer(line))

        if matches:
            extracted_chords.append([(match.group(0), match.start() + absolute_index) for match in matches])

        absolute_index += len(line) + 1  # +1 for the newline removed by split

    return extracted_chords

In [32]:
def calculate_chords_percentage(comparison, result_unique_chords, validation_unique_chords):
    """
    Percentage of chords the two sheets have in common.

    Args:
        comparison: Collection of the common chords; only its size is used.
        result_unique_chords (set): Unique chords of the generated sheet.
        validation_unique_chords (set): Unique chords of the validation sheet.

    Returns:
        The size of `comparison` as a percentage of the union of both chord
        sets, or 0 when that union is empty.
    """
    union_size = len(result_unique_chords | validation_unique_chords)

    if union_size == 0:
        return 0

    return (len(comparison) / union_size) * 100

In [33]:
def simplify_chord(chord):
    """
    Reduce a chord to its root note.

    Extensions and quality are discarded, so "Gm7", "Gsus4" and "G" all
    simplify to "G" (major and minor chords on the same root are not
    distinguished).

    Args:
        chord (str): Chord name.

    Returns:
        str or None: The root as parsed by pychord; the original string when
            pychord cannot parse it; None for an empty or None input.
    """
    if not chord:
        return None
    try:
        return f"{Chord(chord).root}"
    except Exception:
        # Unparseable chord: fall back to the original text. `Exception`
        # rather than a bare `except` so KeyboardInterrupt and SystemExit
        # still propagate.
        return chord


In [34]:
def compare_chord_positions(result_chords, validation_chords, margin=2):
    """
    Compare chord positions of two sheets, allowing a margin on the offsets.

    A generated chord is correct when some validation chord lies within
    `margin` positions of it and is either identical or has the same
    simplified form (see `simplify_chord`). Each generated chord is counted
    once, however many validation chords it matches.

    Args:
        result_chords (list): Lines of `(chord, offset)` tuples (generated sheet).
        validation_chords (list): Lines of `(chord, offset)` tuples (reference sheet).
        margin (int): Maximum offset difference still considered the same position.

    Returns:
        tuple: `({"posicoes_corretas": int, "posicoes_incorretas": int}, percentage)`
            where `percentage` is correct / (correct + incorrect) * 100, or
            0.0 when there is nothing to compare.
    """
    def _has_counterpart(result_chord, result_index):
        # True if any validation chord within the margin equals this chord or
        # simplifies to the same value.
        for validation_list in validation_chords:
            for validation_chord, validation_index in validation_list:
                if abs(result_index - validation_index) > margin:
                    continue
                if (
                    result_chord == validation_chord
                    or simplify_chord(result_chord) == simplify_chord(validation_chord)
                ):
                    return True
        return False

    correct_positions = 0
    incorrect_positions = 0
    matched_chords = set()  # names of generated chords that found a counterpart

    for result_list in result_chords:
        for result_chord, result_index in result_list:
            if _has_counterpart(result_chord, result_index):
                correct_positions += 1
                matched_chords.add(result_chord)
            else:
                incorrect_positions += 1

    # Missing: validation chords whose name never matched.
    # Extra: generated chords whose name never matched.
    # NOTE: both are decided by chord name, not position, and an unmatched
    # generated chord is counted here in addition to the loop above.
    missing_count = sum(
        1 for validation_list in validation_chords for chord, _ in validation_list
        if chord not in matched_chords
    )
    extra_count = sum(
        1 for result_list in result_chords for chord, _ in result_list
        if chord not in matched_chords
    )
    incorrect_positions += missing_count + extra_count

    total_positions = correct_positions + incorrect_positions
    # Two empty sheets used to raise ZeroDivisionError here.
    percentage = (correct_positions / total_positions) * 100 if total_positions else 0.0

    return {
        "posicoes_corretas": correct_positions,
        "posicoes_incorretas": incorrect_positions
    }, percentage

In [35]:
def calculate_percentage_comparison_from_count(count_comparison):
    """
    Percentage of chords whose occurrence counts agree, and of those that differ.

    Args:
        count_comparison (dict): Maps each chord to a
            `(result_count, validation_count)` pair.

    Returns:
        dict: "equal_percentage" and "different_percentage", both relative to
            the number of chords in `count_comparison` (both 0 when it is empty).
    """
    # Counted in a single pass so malformed entries fail before anything is returned.
    common_chords_count = sum(
        1 for result_count, validation_count in count_comparison.values()
        if result_count == validation_count
    )
    total_chords = len(count_comparison)

    if total_chords == 0:
        return {
            "equal_percentage": 0,
            "different_percentage": 0
        }

    different_chords_count = total_chords - common_chords_count

    return {
        "equal_percentage": (common_chords_count / total_chords) * 100,
        "different_percentage": (different_chords_count / total_chords) * 100
    }

In [36]:
def sequence_diff(generated_chord_sequence, validation_chord_sequence):
    """Align two chord sequences and report where they agree and differ.

    Every chord of both sequences is passed through ``simplify_chord`` and the
    simplified sequences are aligned with ``difflib.SequenceMatcher``
    (generated as the first sequence, validation as the second). The slices
    reported in the result are taken from the original, unsimplified inputs.

    Args:
        generated_chord_sequence: Chords of the generated sheet, in order.
        validation_chord_sequence: Chords of the reference sheet, in order.

    Returns:
        dict with four lists:
            "sequências": generated slices for each 'equal' opcode.
            "substituições": one dict per 'replace' opcode, holding the
                generated slice ("cifra gerada") and the validation slice
                ("cifra validacao").
            "exclusões": validation slices for each 'insert' opcode, i.e.
                chords present only in the validation sequence.
            "inserções": generated slices for each 'delete' opcode, i.e.
                chords present only in the generated sequence.
    """
    simplified_generated = list(map(simplify_chord, generated_chord_sequence))
    simplified_validation = list(map(simplify_chord, validation_chord_sequence))

    equal_runs = []
    replacements = []
    only_in_validation = []
    only_in_generated = []

    opcodes = SequenceMatcher(None, simplified_generated, simplified_validation).get_opcodes()
    for tag, gen_start, gen_end, val_start, val_end in opcodes:
        if tag == 'equal':
            # Runs that are the same in both simplified sequences.
            equal_runs.append(generated_chord_sequence[gen_start:gen_end])
        elif tag == 'replace':
            # A generated run that was matched against a different validation run.
            replacements.append({
                "cifra gerada": generated_chord_sequence[gen_start:gen_end],
                "cifra validacao": validation_chord_sequence[val_start:val_end]
            })
        elif tag == 'delete':
            # 'delete' = only in the generated sequence, reported as an insertion
            # made by the generated sheet.
            only_in_generated.append(generated_chord_sequence[gen_start:gen_end])
        elif tag == 'insert':
            # 'insert' = only in the validation sequence, reported as an exclusion
            # made by the generated sheet.
            only_in_validation.append(validation_chord_sequence[val_start:val_end])

    return {
        "sequências": equal_runs,
        "substituições": replacements,
        "exclusões": only_in_validation,
        "inserções": only_in_generated
    }


In [37]:
def compare_chords(result_path, chords_evaluation_path):
    """Compare a generated chord sheet against a reference chord sheet.

    Both files are read as UTF-8 text and their chords are extracted with
    ``extract_chords_with_positions``. The generated chords are then passed
    through ``update_chords_with_inversions`` together with the reference
    chords before any metric is computed.

    Args:
        result_path: Path of the generated chord sheet file.
        chords_evaluation_path: Path of the reference (validation) chord sheet file.

    Returns:
        dict with the keys:
            "common_chords": set of distinct chords present in both sheets.
            "only_on_result": set of distinct chords found only in the generated sheet.
            "only_on_validation": set of distinct chords found only in the reference sheet.
            "*_percentage" (for the three sets above): value returned by
                ``calculate_chords_percentage`` for that set.
            "count_comparison": dict mapping every distinct chord to a
                ``(count_in_result, count_in_validation)`` tuple.
            "count_comparison_percentage": result of
                ``calculate_percentage_comparison_from_count``.
            "positions_comparison", "positions_comparison_percentage": the two
                values returned by ``compare_chord_positions``.
            "sequence_diff": result of ``sequence_diff`` on the flattened chord lists.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(result_path, 'r', encoding='utf-8') as file:
        result_sheet = file.read()
    with open(chords_evaluation_path, 'r', encoding='utf-8') as file:
        validation_sheet = file.read()

    result_chords = extract_chords_with_positions(result_sheet)
    validation_chords = extract_chords_with_positions(validation_sheet)

    result_chords = update_chords_with_inversions(result_chords, validation_chords)

    # Drop the second element of each (chord, _) pair and flatten the per-line lists.
    result_chords_flat = [chord for line in result_chords for chord, _ in line]
    validation_chords_flat = [chord for line in validation_chords for chord, _ in line]

    result_chords_count = Counter(result_chords_flat)
    validation_chords_count = Counter(validation_chords_flat)

    result_unique_chords = set(result_chords_flat)
    validation_unique_chords = set(validation_chords_flat)

    common_chords = result_unique_chords & validation_unique_chords
    common_chords_percentage = calculate_chords_percentage(common_chords, result_unique_chords, validation_unique_chords)

    only_on_result = result_unique_chords - validation_unique_chords
    only_on_result_percentage = calculate_chords_percentage(only_on_result, result_unique_chords, validation_unique_chords)

    only_on_validation = validation_unique_chords - result_unique_chords
    only_on_validation_percentage = calculate_chords_percentage(only_on_validation, result_unique_chords, validation_unique_chords)

    positions_comparison, positions_comparison_percentage = compare_chord_positions(result_chords, validation_chords)

    # A Counter yields 0 for chords it never saw, so plain indexing is safe on both sides.
    count_comparison = {
        chord: (result_chords_count[chord], validation_chords_count[chord])
        for chord in result_unique_chords | validation_unique_chords
    }
    count_comparison_percentage = calculate_percentage_comparison_from_count(count_comparison)

    sequence_diff_result = sequence_diff(result_chords_flat, validation_chords_flat)

    return {
        "common_chords": common_chords,
        "common_chords_percentage": common_chords_percentage,
        "only_on_result": only_on_result,
        "only_on_result_percentage": only_on_result_percentage,
        "only_on_validation": only_on_validation,
        "only_on_validation_percentage": only_on_validation_percentage,
        "count_comparison": count_comparison,
        "count_comparison_percentage": count_comparison_percentage,
        "positions_comparison": positions_comparison,
        "positions_comparison_percentage": positions_comparison_percentage,
        "sequence_diff": sequence_diff_result
    }


# MAIN

In [None]:
# Load the song entries (name plus audio, lyrics and chords URLs) from the CSV.
songs = get_songs_from_csv()

# Parallel lists: position i in each list refers to the same song.
song_names = []
song_paths = []
lyrics_urls = []
# NOTE(review): chord_sheets is never appended to in this cell; it is
# reassigned by the next cell from format_song_with_chords(...).
chord_sheets = []
evaluation_paths = []

for song in songs:
    # Replace any spaces in the song name with underscores.
    song_name = song.get_name().replace(" ", "_")
    song_names.append(song_name)

    # Presumably fetches the audio and returns its local path; confirm against
    # extract_sound_recording.
    song_path = extract_sound_recording(song.get_audio_url(), song_name)
    song_paths.append(song_path)

    lyrics_urls.append(song.get_lyrics_url())

    # The returned path is later opened by compare_chords as the reference sheet.
    evaluation_path = get_chord_sheet_from_webpage(song.get_chords_url(), song_name)
    evaluation_paths.append(evaluation_path)

# Chord extraction runs once over all songs, after the per-song loop.
chords_paths = extract_chords_from(song_paths, song_names)

# NOTE(review): presumably prepares the directory the following steps write to;
# confirm against create_output_folder.
create_output_folder()
lyrics_paths, lyrics_timestamped_paths = get_synced_lyrics(lyrics_urls, song_paths, song_names)

In [48]:
# Build the chord sheets for every song; each returned entry is handed to
# compare_chords below as the path of the generated sheet.
chord_sheets = format_song_with_chords(lyrics_timestamped_paths, lyrics_paths, chords_paths, song_names)

for chord_sheet, song_name, evaluation_path in zip(chord_sheets, song_names, evaluation_paths):
    # Human-readable title: underscores become spaces, everything upper-cased.
    name = song_name.replace("_", " ").upper()
    print(f"------------EVALUATING \'{name}\' CHORD SHEET----------------\n")

    comparison = compare_chords(chord_sheet, evaluation_path)

    for key, value in comparison.items():
        if not isinstance(value, dict):
            # Non-dict metrics fit on a single line.
            print(f"{key}: {value}")
            continue

        # Dict-valued metrics get a header line followed by one indented line per entry.
        print(f"{key}:")
        for sub_key, sub_value in value.items():
            print(f"  {sub_key}: {sub_value}")
    print(f"-------------------------------------------------------------\n\n")

Parsing lyrics from /content/lyrics/que_pais_e_este.txt...
Lyrics Parsed!


Parsing chords from /content/chords/que_pais_e_este.lab...
Chords parsed!


Overlaying chords on lyrics...
Chords overlayed!


Saving chord sheet to que_pais_e_este.txt...
Saved as /content/chord_sheets/que_pais_e_este.txt!
Parsing lyrics from /content/lyrics/infiel.txt...
Lyrics Parsed!


Parsing chords from /content/chords/infiel.lab...
Chords parsed!


Overlaying chords on lyrics...
Chords overlayed!


Saving chord sheet to infiel.txt...
Saved as /content/chord_sheets/infiel.txt!
Parsing lyrics from /content/lyrics/tocando_em_frente.txt...
Lyrics Parsed!


Parsing chords from /content/chords/tocando_em_frente.lab...
Chords parsed!


Overlaying chords on lyrics...
Chords overlayed!


Saving chord sheet to tocando_em_frente.txt...
Saved as /content/chord_sheets/tocando_em_frente.txt!
Parsing lyrics from /content/lyrics/petrolina_juazeiro.txt...
Lyrics Parsed!


Parsing chords from /content/chords/petrolina_jua