<a href="https://colab.research.google.com/github/jessica-aaao/ChordsExtractor/blob/main/ChordExtractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!python3 -m pip install -q -U "yt-dlp[default]"
!pip install -q -U openai-whisper
!pip install -q -U demucs
!pip install pychord



In [2]:
import requests
import json
import pandas as pd
import os
import re
import unicodedata

In [3]:
from google.colab import drive
from IPython.display import display
from bs4 import BeautifulSoup

In [4]:
!git clone  https://github.com/mikezzb/lyrics-sync.git
!git clone https://github.com/filipecalegario/ISMIR2019-Large-Vocabulary-Chord-Recognition.git

fatal: destination path 'lyrics-sync' already exists and is not an empty directory.
fatal: destination path 'ISMIR2019-Large-Vocabulary-Chord-Recognition' already exists and is not an empty directory.


In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Common

In [6]:
class SongUrls:
    def __init__(self, name, audio, lyrics, chords):
        self.name = name
        self.audio = audio
        self.lyrics = lyrics
        self.chords = chords

    def get_name(self):
        return self.name

    def get_audio_url(self):
        return self.audio

    def get_lyrics_url(self):
        return self.lyrics

    def get_chords_url(self):
        return self.chords

In [7]:
def get_urls():
    """
    Carrega o arquivo CSV com as infformações das músicas.
    """
    print(f'Fetching urls...\n\n')

    file_path = '/content/drive/My Drive/TCC/CodeData/songs.csv'
    songs = pd.read_csv(file_path)

    print(f'Urls fetched!\n\n')

    return songs

In [8]:
def get_songs_from_csv():
    """
    Obtém as músicas, letras e cifras, além do nome da música, a partir do arquivo CSV.
    """
    songs = get_urls()

    song_urls = []

    print(f'Creating SongUrls...\n\n')

    for index, row in songs.iterrows():
        song_name = slugify(row['Song Name'])
        audio_url = row['Audio URL']
        lyrics_url = row['Lyrics URL']
        chords_url = row['Chords URL']

        song_urls.append(SongUrls(song_name, audio_url, lyrics_url, chords_url))

    print(f'SongUrls created!\n\n')

    return song_urls


In [9]:
def slugify(raw_song_name):
    """
    Converte o nome da música para um formato que pode ser usado como nome de arquivo.
    """
    song_name = raw_song_name.lower()

    song_name = unicodedata.normalize('NFKD', song_name)
    song_name = song_name.encode('ascii', 'ignore').decode('ascii')

    song_name = re.sub(r'[^a-z0-9]+', '_', song_name)

    song_name = song_name.strip('_')

    return song_name

In [10]:
def save_to_file(data, folder_path, song_name):
    """
    Salva os dados em um arquivo de texto.
    """
    path = f"{folder_path}/{song_name}.txt"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    with open(path, 'w') as file:
        file.write(data)

    print(f'Saved as {path}')

    return path

# Sound

In [11]:
def extract_sound_recording(youtube_url, song_name):
    """
    Extrai o áudio da música a partir do vídeo do YouTube.
    """
    print(f'Extracting audio from {youtube_url}...\n\n')

    cookies_path = '/content/cookies.txt'
    output_path = f"/content/audios/{song_name}.wav"

    !yt-dlp {youtube_url} --audio-format "wav" --cookies {cookies_path} -x -o {output_path}  -q

    print(f'Audio saved as {output_path}!\n\n')

    return output_path


# Chords

In [12]:
def extract_chords_from(song_paths, song_names):
    """
    Extrai a cifra, com o timestamp, a partir da música
    """

    output_folder = "/content/chords"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    output_paths = []

    %cd ISMIR2019-Large-Vocabulary-Chord-Recognition
    !pip install -q -r requirements.txt

    for song_path, song_name in zip(song_paths, song_names):
        print(f"Extracting chords from {song_path}...\n\n")
        output_path = f"{output_folder}/{song_name}.lab"
        !python chord_recognition.py {song_path} {output_path}
        print(f"Chords extracted to {output_path}!\n\n")

        output_paths.append(output_path)

    %cd ..

    return output_paths

# Lyrics

## Lyrics Extraction from Webpage

In [13]:
def extract_lyrics_from_html(html):
    """
    Extrai a letra da página HTML fornecida
    """

    print(f'Fetching lyrics...!\n\n')

    lyricsTag = html.find('div', class_='lyric-original')
    lyrics = ""

    for p in lyricsTag.find_all('p'):
        for br in p.find_all('br'):
            br.replace_with('\n')
        lyrics += p.get_text() + "\n"

    print(f'Lyrics fetched!\n\n')

    return lyrics


In [14]:
def get_lyrics_from_webpage(lyric_urls, song_names):
    """
    Obtém a página web e extrai a letra
    """

    lyrics_paths = []
    folder_path = "/content/lyrics"

    for lyric_url, song_name in zip(lyric_urls, song_names):
        print(f'Fetching webpage {lyric_url}...\n\n')
        response = requests.get(lyric_url)

        if response.status_code == 200:
            print(f'Webpage fetched!\n\n')
            htmlContent = BeautifulSoup(response.content, 'html.parser')

            lyrics = extract_lyrics_from_html(htmlContent)
            lyrics_path = save_to_file(lyrics, folder_path, song_name)

            lyrics_paths.append(lyrics_path)
        else:
            print(f"Failed to fetch {song_name} webpage. Status code: {response.status_code}\n\n")

    return lyrics_paths

## Lyrics Sync to Audio

In [15]:
def create_output_folder():
    """
    Cria as pastas de resultados para o lyrics-sync
    """
    output_folder = "/content/lyrics-sync/output"
    vocals_folder = output_folder + "/vocals"
    words_folder = output_folder + "/words"
    lrc_folder = output_folder + "/lrc"

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if not os.path.exists(vocals_folder):
        os.makedirs(vocals_folder)

    if not os.path.exists(words_folder):
        os.makedirs(words_folder)

    if not os.path.exists(lrc_folder):
        os.makedirs(lrc_folder)

In [16]:
def get_timestamps(audio_paths, lyrics_paths, song_names):
    """
    Obtém os timestamps das palavras da letra
    """
    print("Installing conda...")
    !wget -c https://repo.continuum.io/archive/Anaconda3-2024.10-1-Linux-x86_64.sh
    !chmod +x Anaconda3-2024.10-1-Linux-x86_64.sh
    !bash ./Anaconda3-2024.10-1-Linux-x86_64.sh -b -f -p /usr/local
    print("Conda installed!")

    print("Installing lsync...")
    %cd lyrics-sync/
    !conda env update -f environment.yml
    !source activate lsync

    print("Lsync installed!")
    from lsync import LyricsSync

    print("Extracting timestamps...")

    lsync = LyricsSync()

    timestamps_paths = []

    for lyrics_path, audio_path, song_name in zip(lyrics_paths, audio_paths, song_names):
        words, lrc = lsync.sync(audio_path, lyrics_path)
        words_path = f"/content/lyrics-sync/output/words/{song_name}.csv"

        timestamps_paths.append(words_path)

    print("Timestamps extracted!")

    return timestamps_paths
    %cd ..

In [17]:
def get_synced_lyrics(lyric_urls, audio_paths, song_names):
    """
    Obtém a letra sincronizada com o áudio.
    """

    print(f"Getting synced lyrics...\n\n")

    lyrics_paths = get_lyrics_from_webpage(lyric_urls, song_names)
    timestamps_paths = get_timestamps(audio_paths, lyrics_paths, song_names)

    print(f"Synced lyrics ready!\n\n")

    return (lyrics_paths, timestamps_paths)

# Chord Sheets

## Chords Parsing

In [18]:
def simplify_chord(chord):
    """
    Simplifica os acordes resconhecidos pelo modelo, para acordes mais comuns
    nas cifras.
    """
    chord = chord.replace(":", "")
    chord = chord.replace("min", "m")
    chord = chord.replace("maj", "")
    chord = chord.replace("hdim7", "m7(b5)")
    chord = chord.replace("hdim", "m7(b5)")
    chord = chord.replace("sus4(b7)", "7sus4")

    return chord

In [67]:
def parse_chords(chords_path):
    """
    Percorre o arquivo .lab, simplifica e extrai os acordes em um dicionário.
    """
    print(f"Parsing chords from {chords_path}...")

    chords = []
    with open(chords_path, 'r', encoding='utf-8') as file:
        for line in file:
            start, end, chord = line.strip().split('\t')
            if chord == 'N':
                continue
            chord = simplify_chord(chord)
            chords.append({
                "start": float(start),
                "end": float(end),
                "chord": chord
            })

        print(f"Chords parsed!\n\n")

        return chords

## Lyrics Parsing

In [69]:
def parse_lyrics(timestamped_lyrics_path, lyrics_path):
    """
    Separa as letras, com timestamps, em linhas, de acordo com o arquivo da
    letra extraída pelo web scraping.
    """
    print(f"Parsing lyrics from {lyrics_path}...")

    timestamped_lyrics = pd.read_csv(timestamped_lyrics_path)
    timestamped_per_line = []

    with open(lyrics_path, 'r', encoding='utf-8') as file:
        for index, line in enumerate(file):
            line = line.strip().split()
            words_in_line = len(line)

            timestamped_words = timestamped_lyrics.iloc[:words_in_line]
            timestamped_lyrics = timestamped_lyrics.iloc[words_in_line:]

            for df_index, word_info in timestamped_words.iterrows():
                start = word_info["start"]
                end = word_info["end"]
                word = word_info["label"]

                timestamped_per_line.append({
                    "word": word,
                    "start": start,
                    "end": end,
                    "line": index
                })

    print(f'Lyrics Parsed!\n\n')

    return timestamped_per_line

## General

In [23]:
def align_chord_over_word(word_info, chord_info, line_start):
    """
    Posiciona o acorde sobre a palavra.
    """
    word = word_info["word"]
    word_start = word_info["start"]
    word_end = word_info["end"]
    chord = chord_info["chord"]
    chord_start = chord_info["chord_start"]

    word_duration = word_end - word_start
    ratio = (chord_start - word_start) / word_duration

    word_len = len(word)
    word_index = int(round(ratio * (word_len - 1)))
    word_index = max(0, min(word_index, word_len - 1))

    chord_position = line_start + word_index

    return (chord, chord_position)

In [70]:
def find_closest(words, chord_start):
  left, right = 0, len(words) - 1

  while left < right:
    mid = left + (right - left) // 2

    if words[mid]["start"] == chord_start:
      return mid
    elif words[mid]["start"] < chord_start:
      left = mid + 1
    else:
      right = mid

  # Check if left is 0 or if the left-1 element is closer to the target
  if left == 0 or abs(words[left - 1]["start"] - chord_start) < abs(words[left]["start"] - chord_start):
    return left - 1

  return left

In [66]:
def overlay_chords_on_transcribed(lyrics_per_line, chords):
    """
    Associa acordes às palavras da letra com base nos tempos de ACORDES.
    """
    print(f"Overlaying chords on lyrics...")

    result = []
    previous_end = None

    for line_number, line in enumerate(lyrics_per_line):
        for dataframe in line:
            for index, word_info in dataframe.iterrows():
                start = word_info["start"]
                end = word_info["end"]
                word = word_info["label"]
                associated_chords = []

                if start is not None and end is not None:
                    for i in range(len(chords)):
                        chord_infos = chords[0]
                        chord_info_start = chord_infos["start"]
                        chord_info_end = chord_infos["end"]
                        chord_info_name = chord_infos["chord"]

                        if chord_info_start > end: # Próxima palavra
                            break
                        elif start <= chord_info_start <= end: # Dentro da palavra
                            chord_name = chord_info_name
                            chord_start = chord_info_start

                            associated_chords.append({
                                "chord": chord_name,
                                "chord_start": chord_start
                            })
                        elif previous_end is not None and previous_end < chord_info_start < start: # Entre palavras
                            chord_name = chord_info_name
                            chord_start = chord_info_start

                            result.append({
                                "word": " ",
                                "start": start,
                                "end": end,
                                "chords": [{
                                    "chord": chord_name,
                                    "chord_start": chord_start
                                }],
                                "line": line_number
                            })
                        elif chord_info_end >= start: # Antes da palavra
                            chord_name = chord_info_name
                            chord_start = chord_info_start

                            result.append({
                                "word": " ",
                                "start": start,
                                "end": end,
                                "chords": [{
                                    "chord": chord_name,
                                    "chord_start": chord_start
                                }],
                                "line": line_number
                            })

                        chords.pop(0)

                result.append({
                    "word": word,
                    "start": start,
                    "end": end,
                    "chords": associated_chords,
                    "line": line_number
                })

                previous_end = end

    print(f"Chords overlayed!\n\n")

    return result

In [24]:
def format_transcribed_with_chords(lyrics_with_chords):
    """
    Gera o formato de saída com acordes acima das palavras.
    """
    formatted_output = []
    chord_line = []
    lyrics_line = []
    validation_data = []

    current_line = lyrics_with_chords[0]["line"]
    line_start = 0

    for word_info in lyrics_with_chords:
        if word_info["line"] != current_line:
            current_line = word_info["line"]
            formatted_output.append("".join(chord_line))
            formatted_output.append(" ".join(lyrics_line))
            formatted_output.append("")
            chord_line = []
            lyrics_line = []
            line_start = 0

        word = word_info["word"]
        chords = word_info["chords"]
        chord_position = None

        if chords:
            for chord_info in chords:
                chord = chord_info["chord"]

                if word == " ":
                    chord_line.append(chord)
                    line_start += 1
                else:
                    chord, chord_position = align_chord_over_word(word_info, chord_info, line_start)

                    while len(chord_line) < chord_position:
                        chord_line.append(" ")

                    if chord_line and chord_line[-1] != " ":
                        chord_line.append(" ")

                    chord_line.append(chord)

            line_start += len(word)
        else:
            chord_line.extend([" "] * (len(word)+1))
            line_start += len(word)

        lyrics_line.append(word)

    # Adiciona o restante, se existir
    if chord_line or lyrics_line:
        formatted_output.append("".join(chord_line))
        formatted_output.append(" ".join(lyrics_line))

    return ("\n".join(formatted_output))

In [25]:
def save_chords_sheet(data, song_name):
    """
    Salva a cifra com acordes em um arquivo de texto.
    """
    print(f"Saving chord sheet to {song_name}.txt...")

    folder_path = "/content/chord_sheets"
    path = f"{folder_path}/{song_name}.txt"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    with open(path, 'w') as file:
        file.write(data)

    print(f'Saved as {path}!')

    return path

In [26]:
def format_song_with_chords(timestamped_lyrics_paths, lyrics_paths, chords_paths, song_names):
    """
    Formata a letra com acordes no formato tradicional de cifras.
    """
    formatted_chords = []
    validations = []

    for timestamped_lyrics_path, lyrics_path, chords_path, song_name in zip(timestamped_lyrics_paths, lyrics_paths, chords_paths, song_names):
        lyrics = parse_lyrics(timestamped_lyrics_path, lyrics_path)
        chords = parse_chords(chords_path)

        lyrics_with_chords = overlay_chords_on_transcribed(lyrics, chords)

        formatted_output = format_transcribed_with_chords(lyrics_with_chords)
        chord_sheet_path = save_chords_sheet(formatted_output, song_name)

        formatted_chords.append(chord_sheet_path)

    return formatted_chords



---



# Evaluation

## Cifra Club Webscraping

In [57]:
def has_chords(text):
    """
    Verifica se uma linha contém acordes musicais.

    Args:
        text (str): Linha de texto a ser analisada

    Returns:
        bool: True se a linha contém acordes, False caso contrário
    """
    # Expressão regular para acordes musicais
    chords_pattern = r'\b([A-G][#b]?(m|maj|min|M|add|sus|dim|aug)?[0-9]*(\([^)]+\))?|B7|E7|A7)\b'

    # Verifica se há pelo menos um acorde na linha
    if re.search(chords_pattern, text.strip()):
        return True
    else:
        return False

In [62]:
# Função para remover tablaturas e seções indesejadas, reduzindo múltiplas linhas em branco seguidas
def clean_cifraclub_chords(chord_sheets):
    # Remover seções como [Tab - Intro] e Parte X de Y
    chord_sheets = re.sub(r'\[.*?\]\n?', '', chord_sheets)  # Remove [Tab - Intro], [Intro], etc.
    chord_sheets = re.sub(r'Parte \d+ de \d+\n?', '', chord_sheets)  # Remove "Parte 1 de 3", etc.

    # Dividir o texto em linhas
    lines = chord_sheets.split('\n')

    # Reduzir múltiplas linhas em branco seguidas para apenas uma
    keep_lines = []
    previous_line_is_empty = False

    for line in lines:
        if re.match(r'^[EBGDAe]\|', line):  # Verificar se a linha é uma tablatura (começa com E|, B|, G|, etc.)
            if keep_lines and keep_lines[-1].split():
                keep_lines.pop()
                previous_line_is_empty = True
        else:
            if not line.strip():  # Se a linha estiver em branco
                if keep_lines and has_chords(keep_lines[-1]):
                    keep_lines.pop()
                    previous_line_is_empty = True
                if not previous_line_is_empty:  # Se a linha anterior não estiver em branco
                    keep_lines.append(line)  # Adiciona a linha em branco
                    previous_line_is_empty = True
            else:
                if keep_lines and has_chords(keep_lines[-1]) and has_chords(line): # Se a linha anterior tiver acordes
                    keep_lines.pop()

                keep_lines.append(line)
                previous_line_is_empty = False

    # Juntar as linhas novamente
    if has_chords(keep_lines[-1]):
        keep_lines.pop()

    return '\n'.join(keep_lines)

In [28]:
def get_chord_sheet_from_webpage(chords_url, song_name):
    """
    Obtém a página web e extrai a cifra
    """

    print(f'Fetching webpage {chords_url}...\n\n')
    response = requests.get(chords_url)

    if response.status_code == 200:
        print(f'Webpage fetched!\n\n')

        htmlContent = BeautifulSoup(response.content, 'html.parser')

        cifra_element = htmlContent.find('pre')

        if not cifra_element:
            print("Cifra não encontrada na página.")
            return None

        chords = cifra_element.get_text()
        chords = clean_cifraclub_chords(chords)
        folder_path = "/content/chords_evaluation"
        chords_path = save_to_file(chords, folder_path, song_name)

        return chords_path
    else:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}\n\n")

## Evaluation

In [73]:
import re
from difflib import SequenceMatcher
from pychord import Chord  # Biblioteca python-chord para normalização de acordes

# Função para normalizar acordes (remover extensões como 7, sus4, etc.)
def normalizar_acorde(acorde):
    if not acorde:
        return None
    try:
        # Usa a biblioteca python-chord para extrair a fundamental e o modo
        c = Chord(acorde)
        return f"{c.root}{c.quality}"  # Exemplo: "Gm" para "Gm7" ou "Gsus4"
    except:
        return acorde  # Se não for possível normalizar, retorna o acorde original

# Função para extrair acordes e suas posições de um texto
def extrair_acordes(texto):
    # Expressão regular para identificar acordes
    regex_acordes = r'\b([A-G][#b]?(m|maj|min|M|add|sus|dim|aug)?[0-9]*(\([^)]+\))?|B7|E7|A7)\b'

    # Encontrar todos os acordes e suas posições
    acordes = [(m.group(), m.start()) for m in re.finditer(regex_acordes, texto)]
    print(acordes)
    return acordes

# Função para comparar duas cifras com tolerância de posição (±2)
def comparar_cifras(path_cifra_gerada, path_cifra_cifraclub, tolerancia_pos=2):
    # Ler o conteúdo dos arquivos
    with open(path_cifra_gerada, 'r', encoding='utf-8') as file:
        cifra_gerada = file.read()

    with open(path_cifra_cifraclub, 'r', encoding='utf-8') as file:
        cifra_cifraclub = file.read()

    # Extrair acordes e suas posições
    acordes_gerados = extrair_acordes(cifra_gerada)
    acordes_cifraclub = extrair_acordes(cifra_cifraclub)

    # Normalizar acordes
    acordes_gerados = [(normalizar_acorde(acorde), pos) for acorde, pos in acordes_gerados]
    acordes_cifraclub = [(normalizar_acorde(acorde), pos) for acorde, pos in acordes_cifraclub]

    # Etapa 1: Análise de Posicionamento
    relatorio_pos = analisar_posicionamento(acordes_gerados, acordes_cifraclub, tolerancia_pos)

    # Etapa 2: Análise de Conteúdo
    relatorio_acordes = analisar_acordes(acordes_gerados, acordes_cifraclub)

    return {
        "posicionamento": relatorio_pos,
        "acordes": relatorio_acordes,
        "estatisticas": {
            "total_gerado": len(acordes_gerados),
            "total_referencia": len(acordes_cifraclub)
        }
    }

def analisar_posicionamento(acordes_gerados, acordes_ref, tolerancia):
    """Analisa o alinhamento estrutural das cifras"""
    relatorio = {
        "deslocamentos": [],
        "match_estrutural": 0,
        "erros_posicionais": 0
    }

    for i, (acorde, pos) in enumerate(acordes_gerados):
        if i >= len(acordes_ref):
            break

        pos_ref = acordes_ref[i][1]
        diff_pos = abs(pos - pos_ref)

        relatorio["deslocamentos"].append(diff_pos)

        if diff_pos <= tolerancia:
            relatorio["match_estrutural"] += 1
        else:
            relatorio["erros_posicionais"] += 1

    return relatorio

def analisar_acordes(acordes_gerados, acordes_ref):
    """Analisa correspondência de acordes independente de posição"""
    relatorio = {
        "exatos": 0,
        "variacoes": 0,
        "faltantes": [],
        "extras": [],
        "divergentes": []
    }

    # Criar dicionário de acordes de referência por posição
    ref_dict = {pos: acorde for acorde, pos in acordes_ref}

    for acorde, pos in acordes_gerados:
        if pos in ref_dict:
            ref_acorde = ref_dict[pos]
            if acorde == ref_acorde:
                relatorio["exatos"] += 1
            elif acorde[0] == ref_acorde[0]:  # Mesma nota fundamental
                relatorio["variacoes"] += 1
                relatorio["divergentes"].append((acorde, ref_acorde))
            else:
                relatorio["divergentes"].append((acorde, ref_acorde))
        else:
            relatorio["faltantes"].append(acorde)

    # Verificar acordes extras na referência
    gerados_pos = {pos for _, pos in acordes_gerados}
    for pos, acorde in ref_dict.items():
        if pos not in gerados_pos:
            relatorio["extras"].append(acorde)

    return relatorio

def calcular_metricas(resultado_comparacao):
    # Extrair dados básicos
    pos = resultado_comparacao['posicionamento']
    ac = resultado_comparacao['acordes']
    stats = resultado_comparacao['estatisticas']

    # 1. Precisão Geral
    total_comparacoes = max(stats['total_gerado'], stats['total_referencia'])
    acertos_totais = pos['match_estrutural'] + ac['exatos']
    precisao_geral = (acertos_totais / total_comparacoes) * 100 if total_comparacoes > 0 else 0

    # 2. Taxa de Erros por Tipo
    taxa_erros = {
        'posicionamento': (pos['erros_posicionais'] / stats['total_gerado']) * 100 if stats['total_gerado'] > 0 else 0,
        'divergencias': (len(ac['divergentes']) / stats['total_referencia']) * 100 if stats['total_referencia'] > 0 else 0,
        'faltantes': (len(ac['faltantes']) / stats['total_referencia']) * 100 if stats['total_referencia'] > 0 else 0,
        'extras': (len(ac['extras']) / stats['total_referencia']) * 100 if stats['total_referencia'] > 0 else 0
    }

    # 3. Similaridade Harmônica (considerando variações como parcialmente corretas)
    similaridade_harmonica = (
        (ac['exatos'] + (ac['variacoes'] * 0.5)) /  # Variações valem meio ponto
        stats['total_referencia'] * 100 if stats['total_referencia'] > 0 else 0
    )

    # 4. Distribuição de Deslocamentos (análise de posicionamento)
    if pos['deslocamentos']:
        deslocamento_medio = sum(pos['deslocamentos']) / len(pos['deslocamentos'])
        deslocamento_max = max(pos['deslocamentos'])
    else:
        deslocamento_medio = deslocamento_max = 0

    # 5. Match por Seção (se os dados incluírem informação de seções)
    metricas_secao = {}  # Pode ser preenchido se houver dados de seções

    return {
        'precisao_geral': round(precisao_geral, 2),
        'similaridade_harmonica': round(similaridade_harmonica, 2),
        'taxa_erros': {k: round(v, 2) for k, v in taxa_erros.items()},
        'deslocamento': {
            'medio': round(deslocamento_medio, 2),
            'maximo': deslocamento_max
        },
        'contagens': {
            'acertos_exatos': ac['exatos'],
            'variacoes_aceitaveis': ac['variacoes'],
            'divergencias_graves': len(ac['divergentes']),
            'acordes_faltantes': len(ac['faltantes']),
            'acordes_extras': len(ac['extras'])
        }
    }

# MAIN

In [30]:
songs = get_songs_from_csv()
song_paths = []
song_names = []
lyrics_urls = []

chord_sheets = []
validation_chords = []

evaluation_paths = []

for song in songs:
    song_name = song.get_name().replace(" ", "_")
    song_path = extract_sound_recording(song.get_audio_url(), song_name)

    song_paths.append(song_path)
    song_names.append(song_name)
    lyrics_urls.append(song.get_lyrics_url())

    evaluation_path = get_chord_sheet_from_webpage(song.get_chords_url(), song_name)
    evaluation_paths.append(evaluation_path)

chords_paths = extract_chords_from(song_paths, song_names)

create_output_folder()
lyrics_paths, lyrics_timestamped_paths = get_synced_lyrics(lyrics_urls, song_paths, song_names)

chord_sheets = format_song_with_chords(lyrics_timestamped_paths, lyrics_paths, chords_paths, song_names)

Fetching urls...


Urls fetched!


Creating SongUrls...


SongUrls created!


Extracting audio from https://youtu.be/kFpkyT_KPpc...


Audio saved as /content/audios/que_pais_e_este.wav!


Extracting audio from https://youtu.be/eCyMh-mZ1B0...


Audio saved as /content/audios/infiel.wav!


Extracting audio from https://youtu.be/CugYXgJ2SFI?t=11...


Audio saved as /content/audios/tocando_em_frente.wav!


Extracting audio from https://youtu.be/VchbqjKk6wA...


Audio saved as /content/audios/petrolina_juazeiro.wav!


Extracting audio from https://youtu.be/ezoUemHzHno...


Audio saved as /content/audios/carla.wav!


Extracting audio from https://youtu.be/ePFQIiLI1G8...


Audio saved as /content/audios/uma_carta.wav!


Extracting audio from https://youtu.be/dHEqjSdDKok...


Audio saved as /content/audios/como_vai_voce.wav!


Extracting audio from https://youtu.be/Y59pC4FcBxM...


Audio saved as /content/audios/evidencias.wav!


Extracting audio from https://youtu.be/lnjZ2pqI6Z8...


Audio sa

Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th


Extracting timestamps...


100%|██████████| 80.2M/80.2M [00:00<00:00, 139MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
BREAK: C#m - 173.7084807256236
WORD: think - 171.82
ATUAL: C#m - 173.7084807256236
BREAK: C#m - 173.7084807256236
WORD: I'll - 172.20000000000002
ATUAL: C#m - 173.7084807256236
BREAK: C#m - 173.7084807256236
WORD: miss - 172.42000000000002
ATUAL: C#m - 173.7084807256236
BREAK: C#m - 173.7084807256236
WORD: you - 172.68
ATUAL: C#m - 173.7084807256236
BREAK: C#m - 173.7084807256236
WORD: forever - 172.96
ATUAL: C#m - 173.7084807256236
DENTRO: C#m - 173.7084807256236
ATUAL: A - 175.89115646258503
BREAK: A - 175.89115646258503
WORD: Like - 175.08
ATUAL: A - 175.89115646258503
BREAK: A - 175.89115646258503
WORD: the - 175.38
ATUAL: A - 175.89115646258503
BREAK: A - 175.89115646258503
WORD: stars - 175.58
ATUAL: A - 175.89115646258503
DENTRO: A - 175.89115646258503
ATUAL: F#m - 178.0041723356009
BREAK: F#m - 178.0041723356009
WORD: miss - 176.18
ATUAL: F#m - 178.0041723356009
BREAK: F#m - 178.0041723356009
WORD: the - 1

In [76]:
# Comparar as cifras
# cifra_gerada =
resultado = comparar_cifras(chord_sheets[1], evaluation_paths[1])

print("=== Análise de Posicionamento ===")
print(f"Match estrutural: {resultado['posicionamento']['match_estrutural']}")
print(f"Erros posicionais: {resultado['posicionamento']['erros_posicionais']}")

print("\n=== Análise de Acordes ===")
print(f"Acordes exatos: {resultado['acordes']['exatos']}")
print(f"Variações: {resultado['acordes']['variacoes']}")
print(f"Acordes faltantes: {len(resultado['acordes']['faltantes'])}")
print(f"Acordes extras: {len(resultado['acordes']['extras'])}")

[('Am', 0), ('Em', 5), ('Em', 13), ('C', 19), ('D', 56), ('Em', 63), ('C', 131), ('D', 196), ('Em', 203), ('C', 255), ('D', 291), ('Em', 300), ('C', 361), ('D', 430), ('Am', 444), ('Em', 519), ('G', 621), ('D', 710), ('E', 712), ('Am', 796), ('Em', 896), ('G', 990), ('D', 1082), ('Am', 1166), ('C', 1243), ('Am', 1280), ('Em', 1317), ('D', 1384), ('A', 1451), ('Am', 1512), ('Em', 1550), ('D', 1619), ('A', 1691), ('E', 1716), ('Am7', 1770), ('Em', 1774), ('Em', 1842), ('G', 1942), ('D', 2034), ('Am', 2118), ('C', 2195), ('Am', 2229), ('Em', 2273), ('D', 2343), ('A', 2411), ('Am', 2472), ('Em', 2514), ('D', 2577), ('A', 2649), ('E', 2674), ('Am7', 2755), ('Em', 2800), ('D', 2863), ('A', 2935), ('E', 2956)]
[('Em', 0), ('C', 22), ('D', 59), ('Em', 72), ('D', 130), ('Em', 143), ('C', 199), ('D', 238), ('Em', 249), ('D', 312), ('Am', 334), ('Em', 408), ('Am', 502), ('Em', 574), ('D', 669), ('Am', 730), ('C', 792), ('Am', 815), ('Em', 847), ('D', 899), ('A', 953), ('Am', 999), ('Em', 1028), (

In [None]:
!zip -r /content/audios.zip /content/audios/
!zip -r /content/chords.zip /content/chords/
!zip -r /content/chord_sheets.zip /content/chord_sheets/
!zip -r /content/lyrics.zip /content/lyrics/
!zip -r /content/words.zip /content/lyrics-sync/output/
!zip -r /content/chords_evaluation.zip /content/chords_evaluation/