In [1]:
import pandas as pd

df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv")

  from .autonotebook import tqdm as notebook_tqdm
  df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv")
  df = pd.read_csv("hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv")


In [5]:
df.columns

Index(['id', 'chords', 'release_date', 'genres', 'decade', 'rock_genre',
       'artist_id', 'main_genre', 'spotify_song_id', 'spotify_artist_id'],
      dtype='object')

In [6]:
df['main_genre'].unique()

array(['pop', 'metal', nan, 'electronic', 'rock', 'soul', 'punk',
       'pop rock', 'country', 'alternative', 'jazz', 'rap', 'reggae'],
      dtype=object)

In [7]:
# Filter for pop genre and look at a sample of chord progressions
rock_df = df[df['main_genre'] == 'rock']
print(f"Number of rock songs: {len(rock_df)}")
print(f"\nSample chord progression:")
print(rock_df['chords'].iloc[0] if len(rock_df) > 0 else "No rock songs found")

Number of rock songs: 67238

Sample chord progression:
<intro_1> D G D G D G D G D G D Bb C D G D Bb C D <verse_1> G D Bb C D G D Bb C G D G D Bb C D G D Bb C D <chorus_1> C G C A D C <interlude_1> D G D Bb C D G D Bb C D <verse_2> G D Bb C D G D Bb C D G D Bb C D G D Bb C D <chorus_2> C G C A D <interlude_2> A D A D A C D <outro_1> G D Bb C D G D Bb C D G D Bb C D G D Bb C D G D G D Bb C D G D Bb C D


In [8]:
import re
from collections import Counter

def is_shuttle_pattern(progression):
    """
    Check if a 4-chord progression is a 2-chord shuttle (A-B-A-B pattern).
    Returns True if it's a shuttle, False if it's a true 4-chord narrative.
    """
    if len(progression) != 4:
        return False
    # Check for A-B-A-B pattern
    return progression[0] == progression[2] and progression[1] == progression[3]

def extract_aligned_4bar(data):
    """
    Extract the most common 4-bar progression that aligns with section boundaries.
    Prioritizes patterns that START sections (1234, 1234, 1234) over arbitrary windows (4123).
    Excludes 2-chord shuttles (like V-I-V-I) to only get true 4-chord narrative loops.
    """
    # 1. Parse Sections - split by tags to get clean lists of chords
    raw_sections = re.split(r'<[^>]+>', data)
    sections = [s.strip().split() for s in raw_sections if s.strip()]
    
    # Store all found 4-grams
    all_ngrams = []
    
    # Store ONLY 4-grams that are aligned to section start (position 0, 4, 8, ...)
    aligned_starters = set()

    for chords in sections:
        if len(chords) < 4:
            continue
        
        # Register aligned 4-grams (at positions 0, 4, 8, 12, ...)
        # These are the "true" bar-aligned patterns
        for start in range(0, len(chords) - 3, 4):
            aligned_tuple = tuple(chords[start:start+4])
            # Only add if NOT a shuttle pattern
            if not is_shuttle_pattern(aligned_tuple):
                aligned_starters.add(aligned_tuple)
        
        # Sliding window for ALL occurrences (to count frequency)
        for i in range(len(chords) - 3):
            seq = tuple(chords[i : i+4])
            # Only count if NOT a shuttle pattern
            if not is_shuttle_pattern(seq):
                all_ngrams.append(seq)

    if not all_ngrams:
        return None

    # 2. Count Frequencies
    counts = Counter(all_ngrams)
    
    # 3. Get candidates sorted by frequency
    candidates = counts.most_common()
    
    # Filter: Must repeat at least once (count >= 2) to be reliable
    valid_candidates = [c for c in candidates if c[1] >= 5]
    
    if not valid_candidates:
        return None

    # 4. Pick the Winner - prefer aligned patterns
    # First pass: Look for the most frequent candidate that is also aligned
    for seq, count in valid_candidates:
        if seq in aligned_starters:
            return list(seq)
    
    # Fallback: If no aligned patterns repeat enough, take the most frequent
    return list(valid_candidates[0][0])

# === TEST ===
data = "<intro_1> C <verse_1> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <verse_2> F C E7 Amin C F C G7 C F C E7 Amin C F G7 C <chorus_1> F C F C G C F C E7 Amin C F G7 C <solo_1> D <chorus_2> G D G D A D G D Fs7 Bmin D G A7 D G A7 D"

result = extract_aligned_4bar(data)
print(f"Extracted Progression: {result}")
# Should return ['F', 'C', 'E7', 'Amin'] (aligned at start of verse) 
# instead of ['C', 'F', 'G7', 'C'] which appears but isn't section-aligned

# Test shuttle filtering
print("\n--- Testing shuttle filter ---")
test_shuttles = [
    ('C', 'G', 'C', 'G'),      # Shuttle: should be excluded
    ('F', 'C', 'F', 'C'),      # Shuttle: should be excluded
    ('C', 'G', 'Amin', 'F'),   # True 4-chord: should be included
    ('I', 'V', 'vi', 'IV'),    # True 4-chord: should be included
    ('Amin', 'G', 'Amin', 'G'), # Shuttle: should be excluded
]
for prog in test_shuttles:
    is_shuttle = is_shuttle_pattern(prog)
    print(f"{prog}: {'SHUTTLE (excluded)' if is_shuttle else 'True 4-chord (included)'}")

Extracted Progression: ['F', 'C', 'E7', 'Amin']

--- Testing shuttle filter ---
('C', 'G', 'C', 'G'): SHUTTLE (excluded)
('F', 'C', 'F', 'C'): SHUTTLE (excluded)
('C', 'G', 'Amin', 'F'): True 4-chord (included)
('I', 'V', 'vi', 'IV'): True 4-chord (included)
('Amin', 'G', 'Amin', 'G'): SHUTTLE (excluded)


In [9]:
# Extract 4-bar progressions from all pop songs
rock_df['four_bar_progression'] = rock_df['chords'].apply(extract_aligned_4bar)

# Filter out songs where no valid progression was found
valid_progressions = rock_df[rock_df['four_bar_progression'].notna()].copy()

print(f"Total pop songs: {len(rock_df)}")
print(f"Songs with valid 4-bar progressions: {len(valid_progressions)}")
print(f"Songs without valid progressions: {len(rock_df) - len(valid_progressions)}")

# Show sample of extracted progressions
print("\nSample extracted progressions:")
valid_progressions[['four_bar_progression']].head(10)

Total pop songs: 67238
Songs with valid 4-bar progressions: 44136
Songs without valid progressions: 23102

Sample extracted progressions:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rock_df['four_bar_progression'] = rock_df['chords'].apply(extract_aligned_4bar)


Unnamed: 0,four_bar_progression
12,"[G, D, Bb, C]"
17,"[C, G, C, F]"
63,"[D, G, Emin, C]"
71,"[F, Amin, C, G]"
78,"[G, Gsus4, G, F]"
87,"[D, Emin, G, A]"
97,"[A, C, D, A]"
140,"[Cs, Gs, E, Fs]"
233,"[Eb, Gmin7, F, Eb]"
261,"[F, G, Cmin, F]"


In [10]:
# Define chord parsing and normalization functions

# All 12 chromatic notes in order (using sharps)
NOTES = ['C', 'Cs', 'D', 'Ds', 'E', 'F', 'Fs', 'G', 'Gs', 'A', 'As', 'B']
# Flat equivalents mapping
FLAT_TO_SHARP = {'Db': 'Cs', 'Eb': 'Ds', 'Fb': 'E', 'Gb': 'Fs', 'Ab': 'Gs', 'Bb': 'As', 'Cb': 'B'}

def parse_chord(chord):
    """
    Parse a chord string into (root, quality).
    Returns (root, 'major') or (root, 'minor') or (None, None) if invalid.
    """
    if not chord:
        return None, None
    
    # Handle flats first - convert to sharp equivalents
    for flat, sharp in FLAT_TO_SHARP.items():
        if chord.startswith(flat):
            chord = sharp + chord[len(flat):]
            break
    
    # Try to match root note (with optional sharp)
    root = None
    remainder = None
    
    if len(chord) >= 2 and chord[1] == 's':  # Sharp note like Cs, Fs
        root = chord[:2]
        remainder = chord[2:]
    elif len(chord) >= 1 and chord[0] in 'ABCDEFG':
        root = chord[0]
        remainder = chord[1:]
    else:
        return None, None
    
    # Validate root
    if root not in NOTES:
        return None, None
    
    # Determine quality
    remainder = remainder.lower() if remainder else ''
    
    if remainder in ['min', 'm', 'minor']:
        return root, 'minor'
    elif remainder in ['', 'maj', 'major']:
        return root, 'major'
    else:
        return None, None

def is_valid_triad_progression(progression):
    """Check if all chords in progression are major or minor triads."""
    if progression is None:
        return False
    for chord in progression:
        root, quality = parse_chord(chord)
        if root is None:
            return False
    return True

def interval_semitones(root1, root2):
    """Calculate interval in semitones between two roots."""
    idx1 = NOTES.index(root1)
    idx2 = NOTES.index(root2)
    return (idx2 - idx1) % 12

def detect_key(progression):
    """
    Detect the key of a 4-chord progression using music theory heuristics.
    Returns (tonic_root, mode) where mode is 'major' or 'minor'.
    
    Uses Krumhansl-Schmuckler key-finding algorithm principles:
    - Score each potential key by how well chords fit diatonic functions
    - Weight V->I motion heavily (strongest cadence)
    - Consider chord frequency and quality matching
    """
    if not progression or len(progression) < 2:
        return None, None
    
    # Parse all chords
    parsed = []
    for chord in progression:
        root, quality = parse_chord(chord)
        if root is None:
            return None, None
        parsed.append((root, quality))
    
    # Diatonic chords with weights (higher = more indicative of key)
    # Major: I(0), ii(2), iii(4), IV(5), V(7), vi(9)
    MAJOR_DIATONIC = {
        0: ('major', 3.0),   # I - tonic, strong indicator
        2: ('minor', 1.5),   # ii
        4: ('minor', 1.5),   # iii
        5: ('major', 2.5),   # IV - subdominant
        7: ('major', 3.0),   # V - dominant, strong indicator
        9: ('minor', 2.0),   # vi - relative minor
    }
    
    # Minor: i(0), III(3), iv(5), v/V(7), VI(8), VII(10)
    MINOR_DIATONIC = {
        0: ('minor', 3.0),   # i - tonic
        3: ('major', 2.0),   # III - relative major
        5: ('minor', 2.0),   # iv
        7: ('major', 2.5),   # V (harmonic minor) - common
        8: ('major', 2.0),   # VI
        10: ('major', 2.0),  # VII
    }
    
    best_score = -1
    best_key = None
    best_mode = None
    
    for tonic in NOTES:
        # === MAJOR KEY SCORING ===
        major_score = 0
        has_tonic = False
        has_dominant = False
        
        for i, (root, quality) in enumerate(parsed):
            interval = interval_semitones(tonic, root)
            
            if interval in MAJOR_DIATONIC:
                expected_quality, weight = MAJOR_DIATONIC[interval]
                if expected_quality == quality:
                    major_score += weight
                    if interval == 0:
                        has_tonic = True
                    if interval == 7:
                        has_dominant = True
                else:
                    major_score += 0.3  # Wrong quality but diatonic root
        
        # Bonus for V->I motion (authentic cadence)
        for i in range(len(parsed) - 1):
            root1, q1 = parsed[i]
            root2, q2 = parsed[i + 1]
            int1 = interval_semitones(tonic, root1)
            int2 = interval_semitones(tonic, root2)
            if int1 == 7 and int2 == 0 and q1 == 'major' and q2 == 'major':
                major_score += 3.0  # Strong V-I cadence
            if int1 == 5 and int2 == 0 and q1 == 'major' and q2 == 'major':
                major_score += 1.5  # IV-I plagal cadence
        
        # Bonus for having both tonic and dominant
        if has_tonic and has_dominant:
            major_score += 1.0
        
        if major_score > best_score:
            best_score = major_score
            best_key = tonic
            best_mode = 'major'
        
        # === MINOR KEY SCORING ===
        minor_score = 0
        has_tonic_minor = False
        has_dominant_minor = False
        
        for i, (root, quality) in enumerate(parsed):
            interval = interval_semitones(tonic, root)
            
            if interval in MINOR_DIATONIC:
                expected_quality, weight = MINOR_DIATONIC[interval]
                if expected_quality == quality:
                    minor_score += weight
                    if interval == 0:
                        has_tonic_minor = True
                    if interval == 7:
                        has_dominant_minor = True
                elif interval == 7 and quality == 'minor':
                    # v (natural minor) also acceptable
                    minor_score += 1.5
                else:
                    minor_score += 0.3
        
        # Bonus for V->i motion in minor
        for i in range(len(parsed) - 1):
            root1, q1 = parsed[i]
            root2, q2 = parsed[i + 1]
            int1 = interval_semitones(tonic, root1)
            int2 = interval_semitones(tonic, root2)
            if int1 == 7 and int2 == 0 and q2 == 'minor':
                minor_score += 3.0  # V-i cadence
        
        if has_tonic_minor and has_dominant_minor:
            minor_score += 1.0
        
        if minor_score > best_score:
            best_score = minor_score
            best_key = tonic
            best_mode = 'minor'
    
    return best_key, best_mode

def transpose_chord(chord, semitones):
    """Transpose a chord by a number of semitones."""
    root, quality = parse_chord(chord)
    if root is None:
        return None
    
    current_idx = NOTES.index(root)
    new_idx = (current_idx + semitones) % 12
    new_root = NOTES[new_idx]
    
    if quality == 'minor':
        return new_root + 'min'
    else:
        return new_root

def normalize_to_c(progression):
    """
    Normalize a progression to C (major or minor) by detecting the key
    and transposing accordingly.
    """
    if not progression:
        return None
    
    tonic, mode = detect_key(progression)
    if tonic is None:
        return None
    
    current_idx = NOTES.index(tonic)
    semitones = (0 - current_idx) % 12
    
    normalized = []
    for chord in progression:
        transposed = transpose_chord(chord, semitones)
        if transposed is None:
            return None
        normalized.append(transposed)
    
    return normalized

# Test the improved key detection
test_progs = [
    ['F', 'C', 'G', 'Amin'],       # C major: IV-I-V-vi
    ['G', 'D', 'Emin', 'C'],       # G major: I-V-vi-IV
    ['A', 'E', 'Fsmin', 'D'],      # A major: I-V-vi-IV
    ['Amin', 'F', 'C', 'G'],       # Ambiguous - vi-IV-I-V in C or i-VI-III-VII in Am
    ['Amin', 'G', 'F', 'E'],       # A minor: i-VII-VI-V (Andalusian)
    ['C', 'G', 'Amin', 'F'],       # C major: I-V-vi-IV
    ['Dmin', 'G', 'C', 'C'],       # C major: ii-V-I-I
    ['C', 'F', 'C', 'G'],          # C major: I-IV-I-V
    ['Emin', 'C', 'G', 'D'],       # G major: vi-IV-I-V (or E minor?)
]

print("Testing improved key detection:\n")
for prog in test_progs:
    tonic, mode = detect_key(prog)
    normalized = normalize_to_c(prog)
    print(f"Progression: {prog}")
    print(f"  Detected key: {tonic} {mode}")
    print(f"  Normalized to C: {normalized}")
    print()

Testing improved key detection:

Progression: ['F', 'C', 'G', 'Amin']
  Detected key: C major
  Normalized to C: ['F', 'C', 'G', 'Amin']

Progression: ['G', 'D', 'Emin', 'C']
  Detected key: G major
  Normalized to C: ['C', 'G', 'Amin', 'F']

Progression: ['A', 'E', 'Fsmin', 'D']
  Detected key: A major
  Normalized to C: ['C', 'G', 'Amin', 'F']

Progression: ['Amin', 'F', 'C', 'G']
  Detected key: C major
  Normalized to C: ['Amin', 'F', 'C', 'G']

Progression: ['Amin', 'G', 'F', 'E']
  Detected key: A minor
  Normalized to C: ['Cmin', 'As', 'Gs', 'G']

Progression: ['C', 'G', 'Amin', 'F']
  Detected key: C major
  Normalized to C: ['C', 'G', 'Amin', 'F']

Progression: ['Dmin', 'G', 'C', 'C']
  Detected key: C major
  Normalized to C: ['Dmin', 'G', 'C', 'C']

Progression: ['C', 'F', 'C', 'G']
  Detected key: C major
  Normalized to C: ['C', 'F', 'C', 'G']

Progression: ['Emin', 'C', 'G', 'D']
  Detected key: G major
  Normalized to C: ['Amin', 'F', 'C', 'G']



In [11]:
# Roman numeral conversion (based on interval from tonic)
# For major keys
MAJOR_ROMAN = {
    0: 'I', 1: '♭II', 2: 'II', 3: '♭III', 4: 'III', 5: 'IV',
    6: '♯IV', 7: 'V', 8: '♭VI', 9: 'VI', 10: '♭VII', 11: 'VII'
}

def chord_to_roman(chord, tonic, mode):
    """
    Convert a chord to Roman numeral notation relative to the tonic.
    Major chords = uppercase, Minor chords = lowercase
    """
    root, quality = parse_chord(chord)
    if root is None:
        return None
    
    interval = interval_semitones(tonic, root)
    
    # Get base Roman numeral
    roman = MAJOR_ROMAN.get(interval, str(interval))
    
    # Lowercase for minor chords
    if quality == 'minor':
        roman = roman.lower()
    
    return roman

def normalize_to_roman(progression):
    """
    Convert a progression to Roman numeral notation.
    First detects the key, then expresses each chord relative to that tonic.
    """
    if not progression:
        return None
    
    tonic, mode = detect_key(progression)
    if tonic is None:
        return None
    
    roman_numerals = []
    for chord in progression:
        roman = chord_to_roman(chord, tonic, mode)
        if roman is None:
            return None
        roman_numerals.append(roman)
    
    return roman_numerals

# Apply to pop songs: filter for valid triads and normalize to Roman numerals

# First, filter progressions that only contain major/minor triads
valid_progressions['is_triad_only'] = valid_progressions['four_bar_progression'].apply(is_valid_triad_progression)
triad_only = valid_progressions[valid_progressions['is_triad_only']].copy()

# Normalize to Roman numerals
triad_only['normalized_progression'] = triad_only['four_bar_progression'].apply(normalize_to_roman)

print(f"Pop songs with valid 4-bar progressions: {len(valid_progressions)}")
print(f"Progressions with only major/minor triads: {len(triad_only)}")

# Show sample with original and Roman numeral versions
print("\nSample normalized progressions:")
sample = triad_only[['four_bar_progression', 'normalized_progression']].head(10)
for idx, row in sample.iterrows():
    orig = ' → '.join(row['four_bar_progression'])
    roman = ' → '.join(row['normalized_progression']) if row['normalized_progression'] else 'N/A'
    print(f"  {orig:<30} => {roman}")

Pop songs with valid 4-bar progressions: 44136
Progressions with only major/minor triads: 32289

Sample normalized progressions:
  G → D → Bb → C                 => I → V → ♭III → IV
  C → G → C → F                  => I → V → I → IV
  D → G → Emin → C               => V → I → vi → IV
  F → Amin → C → G               => IV → vi → I → V
  D → Emin → G → A               => I → ii → IV → V
  A → C → D → A                  => V → ♭VII → I → V
  Cs → Gs → E → Fs               => I → V → ♭III → IV
  F → G → Cmin → F               => IV → V → i → IV
  Eb → Gmin → Ab → Eb            => I → iii → IV → I
  Amin → F → C → Dmin            => vi → IV → I → ii


In [12]:
# Count the most common normalized 4-bar progressions in pop (Roman numerals)
from collections import Counter

# Convert lists to tuples for counting
progression_counts = Counter(tuple(p) for p in triad_only['normalized_progression'] if p is not None)

print("Most common 4-bar progressions in pop (Roman numerals):\n")
print(f"{'Rank':<5} {'Progression':<25} {'Count':<8} {'%':<6}")
print("-" * 50)

total = len(triad_only)
for i, (prog, count) in enumerate(progression_counts.most_common(20), 1):
    prog_str = " → ".join(prog)
    pct = (count / total) * 100
    print(f"{i:<5} {prog_str:<25} {count:<8} {pct:.2f}%")

Most common 4-bar progressions in pop (Roman numerals):

Rank  Progression               Count    %     
--------------------------------------------------
1     IV → I → V → IV           975      3.02%
2     I → IV → V → I            972      3.01%
3     V → IV → I → V            958      2.97%
4     I → V → IV → I            927      2.87%
5     I → IV → I → V            824      2.55%
6     vi → IV → I → V           780      2.42%
7     I → V → vi → IV           767      2.38%
8     IV → I → V → I            709      2.20%
9     IV → V → I → IV           604      1.87%
10    I → vi → IV → V           534      1.65%
11    V → I → IV → V            420      1.30%
12    V → I → IV → I            415      1.29%
13    i → ♭VI → ♭VII → i        345      1.07%
14    IV → I → V → vi           339      1.05%
15    I → V → IV → V            320      0.99%
16    IV → V → vi → IV          308      0.95%
17    vi → I → V → vi           268      0.83%
18    I → V → I → IV            263      0.81

In [13]:
# Save to CSV
# Create a dataframe with progression counts (Roman numerals)
progression_df = pd.DataFrame([
    {
        'progression': ' → '.join(prog),
        'chord_1': prog[0],
        'chord_2': prog[1],
        'chord_3': prog[2],
        'chord_4': prog[3],
        'count': count,
        'percentage': (count / total) * 100
    }
    for prog, count in progression_counts.most_common()
])

# Save to CSV
output_path = '../results/rock_4bar_progressions_roman.csv'
progression_df.to_csv(output_path, index=False)
print(f"Saved {len(progression_df)} unique progressions to {output_path}")
progression_df.head(10)

Saved 2223 unique progressions to ../results/rock_4bar_progressions_roman.csv


Unnamed: 0,progression,chord_1,chord_2,chord_3,chord_4,count,percentage
0,IV → I → V → IV,IV,I,V,IV,975,3.019604
1,I → IV → V → I,I,IV,V,I,972,3.010313
2,V → IV → I → V,V,IV,I,V,958,2.966955
3,I → V → IV → I,I,V,IV,I,927,2.870947
4,I → IV → I → V,I,IV,I,V,824,2.551953
5,vi → IV → I → V,vi,IV,I,V,780,2.415683
6,I → V → vi → IV,I,V,vi,IV,767,2.375422
7,IV → I → V → I,IV,I,V,I,709,2.195794
8,IV → V → I → IV,IV,V,I,IV,604,1.870606
9,I → vi → IV → V,I,vi,IV,V,534,1.653814
