In [77]:
!pip install tensorflow



In [78]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install librosa scikit-learn matplotlib numpy soundfile moviepy pillow



In [79]:
import numpy as np
import librosa
import soundfile as sf
from sklearn.ensemble import RandomForestClassifier
from moviepy.editor import VideoClip, AudioFileClip
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
import tempfile
import random

In [80]:
SAMPLE_RATE = 22050

CHORDS = {
    # Major Triads
    "C Major":   ['C', 'E', 'G'],
    "C# Major":  ['C#', 'F', 'G#'],
    "D Major":   ['D', 'F#', 'A'],
    "D# Major":  ['D#', 'G', 'A#'],
    "E Major":   ['E', 'G#', 'B'],
    "F Major":   ['F', 'A', 'C'],
    "F# Major":  ['F#', 'A#', 'C#'],
    "G Major":   ['G', 'B', 'D'],
    "G# Major":  ['G#', 'C', 'D#'],
    "A Major":   ['A', 'C#', 'E'],
    "A# Major":  ['A#', 'D', 'F'],
    "B Major":   ['B', 'D#', 'F#'],

    # Minor Triads
    "C Minor":   ['C', 'D#', 'G'],
    "C# Minor":  ['C#', 'E', 'G#'],
    "D Minor":   ['D', 'F', 'A'],
    "D# Minor":  ['D#', 'F#', 'A#'],
    "E Minor":   ['E', 'G', 'B'],
    "F Minor":   ['F', 'G#', 'C'],
    "F# Minor":  ['F#', 'A', 'C#'],
    "G Minor":   ['G', 'A#', 'D'],
    "G# Minor":  ['G#', 'B', 'D#'],
    "A Minor":   ['A', 'C', 'E'],
    "A# Minor":  ['A#', 'C#', 'F'],
    "B Minor":   ['B', 'D', 'F#']
}

NOTE_FREQ = {
    'C': 261.63, 'C#': 277.18, 'D': 293.66, 'D#': 311.13, 'E': 329.63, 'F': 349.23,
    'F#': 369.99, 'G': 392.00, 'G#': 415.30, 'A': 440.00, 'A#': 466.16, 'B': 493.88
}


def synth_chord(notes, duration=1.0, sr=SAMPLE_RATE, noise_level=0.01):
    # ... (synth_chord function remains the same) ...
    t = np.linspace(0, duration, int(sr * duration), False)
    audio = sum(np.sin(2 * np.pi * NOTE_FREQ[note] * t) for note in notes)
    audio /= len(notes)

    # Apply a simple fade out
    fade = np.linspace(1, 0.1, len(audio))
    audio = audio * fade

    # Add controlled random noise (could be improved with more realistic noise)
    noise = noise_level * np.random.randn(len(audio)) * np.random.uniform(0.5, 1.5) # Vary noise level
    audio += noise

    # Add volume variation
    volume_factor = random.uniform(0.8, 1.2)
    audio *= volume_factor

    # Ensure audio is within valid range
    audio = np.clip(audio, -1.0, 1.0)

    return audio.astype(np.float32)


# Prepare data for CNN
X_cnn, y_cnn = [], []
max_padding = 0 # To find the maximum feature length for padding

for chord_label, notes in CHORDS.items():
    for i in range(100):  # Increased samples per chord
        duration = random.uniform(0.5, 1.5)
        audio = synth_chord(notes, duration=duration)
        y_audio = audio

        # Extract Mel Spectrogram
        # n_mels can be adjusted, n_fft and hop_length affect time/frequency resolution
        mel_spec = librosa.feature.melspectrogram(y=y_audio, sr=SAMPLE_RATE, n_mels=128, n_fft=2048, hop_length=512)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        X_cnn.append(mel_spec_db)
        y_cnn.append(chord_label)

        # Update max padding
        if mel_spec_db.shape[1] > max_padding:
            max_padding = mel_spec_db.shape[1]

# Pad sequences to a fixed length
X_padded = []
for spec in X_cnn:
    padded_spec = librosa.util.pad_center(spec, size=max_padding, axis=1)
    X_padded.append(padded_spec)

X_padded = np.array(X_padded)
# Add channel dimension for CNN input (batch_size, height, width, channels)
X_cnn = np.expand_dims(X_padded, axis=-1)

# Convert labels to numerical format
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_cnn_encoded = label_encoder.fit_transform(y_cnn)


In [83]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split

# Split data
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_cnn_encoded, test_size=0.2, random_state=42)

# Define the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(X_cnn.shape[1], X_cnn.shape[2], 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5), # Add dropout for regularization
    Dense(len(label_encoder.classes_), activation='softmax') # Output layer with number of classes
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', # Use sparse_categorical_crossentropy for integer labels
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_cnn, y_train_cnn, epochs=50, batch_size=32, validation_split=0.2) # Use a validation split during training

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_cnn, y_test_cnn, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# You can also get predictions and use classification_report if needed
# y_pred_cnn = model.predict(X_test_cnn)
# y_pred_classes = np.argmax(y_pred_cnn, axis=1)
# print("Test Report with CNN:\n", classification_report(y_test_cnn, y_pred_classes, target_names=label_encoder.classes_))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Epoch 1/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 486ms/step - accuracy: 0.0390 - loss: 6.2902 - val_accuracy: 0.0417 - val_loss: 3.1727
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 489ms/step - accuracy: 0.0340 - loss: 3.1821 - val_accuracy: 0.0391 - val_loss: 3.1772
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 451ms/step - accuracy: 0.0375 - loss: 3.1777 - val_accuracy: 0.0339 - val_loss: 3.1789
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 475ms/step - accuracy: 0.0461 - loss: 3.1779 - val_accuracy: 0.0469 - val_loss: 3.1773
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 467ms/step - accuracy: 0.1038 - loss: 3.0259 - val_accuracy: 0.7057 - val_loss: 1.1063
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 494ms/step - accuracy: 0.5841 - loss: 1.2134 - val_accuracy: 1.0000 - val_loss: 0.0658
Epoch 7/50
[1m48/48[

In [85]:
# -- Provide your audio file here:
audio_file = "scientists.wav"  # <---- CHANGE THIS

# Analyze per-beat chords (with ML)
y_song, sr = librosa.load(audio_file, sr=SAMPLE_RATE)
duration = librosa.get_duration(y=y_song, sr=sr)
tempo, beat_frames = librosa.beat.beat_track(y=y_song, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

# Chord prediction per beat using the trained CNN model
window_size_samples = int(sr * 1.0)

ml_chord_on_beats = []

for i, beat_time in enumerate(beat_times):
    # Define segment start and end around the beat
    start_sample = int(max(0, sr * beat_time - window_size_samples / 2))
    end_sample = int(min(len(y_song), sr * beat_time + window_size_samples / 2))
    segment = y_song[start_sample:end_sample]

    if len(segment) == 0:
        ml_chord_on_beats.append(("Unknown", beat_time))
        continue

    # Extract Mel Spectrogram, matching training parameters (n_mels, n_fft, hop_length)
    try:
        mel_spec = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        # Pad the segment's mel spectrogram to the same size as training data
        # Use the max_padding value calculated during training data prep
        if 'max_padding' not in globals():

             pass

        # Pad the segment feature
        padded_spec = librosa.util.pad_center(mel_spec_db, size=max_padding, axis=1)

        # Reshape for CNN input (batch_size=1, height, width, channels=1)
        cnn_input_feat = np.expand_dims(np.expand_dims(padded_spec, axis=0), axis=-1)

        # Predict using the trained CNN model
        prediction_proba = model.predict(cnn_input_feat, verbose=0)
        predicted_class_index = np.argmax(prediction_proba, axis=1)[0]
        # Decode the numerical prediction back to chord label
        chord_label = label_encoder.inverse_transform([predicted_class_index])[0]

    except Exception as e:
        # Handle potential errors during feature extraction or prediction
        print(f"Error processing segment at beat time {beat_time}: {e}")
        chord_label = "Unknown"


    ml_chord_on_beats.append((chord_label, beat_time))

# Merge repeated chords for better visuals
chord_timeline = []
# Include the end of the song for the last beat's chord duration
beat_times_with_end = np.append(beat_times, duration)

for i in range(len(beat_times_with_end) - 1):
    # Ensure ml_chord_on_beats has an entry for each beat time
    # If a beat segment was empty, it would have been added as "Unknown"
    if i < len(ml_chord_on_beats):
        chord_label = ml_chord_on_beats[i][0]
        start_time = beat_times_with_end[i]
        end_time = beat_times_with_end[i+1]
        chord_timeline.append((start_time, end_time, chord_label))
    else:
        # Handle cases where ml_chord_on_beats might be shorter than beat_times
        print(f"Warning: No chord prediction for beat index {i}")
        last_known_chord = ml_chord_on_beats[-1][0] if ml_chord_on_beats else "Unknown"
        start_time = beat_times_with_end[i]
        end_time = beat_times_with_end[i+1]
        chord_timeline.append((start_time, end_time, last_known_chord))


merged_chords = []
for segment in chord_timeline:
    if not merged_chords or merged_chords[-1][2] != segment[2]:
        merged_chords.append(segment)
    else:
        merged_chords[-1] = (merged_chords[-1][0], segment[1], merged_chords[-1][2])

In [86]:
chord_palette = {
    # Major chords – cheerful, warm, and vivid
    "C Major":   ((255, 255, 0),     'circle'),     # Bright Yellow – Happy, sunny resolution
    "C# Major":  ((255, 0, 255),     'triangle'),   # Magenta – vivid, expressive
    "D Major":   ((0, 255, 0),       'circle'),     # Vibrant Green – Triumphant, fresh
    "D# Major":  ((255, 153, 51),    'hex'),        # Warm Orange – bold, animated
    "E Major":   ((255, 51, 255),    'star'),       # Light Magenta – open and colorful
    "F Major":   ((255, 204, 0),     'rect'),       # Golden Yellow – bright, balanced
    "F# Major":  ((102, 255, 102),   'triangle'),   # Soft Mint – smooth and lush
    "G Major":   ((255, 85, 0),      'circle'),     # Orange-Red – lively, warm
    "G# Major":  ((255, 0, 102),     'star'),       # Hot Pink – playful, vivid
    "A Major":   ((255, 255, 255),   'rect'),       # White – radiant, uplifting (you may change this)
    "A# Major":  ((0, 255, 255),     'triangle'),   # Cyan – energetic and clear
    "B Major":   ((255, 0, 0),       'hex'),        # True Red – confident, intense

    # Minor chords – deep, melancholic, or rich
    "C Minor":   ((51, 0, 153),      'rect'),       # Indigo – calm, reflective
    "C# Minor":  ((75, 0, 130),      'hex'),        # Dark Violet – poetic, obscure
    "D Minor":   ((0, 102, 204),     'triangle'),   # Deep Blue – introspective
    "D# Minor":  ((102, 0, 204),     'star'),       # Midnight Purple – mysterious
    "E Minor":   ((153, 51, 255),    'triangle'),   # Rich Violet – somber but rich
    "F Minor":   ((0, 51, 153),      'hex'),        # Navy Blue – deep, serious
    "F# Minor":  ((0, 0, 204),       'star'),       # Cobalt – cold, immersive
    "G Minor":   ((30, 70, 180),     'rect'),       # Ocean Blue – fragile, thoughtful
    "G# Minor":  ((60, 60, 220),     'triangle'),   # Steel Blue – icy, controlled
    "A Minor":   ((0, 0, 153),       'triangle'),   # Deep Blue – sad or mellow
    "A# Minor":  ((102, 0, 204),     'star'),       # Blue-Violet – dramatic, mysterious
    "B Minor":   ((85, 85, 255),     'hex'),        # Electric Indigo – nostalgic, dreamy

    "Unknown":   ((160, 160, 160),   'circle')      # Mid Gray – undefined
}
W, H = 720, 720


def draw_shape(draw, shape, color, size, center, t_frac):
    x, y = center
    if shape == "circle":
        r = int(size * (0.9 + 0.15*np.sin(2*np.pi*t_frac)))
        draw.ellipse([x - r, y - r, x + r, y + r], fill=color, outline=None)
    elif shape == "rect":
        s = int(size * (0.85 + 0.2*np.cos(2*np.pi*t_frac)))
        draw.rectangle([x - s, y - s, x + s, y + s], fill=color)
    elif shape == "triangle":
        s = int(size * (0.85 + 0.2*np.sin(4*np.pi*t_frac)))
        pts = [(x, y - s), (x - s, y + s), (x + s, y + s)]
        draw.polygon(pts, fill=color)
    elif shape == "hex":
        s = int(size * (0.85 + 0.15*np.cos(4*np.pi*t_frac)))
        angle = np.linspace(0, 2*np.pi, 7)
        pts = [(x + s*np.cos(a), y + s*np.sin(a)) for a in angle]
        draw.polygon(pts, fill=color)
    elif shape == "star":
        s = size
        pts = []
        for i in range(10):
            r = s if i % 2 == 0 else s//2
            theta = np.pi/5 * i + 2*np.pi*t_frac
            pts.append((x + int(r * np.sin(theta)), y - int(r * np.cos(theta))))
        draw.polygon(pts, fill=color)
    # More shapes

def make_frame(t):
    idx = np.searchsorted([bt for _, bt in ml_chord_on_beats], t, side='right') - 1
    chord, bt = ml_chord_on_beats[max(idx, 0)]
    color, shape = chord_palette.get(chord, ((180,180,180), "circle"))
    img = Image.new("RGB", (W, H), (30, 30, 30))
    draw = ImageDraw.Draw(img)
    # Artistic transitions
    next_idx = min(idx+1, len(ml_chord_on_beats)-1)
    bt1 = ml_chord_on_beats[idx][1]
    bt2 = ml_chord_on_beats[next_idx][1] if next_idx != idx else duration
    t_frac = (t - bt1) / max(0.001, (bt2 - bt1))
    next_color = chord_palette.get(ml_chord_on_beats[next_idx][0], ((180,180,180), shape))[0]
    curr_col = tuple(int((1-t_frac)*c1 + t_frac*c2) for c1, c2 in zip(color, next_color))
    draw_shape(draw, shape, curr_col, 170, (W//2, H//2), t_frac)
    try:
        font = ImageFont.truetype("DejaVuSans-Bold.ttf", 54)
    except:
        font = ImageFont.load_default()
    draw.text((W//2-70, H//2+180), chord, font=font, fill=(255,255,255,220))
    return np.array(img)

# %%
video_duration = duration
video = VideoClip(make_frame, duration=video_duration)
audio = AudioFileClip(audio_file).subclip(0, video_duration)
video = video.set_audio(audio)

outpath = tempfile.mktemp(suffix='.mp4')
video.write_videofile(outpath, fps=12, codec="libx264", audio_codec="aac")

Moviepy - Building video /tmp/tmp450u2gsi.mp4.
MoviePy - Writing audio in tmp450u2gsiTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video /tmp/tmp450u2gsi.mp4





Moviepy - Done !
Moviepy - video ready /tmp/tmp450u2gsi.mp4


In [88]:
video_duration = duration
video = VideoClip(make_frame, duration=video_duration)
audio = AudioFileClip(audio_file).subclip(0, video_duration)
video = video.set_audio(audio)

outpath = tempfile.mktemp(suffix='.mp4')
video.write_videofile(outpath, fps=12, codec="libx264", audio_codec="aac")


Moviepy - Building video /tmp/tmpn1g8wwom.mp4.
MoviePy - Writing audio in tmpn1g8wwomTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video /tmp/tmpn1g8wwom.mp4





Moviepy - Done !
Moviepy - video ready /tmp/tmpn1g8wwom.mp4
