In [1]:
%pip install kaleido==0.1.0post1
%pip install tqdm

# ---- Configuration ---------------------------------------------------------
FPS = 30  # video frames per second
FFT_WINDOW_SECONDS = 0.25  # seconds of audio covered by each FFT window

# Frequency range shown on the x-axis of the spectrum plot
FREQ_MIN, FREQ_MAX = 10, 1000

# Maximum number of notes to label on each frame
TOP_NOTES = 3

# Pitch-class names, indexed by (note number % 12), starting at C
NOTE_NAMES = "C C# D D# E F F# G G# A A# B".split()

# Base output size in pixels. Prefer raising SCALE for higher resolution and
# only change RESOLUTION when a non-standard aspect ratio is needed.
RESOLUTION = (1920, 1080)
# Resolution multiplier: 0.5 = qHD (960x540), 1 = HD (1920x1080), 2 = 4K (3840x2160)
# NOTE(review): this only takes effect if the frame-export code passes it to write_image.
SCALE = 2

Collecting kaleido==0.1.0post1
  Downloading kaleido-0.1.0.post1-py2.py3-none-win_amd64.whl.metadata (15 kB)
Downloading kaleido-0.1.0.post1-py2.py3-none-win_amd64.whl (56.0 MB)
   ---------------------------------------- 0.0/56.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.0 MB 1.3 MB/s eta 0:00:44
   ---------------------------------------- 0.2/56.0 MB 5.0 MB/s eta 0:00:12
   ---------------------------------------- 0.5/56.0 MB 5.9 MB/s eta 0:00:10
    --------------------------------------- 0.8/56.0 MB 6.8 MB/s eta 0:00:09
    --------------------------------------- 1.1/56.0 MB 7.2 MB/s eta 0:00:08
    --------------------------------------- 1.4/56.0 MB 6.7 MB/s eta 0:00:09
   - -------------------------------------- 1.9/56.0 MB 8.0 MB/s eta 0:00:07
   - -------------------------------------- 2.2/56.0 MB 8.1 MB/s eta 0:00:07
   - -------------------------------------- 2.6/56.0 MB 8.3 MB/s eta 0:00:07
   -- ------------------------------------- 2.9/56.0 MB 8.5

In [2]:
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile # get the api
import os

# Recording to visualize
AUDIO_FILE = "c-scale-demo.wav"

# wavfile.read returns the sample rate and the sample array
fs, data = wavfile.read(AUDIO_FILE)

# Work on a single channel: take the first track of a multi-channel
# recording, otherwise use the samples exactly as loaded.
audio = data.T[0] if (data.ndim > 1 and data.shape[1] > 1) else data

FRAME_STEP = fs / FPS  # audio samples per video frame
FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS)  # audio samples per FFT window
AUDIO_LENGTH = len(audio) / fs  # recording length in seconds

In [3]:
import plotly.graph_objects as go

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
  """Build the spectrum figure for a single video frame.

  Args:
    p: values plotted on the y-axis (one per entry of ``xf``).
    xf: x-axis values for ``p``.
    fs: not used by this function; kept so existing calls keep working.
    notes: iterable of ``[x, label, y]`` entries; each one adds a text label.
    dimensions: ``(width, height)`` of the figure.

  Returns:
    The plotly figure containing the spectrum trace and the note labels.
  """
  figure_layout = go.Layout(
      title="frequency spectrum",
      autosize=False,
      width=dimensions[0],
      height=dimensions[1],
      xaxis_title="Frequency (note)",
      yaxis_title="Magnitude",
      font=dict(size=24),
  )

  # Both axis ranges are fixed (x from the FREQ_* settings, y to [0, 1]).
  figure = go.Figure(
      layout=figure_layout,
      layout_xaxis_range=[FREQ_MIN, FREQ_MAX],
      layout_yaxis_range=[0, 1],
  )

  spectrum_trace = go.Scatter(x=xf, y=p)
  figure.add_trace(spectrum_trace)

  for entry in notes:
    # The label is placed 10 x-units to the right of the entry's x position.
    figure.add_annotation(
        x=entry[0] + 10,
        y=entry[2],
        text=entry[1],
        font=dict(size=48),
        showarrow=False,
    )
  return figure

def extract_sample(audio, frame_number):
  """Return the slice of ``audio`` that feeds the FFT for one video frame.

  The slice ends (exclusively) at sample ``frame_number * FRAME_OFFSET`` and
  starts ``FFT_WINDOW_SIZE`` samples earlier. While that start would fall
  before the beginning of the audio, the missing part is filled with zeros.
  """
  stop = frame_number * FRAME_OFFSET
  start = int(stop - FFT_WINDOW_SIZE)

  # Very first frame: no audio has been reached yet, so return only zeros.
  if stop == 0:
    return np.zeros(np.abs(start), dtype=float)

  # Early frames: some audio is available, pad the front with zeros.
  if start < 0:
    leading_zeros = np.zeros(np.abs(start), dtype=float)
    return np.concatenate([leading_zeros, audio[0:stop]])

  # Usual case: a plain slice of the audio.
  return audio[start:stop]

def find_top_notes(fft,num):
  """Return up to ``num`` of the strongest distinct notes in a spectrum.

  Args:
    fft: spectrum magnitudes, aligned index-for-index with the module-level
      ``xf`` frequency array.
    num: maximum number of notes to return.

  Returns:
    A list of ``[frequency, note_name, magnitude]`` entries, ordered from the
    strongest bin downwards, with one entry per distinct note name. The list
    is empty when the spectrum is empty or its peak is below 0.001.
  """
  magnitudes = fft.real
  # Treat an empty or (near-)silent spectrum as "no notes".
  if np.size(magnitudes) == 0 or np.max(magnitudes) < 0.001:
    return []

  # Stable argsort on the negated values visits bins from strongest to
  # weakest and keeps equal magnitudes in index order, like the stable
  # descending sort it replaces.
  strongest_first = np.argsort(-np.asarray(magnitudes, dtype=float), kind="stable")

  found = []
  found_note = set()
  for bin_index in strongest_first:
    if len(found) >= num:
      break

    f = xf[bin_index]
    if f <= 0:
      # The DC bin has no pitch: log2(0) is -inf and cannot be rounded to a
      # note number, so skip it instead of raising OverflowError.
      continue

    n0 = int(round(freq_to_number(f)))
    name = note_name(n0)

    # Several neighbouring bins map to the same note; keep the strongest.
    if name not in found_note:
      found_note.add(name)
      found.append([f, name, magnitudes[bin_index]])

  return found

In [5]:
import plotly.graph_objects as go
import plotly.io as pio

# Build a minimal four-point scatter figure
fig = go.Figure()
fig.add_trace(go.Scatter(x=[1, 2, 3, 4], y=[10, 15, 13, 17]))

# Write it out as a PNG; presumably a quick check that static image export
# works in this environment before the full frame rendering is run.
fig.write_image('test_image.png')

In [6]:
import numpy as np
import tqdm

# See https://newt.phys.unsw.edu.au/jw/notes.html
def freq_to_number(f):
  """Convert a frequency to a (fractional) note number, where 440 Hz maps to 69."""
  semitones_from_440 = 12*np.log2(f/440.0)
  return 69 + semitones_from_440
def number_to_freq(n):
  """Convert a note number to its frequency, where note 69 maps to 440 Hz."""
  octaves_from_69 = (n-69)/12.0
  return 440 * 2.0**octaves_from_69
def note_name(n):
  """Return the name of integer note number ``n``, e.g. 69 -> "A4", 60 -> "C4".

  The pitch class comes from ``NOTE_NAMES[n % 12]`` and the octave number is
  ``n // 12 - 1``.
  """
  # Floor division (not int() truncation) keeps the octave right for notes
  # below 12: int(n/12 - 1) rounds toward zero and would label note 11 as
  # "B0" instead of "B-1".
  return NOTE_NAMES[n % 12] + str(n // 12 - 1)

# Hann window over FFT_WINDOW_SIZE points: 0.5 * (1 - cos(theta)), with theta
# stepping from 0 towards 2*pi (the end point itself is excluded).
window = 0.5 * (1 - np.cos(
    np.linspace(0, 2*np.pi, num=FFT_WINDOW_SIZE, endpoint=False)
))

# Frequency of each rfft bin for a window of FFT_WINDOW_SIZE samples
xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, d=1/fs)
# Number of video frames needed to cover the whole recording
FRAME_COUNT = int(AUDIO_LENGTH*FPS)
# Whole number of audio samples to advance per video frame
FRAME_OFFSET = int(len(audio)/FRAME_COUNT)

# Pass 1: scan the spectrum of every frame and remember the largest magnitude
# seen, so the second pass can scale all frames by the same factor.
mx = 0
for frame_idx in range(FRAME_COUNT):
  windowed = extract_sample(audio, frame_idx) * window
  magnitudes = np.abs(np.fft.rfft(windowed))
  frame_peak = np.max(magnitudes)
  mx = max(frame_peak, mx)

print(f"Max amplitude: {mx}")

# Pass 2: render one PNG per video frame into the frames/ directory.
print("Producing frames...")

# Make sure the output directory exists before the first write; otherwise
# every frame fails and the only symptom is a stream of printed errors.
os.makedirs("frames", exist_ok=True)

# A completely silent recording has mx == 0; fall back to 1.0 so the
# normalization below does not divide by zero and produce NaN spectra.
norm = mx if mx > 0 else 1.0

for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
  sample = extract_sample(audio, frame_number)

  # Magnitude spectrum scaled by the global peak found in pass 1, which
  # matches the fixed [0, 1] y-axis used by plot_fft. Named `spectrum` so the
  # `fft` function imported from scipy is not rebound here.
  spectrum = np.abs(np.fft.rfft(sample * window)) / norm

  s = find_top_notes(spectrum, TOP_NOTES)

  fig = plot_fft(spectrum, xf, fs, s, RESOLUTION)
  try:
    # Use the SCALE setting from the configuration instead of a hard-coded 2.
    fig.write_image(f"frames/frame{frame_number}.png", scale=SCALE)
  except Exception as e:
    # Best effort: report the failure and keep rendering the remaining frames.
    print(f"Error writing image: {e}")


Max amplitude: 31356926.787363894
Producing frames...
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
here fft
here notes
here plot
here write
her

In [None]:
# Combine the rendered frames and the source audio into demo-movie.mp4.
#   -y                               overwrite the output file without asking
#   -r {FPS} -f image2 -i frames/... read frames/frame%d.png as an image sequence at FPS frames/second
#   -i {AUDIO_FILE}                  second input: the original recording, used as the audio track
#   -c:v libx264 -pix_fmt yuv420p    encode the video as H.264 with the yuv420p pixel format
# ffmpeg options apply to the next input/output that follows them, so the
# order of the arguments below matters.
# NOTE(review): -s 1920x1080 is given before the image input, while the frames
# are exported from a 1920x1080 layout with scale=2 -- confirm the intended
# output size.
!ffmpeg -y -r {FPS} -f image2 -s 1920x1080 -i frames/frame%d.png -i {AUDIO_FILE} -c:v libx264 -pix_fmt yuv420p demo-movie.mp4

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e