In [1]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile # get the api
import numpy as np
import tqdm
import os


# See https://newt.phys.unsw.edu.au/jw/notes.html

NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def freq_to_midi(f): return 69 + 12*np.log2(f/440.0) # Pitch is often given by MIDI number
def midi_to_freq(n): return 440 * 2.0**((n-69)/12.0) # MIDI back to Frequency
def note_name(n): return NOTE_NAMES[n % 12] + str(int(n/12 - 1)) # Gives Detailed Note i.e. G3



In [24]:
# Grabbing file path
TRACKS_DIR = os.path.join(os.getcwd(), "tracks")
AUDIO_FILE = os.path.join(TRACKS_DIR, "dos_gardenias.wav")

sampling_rate, data = wavfile.read(AUDIO_FILE) # load the data
audio = data.T[0] # this is a two channel soundtrack, get the first channel

FTT_WINDOW_SEC = 0.25 # In Seconds: We're going to extract most pronounced notes from every 0.250 second window
FFT_WINDOW_SIZE = int(sampling_rate * FTT_WINDOW_SEC) # In Data Points - How many data points in each FFT window
AUDIO_LENGTH = len(audio)/sampling_rate # Track length in seconds 

FREQ_MIN = 10 # Mininum Frequency to display
FREQ_MAX = 4200 # Maximum frequency to display
TOP_N_NOTES = 3

# Hanning window function # TODO: Do more research to understand
# "Hanning window smoothly tapers the endpoints to zero and mitigates the discontinuity that produces leakage."
window = 0.5 * (1 - np.cos(np.linspace(0, 2*np.pi, FFT_WINDOW_SIZE, False)))

# In this context - A Frame is just an FFT Window
FRAMES_PER_SEC = 30 # int(1/FTT_WINDOW_SEC) # Just change to 1/window size? WINDOW SIZE Must be <= 1
FRAME_COUNT = int(AUDIO_LENGTH*FRAMES_PER_SEC) # Frames Per Sec in Video * Length of track in seconds
FRAME_OFFSET = int(len(audio)/FRAME_COUNT) # How many data points are in each frame


def extract_sample(audio, frame_number):
    end = frame_number * FRAME_OFFSET
    begin = int(end - FFT_WINDOW_SIZE)

    if end == 0:  # We have no audio yet, return all zeros (very beginning)
        return np.zeros((np.abs(begin)),dtype=float)
    elif begin<0: # We have some audio, pad with zeros
        return np.concatenate([np.zeros((np.abs(begin)),dtype=float),audio[0:end]])
    else:         # Usually this happens, return the next sample
        return audio[begin:end]

    
def find_max_frequency():
    mx = 0
    for frame_number in range(FRAME_COUNT):
        sample = extract_sample(audio, frame_number)
        fft = np.fft.rfft(sample * window)
        fft = np.abs(fft).real 
        mx = max(np.max(fft),mx)

    return mx

def find_top_notes(fft,num):
    if np.max(fft.real)<0.001:
        return []

    lst = [x for x in enumerate(fft.real)]
    lst = sorted(lst, key=lambda x: x[1],reverse=True)

    idx = 0
    found = []
    found_note = set()
    while( (idx<len(lst)) and (len(found)<num) ):
        f = xf[lst[idx][0]]
        y = lst[idx][1]
        n = freq_to_midi(f)
        n0 = int(round(n))
        name = note_name(n0)

        if name not in found_note:
            found_note.add(name)
            s = [f,note_name(n0),y]
            found.append(s)
        idx += 1

    return found


MAX_FREQ = find_max_frequency()
fs = sampling_rate
xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, 1/fs)


# Figure out how to get rid of this at some point
# Output size. Generally use SCALE for higher res, unless you need a non-standard aspect ratio.
RESOLUTION = (1920, 1080)
RESOLUTION = (1920/2, 1080/2)



In [46]:
def extract_all_notes_all_windows():
    top_notes_each_window = np.array([])

    for frame_number in range(FRAME_COUNT):
        sample = extract_sample(audio, frame_number)
        fft = np.fft.rfft(sample * window)
        fft = np.abs(fft) / MAX_FREQ 
        s = find_top_notes(fft,TOP_N_NOTES)
        notes = [_[1] for _ in s]
        top_notes_each_window = np.append(top_notes_each_window, notes)
    return top_notes_each_window

In [51]:
complete_note_seq = extract_all_notes_all_windows()

In [60]:
complete_note_seq[:20]

array(['B6', 'C7', 'C2', 'B6', 'C7', 'C#7', 'C7', 'B6', 'C#7', 'C7', 'B6',
       'G6', 'C7', 'C5', 'C8', 'C7', 'C8', 'C5', 'C7', 'C8'], dtype='<U32')

In [57]:
unique, counts = np.unique(complete_note_seq, return_counts=True)

dict(zip(unique, counts))


{'A#2': 79,
 'A#3': 159,
 'A#4': 242,
 'A#5': 134,
 'A#6': 44,
 'A0': 6,
 'A1': 16,
 'A2': 296,
 'A3': 298,
 'A4': 309,
 'A5': 66,
 'A6': 25,
 'A7': 8,
 'B1': 165,
 'B2': 431,
 'B3': 492,
 'B4': 738,
 'B5': 146,
 'B6': 126,
 'B7': 4,
 'C#2': 462,
 'C#3': 117,
 'C#4': 236,
 'C#5': 240,
 'C#6': 47,
 'C#7': 30,
 'C1': 7,
 'C2': 516,
 'C3': 189,
 'C4': 414,
 'C5': 340,
 'C6': 88,
 'C7': 59,
 'C8': 16,
 'D#1': 5,
 'D#2': 946,
 'D#3': 145,
 'D#4': 114,
 'D#5': 275,
 'D#6': 99,
 'D#7': 46,
 'D1': 8,
 'D2': 379,
 'D3': 334,
 'D4': 149,
 'D5': 273,
 'D6': 74,
 'D7': 44,
 'E2': 828,
 'E3': 397,
 'E4': 338,
 'E5': 410,
 'E6': 55,
 'E7': 22,
 'E8': 4,
 'F#2': 228,
 'F#3': 1580,
 'F#4': 247,
 'F#5': 161,
 'F#6': 13,
 'F#7': 34,
 'F2': 199,
 'F3': 93,
 'F4': 51,
 'F5': 171,
 'F6': 30,
 'F7': 4,
 'G#1': 4,
 'G#2': 35,
 'G#3': 85,
 'G#4': 111,
 'G#5': 86,
 'G#6': 28,
 'G#7': 15,
 'G0': 4,
 'G1': 5,
 'G2': 259,
 'G3': 919,
 'G4': 327,
 'G5': 133,
 'G6': 15,
 'G7': 14}

In [52]:
len(complete_note_seq)

16341

In [53]:
complete_note_seq.count()

AttributeError: 'numpy.ndarray' object has no attribute 'count'

In [17]:
data.T[1].max()

29340

In [18]:
data.T[0].max()

28180

In [8]:
import plotly.graph_objects as go

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
    layout = go.Layout(
      title="frequency spectrum",
      autosize=False,
      width=dimensions[0],
      height=dimensions[1],
      xaxis_title="Frequency (note)",
      yaxis_title="Magnitude",
      font={'size' : 24}
    )

    fig = go.Figure(layout=layout,
                  layout_xaxis_range=[FREQ_MIN,FREQ_MAX],
                  layout_yaxis_range=[0,1]
                  )

    fig.add_trace(go.Scatter(
      x = xf,
      y = p))

    for note in notes:
        fig.add_annotation(x=note[0]+10, y=note[2],
                text=note[1],
                font = {'size' : 48},
                showarrow=False)
    return fig

In [26]:
len(sample)

11025

11025.0

In [30]:

#fig = plot_fft(fft.real,xf,fs,s,RESOLUTION)
#fig.show()

[[2100.0, 'C7', 0.1769333594108977],
 [4204.0, 'C8', 0.05079035360991278],
 [524.0, 'C5', 0.04208247100077738]]

In [None]:
!pip install --upgrade nbformat

In [None]:
sample = extract_sample(audio, frame_number)
fft = np.fft.rfft(sample * window)
fft = np.abs(fft).real 
mx = max(np.max(fft),mx)

In [None]:
mx

In [None]:
# Plot Sample
plt.plot(sample)

In [None]:
# Plot Sample * Window Function
plt.plot(sample*window)

In [None]:

# Pass 1, find out the maximum amplitude so we can scale.

In [None]:
FRAME_COUNT

In [None]:
window

In [None]:
FTT_WINDOW_SEC = 0.25 # In Seconds
TOP_N_NOTES = 3 # Notes to display 

# Names of the notes


SCALE = 2 # 0.5=QHD(960x540), 1=HD(1920x1080), 2=4K(3840x2160)

IMG_CONTENT_DIR = os.path.join(os.getcwd(), "content")
print(os.path.exists(AUDIO_FILE))

In [None]:
fs, data = wavfile.read(AUDIO_FILE) # load the data
FRAME_STEP = (fs / FRAMES_PER_SEC) # audio samples per video frame

#FRAME_COUNT = int(AUDIO_LENGTH*FRAMES_PER_SEC)

In [None]:
AUDIO_LENGTH

In [None]:

# Pass 2, produce the animation
for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
    sample = extract_sample(audio, frame_number)

    fft = np.fft.rfft(sample * window)
    fft = np.abs(fft) / mx 

    s = find_top_notes(fft,TOP_N_NOTES)

    fig = plot_fft(fft.real,xf,fs,s,RESOLUTION)
    img_base_path = f"frame{frame_number}.png"
    img_path = os.path.join(IMG_CONTENT_DIR, img_base_path)
    fig.write_image(img_path,scale=2)



In [None]:
!ffmpeg -y -r {FRAMES_PER_SEC} -f image2 -s 1920x1080 -i frame%d.png -i {AUDIO_FILE} -c:v libx264 -pix_fmt yuv420p movie.mp4
