# Timing ```04_record_loops``` using ```timeit``` module
First, let's initialize all the necessary variables, since that is not something I need to time:

In [5]:
from typing import Any
import numpy.typing as npt

import sounddevice as sd
import numpy as np
import soundfile as sf
import threading
import random
import timeit


# --- stream configuration ----------------------------------------------------
SAMPLERATE = 44_100  # [samples per second]
BLOCKSIZE = 1_000  # [samples]
DTYPE = np.int16  # numpy sample type of every audio buffer
STR_DTYPE = "int16"  # same sample type as a string, for sf.read / sd.Stream
CHANNELS = 2
LATENCY = 0  # NOTE(review): not referenced by the code shown in this notebook

METRONOME_SAMPLE_PATH = "lib/samples/metronome.wav"

# --- mutable state shared by the functions below -----------------------------
stream_active = True  # main() keeps the stream open while this is true
recording = False  # callback() appends its input to recorded_track while true

current_frame = 0  # running total of frames processed by callback()
len_beat: int  # number of samples per beat

# every track that is mixed into the output, and the take being recorded
tracks: list[npt.NDArray[DTYPE]] = []
recorded_track: npt.NDArray[DTYPE] = np.zeros((0, CHANNELS), dtype=DTYPE)

It is questionable whether some of these could be optimised. E.g. using a numpy array instead of a list for ```tracks``` could be faster. However, different tracks need to differ in length, and I do not know whether numpy allows this (a regular numpy array has to be rectangular, so it would presumably require padding).

Next, let's analyze the functions in the program. There are three groups of functions:
- ```metronome_generator``` and ```initialize_metronome```, which execute before the body of the program,
- ```input_checker``` and ```post_production```, which are executed in the input thread,
-  ```main``` and ```callback```, which are executed in the thread which is processing audio.

As the first group does not need to be timed, let's execute those functions now so that the others can use the metronome sample.

In [6]:
def metronome_generator(bpm: int, path: str) -> npt.NDArray[DTYPE]:
    """Build a one-beat metronome loop from an audio file.

    The sample read from *path* is zero-padded or truncated to exactly one
    beat at *bpm* and its volume is reduced to a quarter.  As a side effect
    the beat length is stored in the global ``len_beat``.

    Args:
        bpm: Tempo in beats per minute; must be positive.
        path: Path of the audio file holding the metronome click.

    Returns:
        Array of shape ``(len_beat, channels_in_file)`` with dtype ``DTYPE``.

    Raises:
        ValueError: If *bpm* is not positive.
    """
    global len_beat

    if bpm <= 0:
        # a zero tempo would divide by zero, a negative one would yield a
        # negative beat length and a silently mangled sample
        raise ValueError(f"bpm must be positive, got {bpm}")

    sample: npt.NDArray[DTYPE]
    # always_2d keeps mono files as (frames, 1) instead of a 1-D array, so the
    # padding below always has a matching number of dimensions
    sample, fs = sf.read(file=path, dtype=STR_DTYPE, always_2d=True)

    # rounding desired_len introduces a distortion of bpm
    # NOTE(review): the length is computed at the file's own rate `fs`;
    # presumably this equals SAMPLERATE -- confirm for the shipped sample.
    desired_len = int((60*fs)/bpm)
    len_beat = desired_len

    missing = desired_len - len(sample)
    if missing >= 0:
        # pad with silence, using the channel count the file actually has
        silence = np.zeros(shape=(missing, sample.shape[1]), dtype=DTYPE)
        sample = np.concatenate((sample, silence))
    else:
        sample = sample[:desired_len]

    # adjust volume
    sample = (sample/4).astype(dtype=DTYPE)

    return sample


def initialize_metronome() -> None:
    """Ask for a tempo on stdin and register the metronome as a track.

    The generated one-beat sample is appended to the module-level ``tracks``
    list; a non-integer answer makes ``int()`` raise ``ValueError``.
    """
    tempo = int(input("bpm: "))
    tracks.append(metronome_generator(bpm=tempo, path=METRONOME_SAMPLE_PATH))

The other two groups run in separate threads, but since Python (CPython's GIL) does not allow true parallelism they compete for the same CPU time, so it is necessary to optimise both of them. Let's start with the audio processing, as I suppose it may require more CPU power and I also have some ideas for optimisation.

In [7]:
def main() -> None:
    """Process the audio: keep a duplex stream open while ``stream_active``.

    Opens an ``sd.Stream`` configured from the module constants and lets
    ``callback`` mix the live input with every entry of ``tracks``.  The
    function returns (after printing a farewell) once ``stream_active``
    becomes false or the stream raises.
    """

    def callback(indata: npt.NDArray[DTYPE], outdata: npt.NDArray[DTYPE],
                 frames: int, time: Any, status: sd.CallbackFlags) -> None:
        """Mix one block of input with all tracks and write it to *outdata*.

        Reads the globals ``recording`` and ``tracks``; updates
        ``recorded_track`` (while recording) and ``current_frame``.
        """
        global current_frame
        global recorded_track

        if status:
            print(status)

        if recording:
            # NOTE: this re-allocates the whole take on every block; kept
            # because other code may rely on recorded_track being an array.
            recorded_track = np.concatenate([recorded_track, indata])

        # mixer & slicer
        # the input and every track get an equal share of the output so the
        # sum cannot exceed the sample range
        divisor = len(tracks) + 1
        data = (indata/divisor).astype(dtype=DTYPE)

        offsets = np.arange(frames)
        for track in tracks:
            track_len = len(track)
            if track_len == 0:
                # an empty track has nothing to play and would make the
                # modulo below raise ZeroDivisionError
                continue
            # slice: index modulo the track length, so the block always has
            # exactly `frames` rows -- also when it wraps around the end of
            # the track, or when the track is no longer than one block
            positions = (current_frame % track_len + offsets) % track_len
            # mix
            data += (track[positions]/divisor).astype(dtype=DTYPE)

        outdata[:] = data
        current_frame += frames

    try:
        with sd.Stream(samplerate=SAMPLERATE, blocksize=BLOCKSIZE, dtype=STR_DTYPE,
                       channels=CHANNELS, callback=callback):
            while stream_active:
                # sleep instead of spinning: a `pass` loop keeps one core busy
                # and competes with the callback for the interpreter lock
                sd.sleep(100)
    finally:
        print("Good bye!")

Ideas:
- is using global variables slow?
- the for loop is certainly inefficient -> use numpy.mean() for the mixing
    - how to improve the cutting?
- is the handling of large integers in ```current_frame``` okay?
- is the while loop, which is keeping the stream active, draining CPU power? If yes, how to keep the stream alive without this drawback?

First of all, mixing and cutting. Or just mixing, as it is a smaller piece of code. Let's initialize some track data so we can test different approaches:

## Timing mixing

In [8]:
# shape of the synthetic test data
LEN_TRACK = 100  # frames per generated track
NUM_TRACKS = 10  # number of generated tracks
# timeit parameters: executions per measurement, and number of measurements
NUMBER = 1
REPEAT = 10_000

def generate_random_track(length: int, channels: int = 2) -> npt.NDArray[DTYPE]:
    """Return a ``(length, channels)`` array of random samples.

    Samples are drawn with ``random.randint`` over the full value range of
    ``DTYPE`` and stored with that dtype, so the test data has the same
    element type as the real audio buffers.

    Args:
        length: Number of frames (rows).
        channels: Number of channels (columns).
    """
    limits = np.iinfo(DTYPE)
    lowest, highest = int(limits.min), int(limits.max)
    rows = [[random.randint(lowest, highest) for _ in range(channels)]
            for _ in range(length)]
    # reshape keeps the result two-dimensional even when length is 0
    return np.array(rows, dtype=DTYPE).reshape(length, channels)

# NUM_TRACKS random tracks stacked into one (NUM_TRACKS, LEN_TRACK, 2) array;
# stands in for the already-sliced tracks of one callback block
cut_tracks = np.array([generate_random_track(LEN_TRACK) for _ in range(NUM_TRACKS)])

# Variant 1: the per-track loop used in callback().
# The accumulator is a copy: `cut_tracks[0]` alone is a view, and `+=` on it
# would overwrite the shared test data on every one of the REPEAT runs.
# Only the run time is compared; the result is not numerically the mean.
for_loop = """
data = cut_tracks[0].copy()
for track_slice in cut_tracks:
    track_slice = (track_slice/(NUM_TRACKS+1)).astype(dtype=DTYPE)
    data += track_slice"""

# Variant 2: one vectorised reduction over the track axis.
np_mean = """
data = np.mean(a=cut_tracks, axis=0)
"""

Now we can compare those two:

In [9]:
# best of REPEAT measurements for each variant (for_loop first, then np_mean)
for_loop_time, mean_time = (
    min(timeit.repeat(stmt=snippet, number=NUMBER, repeat=REPEAT, globals=globals()))
    for snippet in (for_loop, np_mean)
)

# positive difference: np.mean won; negative: the for loop won
difference = for_loop_time - mean_time
if difference >= 0:
    print(f"np.mean was {difference} second faster")
else:
    print(f"for loop was {difference*-1} second faster")

np.mean was 0.0001272629997401964 second faster


In January I realised that I store the recorded tracks in a `list` of `np.array`s. I am wondering whether it would be faster to store them in one big `np.array`, so let's time it!

In [10]:
# the same kind of test data in both containers under comparison
np_matrix = np.array([generate_random_track(length=LEN_TRACK) for _ in range(NUM_TRACKS)])
big_list = [generate_random_track(length=LEN_TRACK) for _ in range(NUM_TRACKS)]

# In both loop variants the accumulator is a copy of the first track: without
# the copy, `data` would alias the first track and `+=` would overwrite the
# shared test data on every one of the REPEAT runs.

# averaging a list with for loop
for_loop_list = """
data = big_list[0].copy()
for track_slice in big_list:
    track_slice = (track_slice/(NUM_TRACKS+1)).astype(dtype=DTYPE)
    data += track_slice
"""

# averaging an array with for loop
for_loop_matrix = """
data = np_matrix[0].copy()
for track_slice in np_matrix:
    track_slice = (track_slice/(NUM_TRACKS+1)).astype(dtype=DTYPE)
    data += track_slice
"""

# averaging a list with np.mean
np_mean_list = """
data = np.mean(a=big_list, axis=0)
"""

# averaging an array with np.mean
np_mean_matrix = """
data = np.mean(a=np_matrix, axis=0)
"""

In [11]:
def _best_time(stmt: str) -> float:
    """Return the fastest of REPEAT measurements of *stmt* (NUMBER runs each)."""
    return min(timeit.repeat(stmt=stmt, number=NUMBER, repeat=REPEAT, globals=globals()))


# measured in the same order as before: list loop, array loop, list mean, array mean
for_loop_list_time = _best_time(for_loop_list)
for_loop_matrix_time = _best_time(for_loop_matrix)
np_mean_list_time = _best_time(np_mean_list)
np_mean_matrix_time = _best_time(np_mean_matrix)

print(f"time of averaging a list with for loop:      {for_loop_list_time} s")
print(f"time of averaging an array with for loop:    {for_loop_matrix_time} s")
print(f"time of averaging a list with np.mean:       {np_mean_list_time} s")
print(f"time of averaging an array with np.mean:     {np_mean_matrix_time} s")

time of averaging a list with for loop:      0.00015654699996048294 s
time of averaging an array with for loop:    0.00016584100012551062 s
time of averaging a list with np.mean:       5.441300004349614e-05 s
time of averaging an array with np.mean:     3.953800000999763e-05 s


In [12]:
# speed-up factors relative to the list + for-loop baseline
matrix_speedup = for_loop_list_time/np_mean_matrix_time
list_speedup = for_loop_list_time/np_mean_list_time

print(f"""averaging an array with np.mean is: 
      {matrix_speedup} times faster than averaging a list with for loop (my current practice)""")
print(f"""averaging a list with np.mean is: 
      {list_speedup} times faster than averaging a list with for loop (my current practice)""")

averaging an array with np.mean is: 
      3.959406138927064 times faster than averaging a list with for loop (my current practice)
averaging a list with np.mean is: 
      2.8770146809649146 times faster than averaging a list with for loop (my current practice)


It seems that this part of `callback` can be sped up nearly four times! Now it feels almost obligatory to time the other parts as well.

## Timing frame addition
I am interested in seeing the difference between adding small and large integers. But first it is time for some general calculations: If I want at most 10 ms latency (as a quick search suggests), then my callback has this much time to execute. Sounddevice docs say *It is reasonable to expect to be able to utilise 70% or more of the available CPU time in the PortAudio callback.* So my callback has **7 ms** to execute, and the blocksize for 10 ms latency is 44100/100 = 441 samples per call.

The callback will be called with a frequency of 100 Hz. Supposing a very long recording session lasts one hour, the callback will be called 100*3600 = 360 000 times.

In [13]:
from matplotlib import pyplot as plt

In [58]:
# setup for the "small integer" case: the counter starts at zero
low_setup = "frames = 10\ncurr_fr = 0"
# timed statement: 100 additions per execution
adding = "for i in range(100): curr_fr += frames"
low_times = timeit.repeat(stmt=adding, setup=low_setup, repeat=10000, number=100)
# mean time of one measurement (= 100 executions of `adding`)
low_mean = sum(low_times)/len(low_times)
print(f"low: {low_mean}")

low: 0.0007239095003981219


In [59]:
# setup for the "large integer" case: the counter starts at 10**20
high_setup = "frames = 10\ncurr_fr = 10**20"
high_times = timeit.repeat(stmt=adding, setup=high_setup, repeat=10000, number=100)
# mean time of one measurement (= 100 executions of `adding`)
high_mean = sum(high_times)/len(high_times)
print(f"high: {high_mean}")

high: 0.0008153054061997864


Even after 10**20 frames (roughly 70 million years of audio at 44.1 kHz), this piece of code would only use 8.153054061997864e-08 s per callback (0.000815 s for 100 × 100 additions), which is approximately 0.001 % of the 7 ms callback time. No need to worry here. **But** I have forgotten that this is not the only place where frames are used! Let's test this code snippet:
```py
start = _current_frame % len(track)
end = (_current_frame+frames) % len(track)
``` 
Self is not relevant and len(track) is constant (I do not suppose len would return very differently for very short and very long lists, might test that later), so it can be reduced to:
```py
start = current_frame % n
end = (current_frame+frames) % n
```


In [60]:
# setup with a small frame counter; n is the track length (5 s at 44.1 kHz)
low_setup = (
    "\n"
    "n = 44100*5\n"
    "frames = 100\n"
    "current_frame = 0\n"
)
# identical setup, but with a very large frame counter
high_setup = (
    "\n"
    "n = 44100*5\n"
    "frames = 100\n"
    "current_frame = 10**20\n"
)
# timed statement: the two modulo operations from the slicing code
snip = (
    "start = current_frame % n\n"
    "end = (current_frame+frames) % n"
)

In [63]:
# timeit defaults: each measurement is the total for 1 000 000 executions
low_times, high_times = (
    timeit.repeat(stmt=snip, setup=setup) for setup in (low_setup, high_setup)
)
low_mean = sum(low_times)/len(low_times)
high_mean = sum(high_times)/len(high_times)
print(f"low: {low_mean}")
print(f"high: {high_mean}")

low: 0.1129516416001934
high: 0.3304551569997784


This seems to be significantly worse! But these numbers are totals for timeit's default 1 000 000 executions, so in the worst case a single execution takes about 3e-1 * 1e-6 = 3e-7 s; thus this is not a critical point either.

But let's do this thoroughly and time all the parts of the callback. In a new file.