# Edit Podcast

## Goal

Automate podcast editing as much as possible.

https://ironbar.github.io/tertulia_inteligencia_artificial/como-se-hace/edicion/

## References

- https://librosa.org/doc/main/generated/librosa.feature.rms.html
- https://librosa.org/doc/main/generated/librosa.resample.html

## Imports

In [None]:
import os
import librosa
import soundfile as sf
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib as mpl
import subprocess
import glob
from skimage.measure import block_reduce

# NOTE(review): the empty plot + close looks like a warm-up so that the
# rcParams below take effect in the notebook — confirm it is still needed.
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thin lines, larger font.
plt.rcParams["figure.figsize"] = (30, 5)
mpl.rcParams['lines.linewidth'] = 1
mpl.rcParams['font.size'] = 16

In [None]:
import logging

# Remove all handlers associated with the root logger.
# logging.basicConfig() does nothing when the root logger already has
# handlers, so they are cleared first to make the call below take effect.
logging.getLogger().handlers = []

# Configure logging again with the desired format and level
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

## Code

### Volume tracks

In [None]:
def adjust_tracks_volume(filepath, output_filepath=None):
    """Balance the tracks of a multi-track recording and save the merged audio.

    Args:
        filepath: path to the multi-track audio file.
        output_filepath: where to write the merged audio. When None, it is
            derived from `filepath` by replacing 'raw_audios' with
            'curated_audios' and removing '_alsa2'.
    """
    if output_filepath is None:
        curated_path = filepath.replace('raw_audios', 'curated_audios')
        output_filepath = curated_path.replace('_alsa2', '')

    gains, amplification = compute_gains_to_merge_audios(filepath)
    print(f'Gains: {gains}')
    audio, sr = merge_audios_with_gains(filepath, gains, amplification)

    print(f'Saving to: {output_filepath}')
    output_folder = os.path.dirname(output_filepath)
    os.makedirs(output_folder, exist_ok=True)
    sf.write(output_filepath, audio, sr)

In [None]:
# Analysis window and hop (in samples) passed to librosa.feature.rms
frame_length = 2048
hop_length = 512
# Sample rate used when loading audio for the gain analysis
target_sr = 8000

def compute_gains_to_merge_audios(filepath, target_sr=target_sr,
                                  db_goal=-12.5, moving_average_window=10):
    """Compute per-track gains and a per-frame amplification mask for a recording.

    The file is loaded with all its tracks at `target_sr`, the RMS of every
    track is computed (using the module-level `frame_length` and `hop_length`),
    and both the track gains and the amplification mask are derived from it.

    NOTE(review): the previous docstring said "db_goal=-25,
    moving_average_window=10 works well", but the default db_goal is -12.5 —
    confirm which value is the recommended one.

    Args:
        filepath: path to the multi-track audio file.
        target_sr: sample rate used to load the audio for the analysis.
        db_goal: target level in dB for the merged audio.
        moving_average_window: window (in RMS frames) used to smooth the
            merged level when building the amplification mask.

    Returns:
        Tuple `(gains, amplification)`.
    """
    audios, _ = librosa.load(filepath, sr=target_sr, mono=False)
    print(f'Loaded audio with {len(audios)} tracks and duration of {len(audios[0]) / target_sr / 60:.1f} minutes')

    rms_values = []
    for track in tqdm(audios, desc='computing rms values'):
        track_rms = librosa.feature.rms(y=track, frame_length=frame_length, hop_length=hop_length)
        rms_values.append(track_rms[0])

    gains = optimize_track_gains(rms_values, db_goal)
    amplification = get_amplification_mask_to_lower_high_volume_instants(
        rms_values,
        gains,
        db_goal=db_goal,
        moving_average_window=moving_average_window,
    )
    return gains, amplification

In [None]:
def optimize_track_gains(rms_values, db_goal=-20, n_runs=10, factors=(0.8, 1.25, 1.5, 2)):
    """Greedily search for one gain per track so the merged level sits near `db_goal`.

    Starting with a gain of 1 for every track, each run tries multiplying the
    gain of each track by each factor and keeps the change only when it
    strictly improves the fitness returned by `measure_fitness`.
    Histograms are plotted before and after the search, plus the merged
    energy over time at the end.

    Args:
        rms_values: sequence with one RMS array per track.
        db_goal: target level in dB for the merged audio.
        n_runs: number of passes over all (factor, track) combinations.
        factors: multiplicative steps tried on each track gain. It is only
            iterated, so any iterable works (an immutable tuple is the default
            to avoid a shared mutable default argument).

    Returns:
        List with the selected gain for every track.
    """
    gains = [1 for _ in rms_values]
    visualize_tracks_with_gains(rms_values, gains)
    best_fitness = measure_fitness(merge_rms(rms_values, gains), db_goal)
    print(f'Initial fitness: {best_fitness:.2f}')

    for _ in tqdm(range(n_runs), desc='Optimizing track gain'):
        for factor in factors:
            for track_idx in range(len(rms_values)):
                # Try the change on a copy so a rejected candidate leaves
                # the current best gains untouched.
                new_gains = gains.copy()
                new_gains[track_idx] *= factor
                fitness = measure_fitness(merge_rms(rms_values, new_gains), db_goal)
                if fitness > best_fitness:
                    gains = new_gains
                    best_fitness = fitness
    print(f'Final fitness: {best_fitness:.2f}')
    visualize_tracks_with_gains(rms_values, gains)
    visualize_merged_energy(rms_values, gains)
    return gains

def merge_rms(rms_values, gains):
    """Return the RMS of the merged tracks: sqrt of the sum of squared (rms * gain)."""
    energy = np.square(rms_values[0] * gains[0])
    for track_rms, track_gain in zip(rms_values[1:], gains[1:]):
        energy += np.square(track_rms * track_gain)
    return np.sqrt(energy)

def measure_fitness(rms, db_goal, goal_width=2):
    """Return the fraction of frames whose level in dB lies strictly within
    `goal_width` dB of `db_goal`."""
    rms_db = librosa.amplitude_to_db(rms)
    above_floor = rms_db > db_goal - goal_width
    below_ceiling = rms_db < db_goal + goal_width
    return np.mean(above_floor & below_ceiling)


def visualize_tracks_with_gains(rms_values, gains):
    """Plot the dB histogram of every gained track and of their merge."""
    bins = np.linspace(-60, 0, 500)
    for idx, rms in enumerate(rms_values):
        track_db = librosa.amplitude_to_db(rms*gains[idx])
        plt.hist(track_db, bins=bins, alpha=0.8, histtype='step', density=True,
                 label=f'track{idx+1} (gain={gains[idx]:.2f})')
    merged_db = librosa.amplitude_to_db(merge_rms(rms_values, gains))
    plt.hist(merged_db, bins=bins, alpha=0.2, label='merge', density=True)
    plt.grid()
    plt.legend(loc=0)
    plt.show()

def visualize_merged_energy(rms_values, gains):
    """Plot the merged level in dB over time, raw and smoothed over 30 frames.

    The time axis is built from the module-level `target_sr`, `hop_length`
    and `frame_length`.
    """
    merged_db = librosa.amplitude_to_db(merge_rms(rms_values, gains))
    seconds = librosa.frames_to_time(range(len(merged_db)), sr=target_sr,
                                     hop_length=hop_length, n_fft=frame_length)
    minutes = seconds/60
    plt.plot(minutes, merged_db)
    plt.plot(minutes, moving_average(merged_db, 30))
    plt.title('RMS Energy')
    plt.xlabel('Time (minutes)')
    plt.ylabel('Energy (dB)')
    plt.grid(axis='y')
    plt.show()

def moving_average(data, window_size):
    """Return the moving average of `data`, with the same length as `data`.

    The average is computed only where the window fits entirely, and the
    result is then extended on both sides by repeating its edge values.
    """
    kernel = np.ones(window_size) / window_size
    smoothed = np.convolve(data, kernel, mode='valid')
    missing = len(data) - len(smoothed)
    pad_left = window_size//2
    pad_right = missing - pad_left
    return np.pad(smoothed, (pad_left, pad_right), mode='edge')

In [None]:
def merge_audios_with_gains(filepaths, gains):
    """Sum several audio files into one, scaling each file by its gain.

    Every file is loaded at its native sample rate. The first file sets the
    length and sample rate of the result; later files are added on top of it,
    truncated to the shorter of the two lengths.

    NOTE(review): this name is defined again further down in this file with
    the signature `(filepath, gains, amplification)`; once both cells have
    run, that later definition replaces this one.

    Returns:
        Tuple `(audio, sr)`, or `(None, None)` when no file was processed.
    """
    mixed, sr = None, None
    progress = tqdm(zip(filepaths, gains), total=len(filepaths), desc='merging audios')
    for filepath, gain in progress:
        samples, file_sr = librosa.load(filepath, sr=None)
        if mixed is None:
            mixed, sr = samples*gain, file_sr
            continue
        mixed[:len(samples)] += samples[:len(mixed)]*gain
    return mixed, sr

In [None]:
def get_amplification_mask_to_lower_high_volume_instants(rms_values, gains, db_goal, moving_average_window=30):
    """Return a per-frame multiplier that attenuates frames louder than `db_goal`.

    The merged level in dB is smoothed with a moving average. Wherever the
    smoothed level exceeds `db_goal`, the excess is converted to a linear
    factor below 1; everywhere else the factor is exactly 1.
    """
    merged_db = librosa.amplitude_to_db(merge_rms(rms_values, gains))
    smoothed_db = moving_average(merged_db, moving_average_window)
    excess_db = np.clip(smoothed_db - db_goal, 0, None)
    return 10**(-excess_db/20)

In [None]:
def merge_audios_with_gains(filepath, gains, amplification):
    """Mix the tracks of a multi-track file into one, applying gains and a volume mask.

    Args:
        filepath: path to the multi-track audio file; it is loaded at its
            native sample rate with all its tracks.
        gains: one gain per track.
        amplification: sequence of multipliers, coarser than the audio. Each
            value is repeated so the mask covers the whole audio.

    Returns:
        Tuple `(audio, sr)` with the merged audio and its sample rate.
    """
    audios, sr = librosa.load(filepath, sr=None, mono=False)
    # Weighted average rescaled by sum(gains)/len(gains), which equals
    # sum(gain_i * track_i) / number_of_tracks.
    audio = np.average(audios, axis=0, weights=gains)*np.sum(gains)/len(gains)
    # np.ceil returns a float, but np.repeat needs an integer repeat count.
    samples_per_value = int(np.ceil(len(audio)/len(amplification)))
    audio *= np.repeat(amplification, samples_per_value)[:len(audio)]
    audio = remove_audio_saturation(audio)
    return audio, sr

In [None]:
def remove_audio_saturation(audio, threshold=0.9, pool_size=4800):
    """Attenuate the blocks of `audio` whose absolute value exceeds `threshold`.

    A divisor (>= 1) is computed per block of `pool_size` samples from the
    block's peak, then raised where a 10-block moving average of it is larger.
    The divisor is plotted, and `audio` is divided by it in place.

    Returns:
        The same `audio` array, modified in place.
    """
    # Greater than 1 where |sample| exceeds the threshold, exactly 1 elsewhere
    excess = np.clip(np.abs(audio)/threshold, 1, None)
    block_peaks = block_reduce(excess, block_size=pool_size, func=np.max)
    divisor = np.maximum(block_peaks, moving_average(block_peaks, 10))
    plt.plot(divisor)
    # Back to one value per sample
    divisor = np.repeat(divisor, pool_size)
    audio[:len(divisor)] /= divisor[:len(audio)]
    return audio

### Compose program

In [None]:
def compose_program(intro_filepath,
                    episode_filepath,
                    output_filepath,
                    intro_music_filepath='/mnt/data/other/data/TERTULia/sound_library/intro_music_v5_auto.mp3',
                    outro_music_filepath='/mnt/data/other/data/TERTULia/sound_library/outro_v5_auto.mp3',
                    background_music_filepath='/mnt/data/other/data/TERTULia/sound_library/The lofi room_113.mp3',
                    intro_music_start_duration=55,
                    intro_music_high_duration=15,
                    outro_music_high_duration=15,
                    background_music_gain=0.5,
                    intro_music_gain=0.5,
                    outro_music_gain=0.5,
                    sr=48000):
    """Compose the final program: spoken intro, intro music, episode, outro and background music.

    Args:
        intro_filepath: audio file with the spoken intro.
        episode_filepath: audio file with the edited episode.
        output_filepath: where the composed program is written.
        intro_music_filepath: intro music file.
        outro_music_filepath: outro music file.
        background_music_filepath: music mixed under the episode, between the
            end of the intro music and the start of the outro music.
        intro_music_start_duration: position (seconds) in the intro music at
            which the spoken intro must end; the beginning of the music is
            cropped to achieve that.
        intro_music_high_duration: seconds between the end of the spoken intro
            and the start of the episode.
        outro_music_high_duration: seconds of silence appended after the
            episode, over which the outro music keeps playing.
        background_music_gain: gain applied to the background music.
        intro_music_gain: gain applied to the intro music.
        outro_music_gain: gain applied to the outro music.
        sr: sample rate used to load every file and to write the output.

    Raises:
        ValueError: if the spoken intro (plus its one second of leading
            silence) is not shorter than `intro_music_start_duration`.
    """
    logging.info('Preparing program intro')
    intro_audio = librosa.load(intro_filepath, sr=sr)[0]
    # One second of silence before the voice starts
    intro_audio = np.pad(intro_audio, (sr, 0), 'constant')
    intro_music = librosa.load(intro_music_filepath, sr=sr)[0]*intro_music_gain
    crop_intro = intro_music_start_duration - len(intro_audio)/sr
    # Explicit check instead of `assert`, which is skipped under `python -O`
    if crop_intro <= 0:
        raise ValueError(
            f'The intro lasts {len(intro_audio)/sr:.1f} seconds, it should be shorter than '
            f'intro_music_start_duration={intro_music_start_duration}')
    intro_music = intro_music[int(crop_intro*sr):]
    intro_music[:len(intro_audio)] += intro_audio
    logging.info('Adding program outro')
    outro_music = librosa.load(outro_music_filepath, sr=sr)[0]*outro_music_gain
    episode = librosa.load(episode_filepath, sr=sr)[0]
    leading_silence = int(len(intro_audio) + intro_music_high_duration*sr)
    trailing_silence = int(outro_music_high_duration*sr)
    episode = np.pad(episode, (leading_silence, trailing_silence), 'constant')
    episode[:len(intro_music)] += intro_music
    episode[-len(outro_music):] += outro_music
    logging.info('Adding background music')
    background_music = librosa.load(background_music_filepath, sr=sr)[0]
    body_length = len(episode) - len(intro_music) - len(outro_music)
    if 0 < len(background_music) < body_length:
        # The music is shorter than the section it has to cover: loop it
        # instead of failing with a shape mismatch in the addition below.
        background_music = np.resize(background_music, body_length)
    else:
        background_music = background_music[:body_length]
    episode[len(intro_music):-len(outro_music)] += background_music*background_music_gain
    logging.info(f'Saving program to {output_filepath}...')
    sf.write(output_filepath, episode, sr)
    logging.info('Program saved.')

### Create video

In [None]:
def create_video_with_ffmpeg(audio_filepath, image_filepath, video_filepath, temp_video_duration=200):
    """Create a video that shows a still image for the whole duration of an audio file.

    A short clip of the still image is encoded first, then looped and muxed
    with the audio (both streams copied) until the shorter stream ends.

    Args:
        audio_filepath: audio of the program.
        image_filepath: image shown during the whole video.
        video_filepath: output video file.
        temp_video_duration: duration in seconds of the temporary clip.

    Returns:
        `video_filepath`.

    Raises:
        FileNotFoundError: if the temporary clip was not created by ffmpeg.
    """
    # Alternative single-pass command, kept for reference:
    # create_video_command = f'ffmpeg -loop 1 -framerate 1 -i "{image_filepath}" -i "{audio_filepath}" -c:v libx264 -preset ultrafast -tune stillimage -c:a copy -pix_fmt yuv420p -shortest -threads 12 "{video_filepath}"'
    # execute_command(create_video_command)
    short_video_filepath = 'temp.mp4'
    if os.path.exists(short_video_filepath):
        os.remove(short_video_filepath)
    try:
        command = f'ffmpeg -loop 1 -framerate 1 -i "{image_filepath}" -c:v libx264 -preset ultrafast -tune stillimage -t {temp_video_duration} -pix_fmt yuv420p "{short_video_filepath}"'
        execute_command(command)
        if not os.path.exists(short_video_filepath):
            # Fail right away with a clear message instead of running the
            # second ffmpeg step on a missing input.
            raise FileNotFoundError(
                f'ffmpeg did not create the temporary video {short_video_filepath}')
        command = f'ffmpeg -stream_loop -1 -i "{short_video_filepath}" -i "{audio_filepath}" -c:v copy -c:a copy -shortest "{video_filepath}"'
        execute_command(command)
    finally:
        # Never leave the temporary clip behind, even on error or interruption
        if os.path.exists(short_video_filepath):
            os.remove(short_video_filepath)
    return video_filepath


def execute_command(command, verbose=True):
    """Run a shell command and return its exit code.

    Args:
        command: command line, executed through the shell.
        verbose: when True, the command and its combined stdout/stderr are
            printed line by line while it runs. When False, only the start
            and the end of the run are logged.

    Returns:
        The exit code of the command.
    """
    if verbose:
        print(f'Executing command: {command}')
        # The context manager closes the pipe and waits for the process
        with subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT, text=True) as process:
            # Loop to print the output in real-time
            for line in process.stdout:
                print(line, end='')
        return process.returncode

    logging.info(f'Running {command}')
    completed = subprocess.run(command, shell=True)
    logging.info('Finished running')
    return completed.returncode

### Concatenate audios

In [None]:
def concatenate_all_audios(folder):
    """Concatenate every `recording*.wav` in `folder`, in sorted filename order,
    into `concatenated.wav` inside the same folder."""
    pattern = os.path.join(folder, 'recording*.wav')
    recordings = sorted(glob.glob(pattern))
    concatenate_audios(recordings, os.path.join(folder, 'concatenated.wav'))

def concatenate_audios(filepaths, output_filepath):
    """Concatenate multi-track audio files in time and write the result.

    Every file is loaded at its native sample rate with all its tracks; all
    the files must share the same sample rate.

    Args:
        filepaths: audio files, in the order they should be concatenated.
        output_filepath: where the concatenated audio is written.
    """
    loaded = [librosa.load(filepath, sr=None, mono=False) for filepath in tqdm(filepaths, desc='loading audios')]
    sr = loaded[0][1]
    assert all(sr == other_sr for _, other_sr in loaded[1:])
    # Stack along the sample axis, keeping one row per track
    audios = np.hstack([samples for samples, _ in loaded])
    print(f'Writing concatenated audio to {output_filepath} with {len(audios)} tracks and {len(audios[0])/sr/60:.1f} minutes duration')
    # Swap to (samples, tracks) before writing
    sf.write(output_filepath, audios.transpose(1, 0), sr)

In [None]:
# A bare `raise` with no active exception fails with a RuntimeError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not
# execute the cells below — confirm.
raise

## Concatenate all audios

Sometimes it makes sense to concatenate all audios before adjusting the track volume.

In [None]:
# Joins every `recording*.wav` of this folder into `concatenated.wav` (same folder)
concatenate_all_audios('/mnt/data/other/data/TERTULia/episodios/temporada_3/episodio_14_mayo/raw_audios')

## Adjust tracks volume

There are many speakers in the podcast and they should have the same volume. When merging the tracks into a single one we would like to see a uniform volume level.

Sometimes, when a speaker speaks too little or the signal-to-noise ratio is bad, there can be problems. In those cases the best solution is to manually edit the file and silence the background parts.

In [None]:
# No output path is given, so the result is saved under `curated_audios` instead of `raw_audios`
adjust_tracks_volume('/mnt/data/other/data/TERTULia/episodios/temporada_3/episodio_14_mayo/raw_audios/concatenated.wav')

## Manual revision

- Go to `curated_audios` folder
- Rename the audio to something like `part1_v0.wav`
- Remove the parts of the episode that contain mistakes
- Reorder the episode if necessary (if we record the intro at the end)
- Truncate silence, -25 dB, 1 second, 1 second (adjust the noise threshold if necessary)
- Compressor, Threshold -20 dB, Noise Floor -60 dB, Ratio 3:1, attack time 0.2s, release time 1s
- Save as `part1_v1.wav`

## Record intro audio

1. Write intro for the program, and add it to the description. Giving ChatGPT the script of the program could be useful to gather ideas for the intro.
2. Record it using `record.sh`
3. Load it in Audacity, then apply `Normalize` and `Compressor`
4. Save it with the name `part1_intro.wav`

On a few programs I have tried using GPT4.5 to generate the intro text, using this prompt:

```
Resume este abstract de un artículo en 2-3 líneas para una introducción de un podcast. La introducción tiene que ser muy atractiva para que el oyente sepa lo que va a escuchar en el programa y se quede. La frase final tiene que tener mucha fuerza porque luego se pone la sintonía del programa.
```

## Compose the program

In [None]:
# Episode folder and part number used to build the file paths in the cells below
folder = '/mnt/data/other/data/TERTULia/episodios_tertulia/temporada_4/grabacion_01'
idx = 1

In [None]:
# Intro + episode -> composed program (`part{idx}_v2.mp3`)
# NOTE(review): the inputs here are .mp3, while the "Manual revision" and
# "Record intro audio" notes say to save .wav files — confirm the extension.
compose_program(intro_filepath=os.path.join(folder, 'curated_audios', f'part{idx}_intro.mp3'),
                episode_filepath=os.path.join(folder, 'curated_audios', f'part{idx}_v1.mp3'),
                output_filepath=os.path.join(folder, 'curated_audios', f'part{idx}_v2.mp3'))

## Miniatures

Create a folder called miniatures and save the miniatures there with these names: `part1_youtube.png` and `part1_ivoox.png`

https://docs.google.com/presentation/d/1vtZ28nXhAE0UOhX389GVYb2JZdlFav0FtU0BLGOgFow/edit#slide=id.g2e5393793fc_0_0

## Create video

We have to create a video using the audio of the program and the miniature.

In [None]:
# Video for YouTube: the composed program audio over the YouTube miniature
create_video_with_ffmpeg(
    audio_filepath=os.path.join(folder, 'curated_audios', f'part{idx}_v2.mp3'),
    image_filepath=os.path.join(folder, 'miniatures', f'part{idx}_youtube.png'),
    video_filepath=os.path.join(folder, 'curated_audios', f'part{idx}_v2.mp4'))

## Publish and gather subscriber stats

[Google sheet](https://docs.google.com/spreadsheets/d/1rT_tqf2MN8p5VNHsaxPrNQTT-IkxjzchQKbhbH84Esw/edit?gid=2012942565#gid=2012942565)

## Announce on Twitter

## TODO

- [x] Automatic search of gains to have a good merge audio
- [x] Try with audio from other episodes
- [x] Find a correspondence between audacity dBs and this notebook dBs. It is exactly the same. The difference is that audacity shows 2 values, the peak energy and mean energy. We are measuring mean energy.
- [x] Add a script to compose the program
- [x] Add a script to create the video for youtube
- [x] Add a function to decrease volume in some parts of the audio.
- [x] Add a function to avoid saturation of volume
- [x] Add lofi music to the background of the episode
- [ ] Simplify and automate paths
- [ ] Logging
- [ ] Reduce echo, -15dB seems to be more natural.