In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from PIL import Image
import random

class AudioProcessor:
    def __init__(self, input_dir, output_dir, colormap_name='jet', image_size=(224, 224)):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.colormap_name = colormap_name
        self.image_size = image_size

    def ensure_output_path_exists(self, path):
        if not os.path.exists(path):
            os.makedirs(path)

    def process_audio_files(self):
        for root, dirs, files in os.walk(self.input_dir):
            for dir_name in dirs:
                subdir_path = os.path.join(root, dir_name)
                output_subdir_path = os.path.join(self.output_dir, dir_name)
                self.ensure_output_path_exists(output_subdir_path)

                for file_name in os.listdir(subdir_path):
                    if file_name.endswith('.mp3'):
                        file_path = os.path.join(subdir_path, file_name)
                        self.process_single_file(file_path, output_subdir_path)

    def process_single_file(self, file_path, output_subdir_path):
        audio, sampling_rate = librosa.load(file_path, sr=None)
        duration = len(audio) / sampling_rate

        num_segments = int(np.floor(duration))

        for segment_idx in range(num_segments):
            start_idx = segment_idx * sampling_rate
            end_idx = (segment_idx + 1) * sampling_rate
            segment_audio = audio[start_idx:end_idx]

            mel_spectrogram = librosa.feature.melspectrogram(y=segment_audio, sr=sampling_rate, n_mels=224, fmax=8000)
            log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

            # Normalize
            normalized_image = (log_mel_spectrogram - log_mel_spectrogram.min()) / (log_mel_spectrogram.max() - log_mel_spectrogram.min())

            colormap = plt.cm.get_cmap(self.colormap_name)
            colormap_image = colormap(normalized_image)[:, :, :3]

            colormap_image = (colormap_image * 255).astype(np.uint8)

            img = Image.fromarray(colormap_image)
            img = img.resize(self.image_size)
            img.save(os.path.join(output_subdir_path, f'spectrum_color_2_{segment_idx + 1}.png'))

        print(f'Processed and saved images for {file_path} in {output_subdir_path}')

# Usage
if __name__ == "__main__":
    input_dir = "./database"
    output_dir = "./processed_data"
    processor = AudioProcessor(input_dir, output_dir)
    processor.process_audio_files()

    print(f'Fourier transform images with color saved in {output_dir}')


  colormap = plt.cm.get_cmap(self.colormap_name)


Processed and saved images for ./database\English\1.mp3 in ./processed_data\English
Processed and saved images for ./database\English\10.mp3 in ./processed_data\English


  normalized_image = (log_mel_spectrogram - log_mel_spectrogram.min()) / (log_mel_spectrogram.max() - log_mel_spectrogram.min())


Processed and saved images for ./database\English\11.mp3 in ./processed_data\English
Processed and saved images for ./database\English\2.mp3 in ./processed_data\English
Processed and saved images for ./database\English\3.mp3 in ./processed_data\English
Processed and saved images for ./database\English\4.mp3 in ./processed_data\English
Processed and saved images for ./database\English\5.mp3 in ./processed_data\English
Processed and saved images for ./database\English\6.mp3 in ./processed_data\English
Processed and saved images for ./database\English\7.mp3 in ./processed_data\English
Processed and saved images for ./database\English\8.mp3 in ./processed_data\English
Processed and saved images for ./database\English\9.mp3 in ./processed_data\English
Processed and saved images for ./database\Farsi\1.mp3 in ./processed_data\Farsi
Processed and saved images for ./database\Farsi\2.mp3 in ./processed_data\Farsi
Processed and saved images for ./database\Farsi\3.mp3 in ./processed_data\Farsi
Pro