In [19]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import json
import os

import librosa
import soundfile as sf
import torchaudio
from pydub import AudioSegment
from scipy.io import wavfile
from sklearn.preprocessing import StandardScaler
from torchaudio.transforms import MelSpectrogram
from torchvision.transforms import Compose



In [15]:

# Load the transcription records; each record exposes its text under the
# "transcription" key.
with open("data/audio_path.json", "r", encoding="utf-8") as f:
    transcriptions = json.load(f)

labels = np.array([entry["transcription"] for entry in transcriptions])
data = labels

# Build the vocabulary in first-seen order, keeping the token -> id and
# id -> token mappings in step with the ordered token list.
vocabularies = []
word_to_index = {}
index_to_word = {}
for sentence in data:
    for token in sentence.split():
        if token in word_to_index:
            continue
        # All three containers have the same size, so the next free id is
        # simply the current vocabulary length.
        next_id = len(vocabularies)
        word_to_index[token] = next_id
        index_to_word[next_id] = token
        vocabularies.append(token)

# Encode every sentence as the list of its token ids.
encoded_data = [
    [word_to_index[token] for token in sentence.split()] for sentence in data
]


# Création du Dataset et DataLoader
class LanguageDataset(Dataset):
    """Dataset that serves stored items as tensors.

    Each stored item is converted with ``torch.tensor`` when it is requested.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` (an indexable, sized collection)."""
        self.data = data

    def __len__(self):
        """Return the number of stored items."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the item at ``index`` converted to a tensor."""
        item = self.data[index]
        return torch.tensor(item)


# Wrap the encoded sentences and iterate over them one at a time, in a
# shuffled order.
dataset = LanguageDataset(data=encoded_data)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

with open("data/labels.json", "w") as f:
        json.dump(vocabularies, f, ensure_ascii=False, indent=4)


In [18]:
with open("data/labels.json", "r") as f:
    vocab = json.load(f)

print(vocab)

['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'height', 'nine']


# Prédiction du label d'un nouvel enregistrement audio

In [22]:
def load_mean_length(file_src="data/mean_length.json", default_length=0.4374343333333333):
    """Load the stored mean audio length from a JSON file.

    Args:
        file_src: Path of the JSON file holding the mean length.
        default_length: Value returned when ``file_src`` does not exist.
            Defaults to the constant previously hard-coded in this function.

    Returns:
        The decoded JSON content of ``file_src``, or ``default_length`` when
        the file is missing (a warning is printed in that case).
    """
    if not os.path.exists(file_src):
        print(
            "Le fichier contenant la longueur moyenne n'existe pas. Cela peut affecter la pprecision du modele"
        )
        return default_length

    with open(file_src, "r", encoding="utf-8") as f:
        return json.load(f)


class Custom_preprocessing:
    """Run a fixed preprocessing chain on one audio file and save MFCC features.

    Constructing an instance immediately runs :meth:`process`, which writes
    intermediate WAV files under ``temp/`` and stores the flattened,
    standardized MFCC matrix at ``output_file`` with ``np.save``.

    Attributes:
        audio: Path of the input audio file.
        output_file: Destination passed to ``np.save``.
        target_length: Value returned by ``load_mean_length()``; used as the
            target duration for time stretching.
    """

    def __init__(self, audio_path, output_file):
        """Store the paths, load the target length and run the pipeline."""
        self.audio = audio_path
        self.output_file = output_file

        self.target_length = load_mean_length()
        self.process()

    # Silence removal
    def remove_silence(self, audio_path, output):
        """Write to ``output`` the non-silent parts of ``audio_path``, joined together."""
        # sr=None keeps the file's native sample rate.
        signal, sr = librosa.load(audio_path, sr=None)

        # Intervals considered non-silent with a 30 dB threshold.
        non_silent_intervals = librosa.effects.split(signal, top_db=30)

        # Concatenate the non-silent intervals into one signal.
        non_silent_signal = librosa.effects.remix(signal, non_silent_intervals)

        sf.write(output, non_silent_signal, sr)

    # Volume normalization
    def normalize_audio_volume(self, audio_path, output_path, target_dBFS=-20.0):
        """Apply a gain so the clip's dBFS reaches ``target_dBFS`` and export it as WAV."""
        audio = AudioSegment.from_file(audio_path)

        # Gain (in dB) needed to move from the current level to the target.
        current_dBFS = audio.dBFS
        normalization_factor = target_dBFS - current_dBFS

        normalized_audio = audio + normalization_factor

        normalized_audio.export(output_path, format="wav")

    # Noise filtering
    def filtrage_du_bruit(self, audio_path, output, noise_threshold=-40.0):
        """Adjust the clip level based on its dBFS and export it as WAV.

        NOTE(review): with pydub, ``audio - x`` appears to lower the gain by
        ``x`` dB, so subtracting these negative dBFS values would raise the
        level rather than remove noise. Behaviour is left unchanged because
        downstream features may depend on it. TODO confirm intent.
        """
        audio = AudioSegment.from_file(audio_path)

        # Overall level of the clip, used here as the background-noise estimate.
        background_noise = audio.dBFS

        if background_noise > noise_threshold:
            audio = audio - noise_threshold
        else:
            audio = audio - background_noise

        audio.export(output, format="wav")

    # Speech segmentation
    def segmentation_parole(self, audio_path, output_file, silence_threshold=-45):
        """Strip silence (below ``silence_threshold`` dBFS) and export as WAV."""
        audio = AudioSegment.from_file(audio_path, format="wav")

        non_silent_audio = audio.strip_silence(silence_thresh=silence_threshold)

        non_silent_audio.export(output_file, format="wav")

    # Artifact removal
    def remove_artifacts(self, audio_path, output_path):
        """Apply a 1000 Hz low-pass filter to the clip and export it as WAV."""
        audio = AudioSegment.from_file(audio_path)

        # A low-pass filter keeps content below the cutoff and attenuates
        # what lies above it (here: above 1000 Hz).
        audio_filtered = audio.low_pass_filter(1000)

        audio_filtered.export(output_path, format="wav")

    # Pre-emphasis
    def preaccentuation(self, audio_file, output):
        """Apply a first-order pre-emphasis filter and write 16-bit PCM.

        Computes ``y[0] = x[0]`` and ``y[n] = x[n] - alpha * x[n-1]`` with
        ``alpha = 0.95``. The result is clipped to the int16 range before
        the cast so that loud samples saturate instead of wrapping around.
        """
        sample_rate, audio_data = wavfile.read(audio_file)

        # Pre-emphasis factor (typically between 0.9 and 1).
        alpha = 0.95

        preemphasis_audio = np.append(
            audio_data[0], audio_data[1:] - alpha * audio_data[:-1]
        )

        # x[n] - alpha * x[n-1] can reach roughly 1.95x full scale; casting
        # such values straight to int16 would overflow.
        int16_limits = np.iinfo(np.int16)
        preemphasis_audio = np.clip(
            preemphasis_audio, int16_limits.min, int16_limits.max
        )

        wavfile.write(output, sample_rate, preemphasis_audio.astype(np.int16))

    # Temporal normalization
    def time_stretch_audio(self, input_file, output_file, target_duration):
        """Time-stretch the clip by ``current_duration / target_duration``."""
        audio, sr = librosa.load(input_file)

        # NOTE(review): the duration is computed with a fixed 44100 Hz rather
        # than the `sr` returned by librosa.load (which, without an explicit
        # sr=, presumably resamples to librosa's default rate). Using `sr`
        # here would change the length of the stretched audio and therefore
        # the feature size any already-trained model expects, so it is left
        # as is. TODO confirm and retrain before changing.
        current_duration = len(audio) / 44100

        speed_factor = current_duration / target_duration

        normalized_audio = librosa.effects.time_stretch(y=audio, rate=speed_factor)

        sf.write(output_file, normalized_audio, sr)

    # Conversion to a model-friendly format
    def convert_to_spectrogram(self, audio_path):
        """Return the flattened Mel spectrogram of the file and its sample rate.

        The waveform is resampled from the file's actual sample rate to
        16 kHz before the Mel spectrogram is computed.

        Returns:
            Tuple ``(spectrogram, sample_rate)`` where ``spectrogram`` is a
            1-D NumPy array and ``sample_rate`` is the rate reported by
            ``torchaudio.load`` for the original file.
        """
        waveform, sample_rate = torchaudio.load(audio_path)
        transform = Compose(
            [
                # Resample from the real source rate (previously assumed to
                # always be 44100 Hz) to 16 kHz.
                torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=16000
                ),
                MelSpectrogram(
                    n_fft=400, win_length=400, hop_length=160, n_mels=128
                ),
            ]
        )
        spectrogram = transform(waveform).unsqueeze(0)  # add a leading dimension
        spectrogram = np.array(spectrogram)
        spectrogram = spectrogram.reshape(-1)

        return spectrogram, sample_rate

    # Accent correction
    def correction_accent(self, audio_file):
        """Return 13 MFCCs of the file, standardized per coefficient."""
        signal, sample_rate = librosa.load(audio_file, sr=None)

        mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)

        # StandardScaler works column-wise, so transpose to scale each
        # coefficient across frames, then transpose back.
        scaler = StandardScaler()
        return scaler.fit_transform(mfccs.T).T

    def process(self):
        """Run every preprocessing step in order and save the features.

        Intermediate results are written as WAV files under ``temp/``; the
        final array, reshaped to ``(1, -1)``, is saved to ``self.output_file``.
        """
        # The intermediate files below live in temp/; create it if needed so
        # the first write does not fail on a fresh checkout.
        os.makedirs("temp", exist_ok=True)

        mfcc_features_list = []
        self.remove_silence(self.audio, "temp/remove_silence.wav")
        self.normalize_audio_volume("temp/remove_silence.wav", "temp/normalize_audio_volume.wav")
        self.time_stretch_audio(
            "temp/normalize_audio_volume.wav", "temp/time_stretch_audio.wav", self.target_length
        )
        self.filtrage_du_bruit("temp/time_stretch_audio.wav", "temp/filtrage_du_bruit.wav")
        self.segmentation_parole("temp/filtrage_du_bruit.wav", "temp/segmentation_parole.wav")
        self.remove_artifacts("temp/segmentation_parole.wav", "temp/remove_artifacts.wav")
        self.preaccentuation("temp/remove_artifacts.wav", "temp/preaccentuation.wav")
        mfcc_features_normalized = self.correction_accent("temp/preaccentuation.wav")
        mfcc_features_list.append(mfcc_features_normalized)

        # Flatten the single MFCC matrix into one feature row.
        X = np.array(mfcc_features_list)
        X = X.reshape(X.shape[0], -1)

        np.save(self.output_file, X)


# Preprocess one sample recording; the resulting features are saved to
# data/works.npy by the constructor.
audio_ = "./data/audio_chiffre/7_theo_43.wav"
test = Custom_preprocessing(audio_path=audio_, output_file="data/works.npy")


In [23]:
import os

# Directory holding the intermediate WAV files.
temp_dir = "temp/"

# Remove every regular file in the temporary directory. Skip the cleanup
# entirely when the directory does not exist, instead of failing in
# os.listdir.
if os.path.isdir(temp_dir):
    for file in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, file)
        # Only plain files are removed; sub-directories are left alone.
        if os.path.isfile(file_path):
            os.remove(file_path)


In [12]:
# Load the features from the path they were saved to by the preprocessing
# step ("data/works.npy"), not from a stale copy in the working directory.
X = np.load("data/works.npy")
# Insert a middle axis of size 1: (rows, features) -> (rows, 1, features).
X = X.reshape(X.shape[0], 1, X.shape[1])
print(X.shape)
X_tensor = torch.Tensor(X)
# Load the trained model.
# NOTE(review): SpeechRecognition, hidden_size, num_classes and input_size are
# not defined in the visible cells (the recorded output of this cell is a
# NameError); they must be defined or imported before this cell runs.
model = SpeechRecognition(hidden_size=hidden_size, num_classes=num_classes, n_feats=input_size, num_layers=1, dropout=0.1)
# map_location="cpu" lets a checkpoint saved on another device load on a
# CPU-only machine; the input tensor above is created on the CPU as well.
model.load_state_dict(torch.load('data/model.pth', map_location="cpu"))
model.eval()

def decode_output(output, vocab):
    """Turn per-step scores into a space-separated string of vocabulary entries.

    Prints ``output.shape``, then for each element obtained by iterating over
    ``output`` looks up ``vocab`` at that element's argmax index.
    """
    print(output.shape)
    symbols = [vocab[torch.argmax(step_scores).item()] for step_scores in output]
    return ' '.join(symbols)

# Use the model to transcribe the audio (gradients disabled for inference).
with torch.no_grad():
    model.eval()
    batch = X_tensor.unsqueeze(0)
    hidden = model._init_hidden(X_tensor.size(0))
    outputs, _ = model(batch, hidden)
    decoded_text = decode_output(outputs, vocab)
    print("Texte transcrit :", decoded_text)

(1, 1, 494)


NameError: name 'SpeechRecognition' is not defined