# In [2]:
import os
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import librosa
import tkinter as tk
from tkinter import filedialog


# Function to compute the MFCC features of one audio file
def extract_mfcc(audio_path):
    """Load the audio file at *audio_path* and return its MFCC features.

    The file is read with ``librosa.load(..., sr=None)`` and the resulting
    samples and sampling rate are handed unchanged to
    ``python_speech_features.mfcc`` (default parameters).
    """
    samples, sample_rate = librosa.load(audio_path, sr=None)
    features = mfcc(samples, sample_rate)
    return features


# Function to pick the reference label closest to the test features (DTW)
def find_most_similar(test_audio, references):
    """Return the label whose reference has the smallest DTW distance.

    Args:
        test_audio: Feature sequence to classify.
        references: Mapping ``label -> iterable of reference feature
            sequences``.

    Returns:
        The label of the closest reference, or ``None`` when no reference
        yields a distance strictly below infinity (for instance when
        *references* is empty). On ties the first reference encountered wins.
    """
    best_label = None
    best_cost = float('inf')

    # Flatten the mapping into (label, features) pairs, keeping dict order.
    candidates = (
        (label, features)
        for label, group in references.items()
        for features in group
    )

    for label, features in candidates:
        cost, _path = fastdtw(test_audio, features, dist=euclidean)
        if cost < best_cost:
            best_label, best_cost = label, cost

    return best_label


# Function to load the reference features of every label
def get_references(folder, labels):
    """Build the reference feature sets for the given labels.

    For each label, every file ending in ``.wav`` inside ``folder/label`` is
    passed to ``extract_mfcc``.

    Args:
        folder: Directory containing one sub-directory per label.
        labels: Iterable of label names (sub-directory names).

    Returns:
        A dict mapping each label to the list of features extracted from its
        ``.wav`` files (an empty list when the sub-directory has none).
    """
    references = {}

    for label in labels:
        label_dir = os.path.join(folder, label)
        wav_names = filter(lambda name: name.endswith('.wav'), os.listdir(label_dir))
        references[label] = [
            extract_mfcc(os.path.join(label_dir, name)) for name in wav_names
        ]

    return references


# Function to classify the word spoken in one audio file
def predict_spoken_word(test_audio, references):
    """Return the reference label closest to the audio file *test_audio*.

    *test_audio* is a path: its features are obtained with ``extract_mfcc``
    and compared against *references* by ``find_most_similar``, whose result
    (a label or ``None``) is returned unchanged.
    """
    return find_most_similar(extract_mfcc(test_audio), references)


# Function to predict the digits and operations from an audio file
def predict(audio_path, digits_folder, operations_folder):
    """Recognise a spoken "<digit> <operation> <digit>" phrase and evaluate it.

    The recording is split on silence, segments shorter than 0.2 s are
    discarded, and the first three remaining segments are classified as
    digit, operation ("plus" / "moins") and digit respectively.

    Args:
        audio_path: Path of the recording to analyse.
        digits_folder: Folder with one sub-folder of reference ``.wav`` files
            per digit ("0" .. "9").
        operations_folder: Folder with one sub-folder of reference ``.wav``
            files per operation ("plus", "moins").

    Returns:
        The result of the operation as a string, or an error message when
        three labels could not be detected.
    """
    digits = [str(i) for i in range(10)]
    operations = ["plus", "moins"]
    failure_message = "Unable to detect digits and operations correctly."

    digit_references = get_references(digits_folder, digits)
    operation_references = get_references(operations_folder, operations)

    audio, sr = librosa.load(audio_path, sr=None)
    speech_segments = librosa.effects.split(audio, top_db=30)

    MIN_SEGMENT_DURATION = 0.2

    # Discard too-short segments BEFORE assigning roles, so that a click or
    # breath between words does not shift which segment is treated as the
    # operation.
    spoken_segments = [
        audio[start:end]
        for start, end in speech_segments
        if (end - start) / sr >= MIN_SEGMENT_DURATION
    ]

    labels = []
    for position, segment_audio in enumerate(spoken_segments[:3]):
        # Second spoken segment is the operation; first and third are digits.
        is_operation = position == 1
        references = operation_references if is_operation else digit_references

        # Features are computed directly from the in-memory samples; no
        # temporary segment_<i>.wav file is written (and left behind), and the
        # int16 conversion that could overflow on full-scale samples is gone.
        label = find_most_similar(mfcc(segment_audio, sr), references)
        if label is None:
            # No usable reference: leave the label out so the length check
            # below reports the failure instead of int(None) raising.
            continue
        labels.append(label if is_operation else int(label))

    # Print the detected digits and operations
    print("Detected digits and operations:", labels)

    if len(labels) != 3:
        return failure_message

    left, operation, right = labels
    if operation == 'plus':
        result = left + right
    elif operation == 'moins':
        result = left - right
    else:
        return failure_message
    return str(result)


class Application(tk.Frame):
    """Main frame of the voice calculator GUI.

    Shows a title, a button that opens a file chooser, and a label in which
    the result returned by ``predict`` for the chosen file is displayed.
    """

    # Paths to your audio files (reference recordings used by ``predict``)
    DIGITS_FOLDER = r'C:\Users\aorus\Desktop\s8\TP\TP_Calcule\digits'
    OPERATIONS_FOLDER = r'C:\Users\aorus\Desktop\s8\TP\TP_Calcule\operations'

    def __init__(self, master=None):
        """Configure the parent window and build the widgets."""
        super().__init__(master)
        self.master = master
        self.master.title("Calculatrice")
        self.master.geometry("500x400")
        self.master.configure(bg="#c0ded9")
        self.pack(expand=True, fill=tk.BOTH)
        self.create_widgets()

    def create_widgets(self):
        """Create and lay out the title, the select button and the result label."""
        self.title_label = tk.Label(
            self,
            text="Voice Calculator",
            font=("Helvetica", 24, "bold"),
            bg="#c0ded9",
            fg="#000000"
        )
        self.title_label.pack(side="top", pady=20)

        self.select_button = tk.Button(
            self,
            text="Select Audio File",
            command=self.select_file,
            bg="#76b041",
            fg="#FFFFFF",
            font=("Helvetica", 18, "bold"),
            relief=tk.FLAT,
            padx=40,
            pady=20
        )
        self.select_button.pack(expand=True)

        self.result_label = tk.Label(
            self,
            text="",
            font=("Helvetica", 24),
            bg="#c0ded9",
            fg="#000000"
        )
        self.result_label.pack(side="bottom", pady=20)

    def select_file(self):
        """Ask for an audio file, run the prediction and display the result."""
        file_path = filedialog.askopenfilename(filetypes=(("WAV files", "*.wav"), ("All files", "*.*")))

        # The dialog returns an empty value when it is cancelled; there is
        # nothing to analyse then, so keep the current result on screen
        # instead of passing an empty path to predict().
        if not file_path:
            return

        result = predict(file_path, self.DIGITS_FOLDER, self.OPERATIONS_FOLDER)
        self.result_label["text"] = f"Result: {result}"
        
# Create the top-level window, attach the calculator frame to it and hand
# control to the Tk event loop (blocks until the window is closed).
root = tk.Tk()
app = Application(root)
app.mainloop()