# RTF Model for predicting racket type using P1, P2 and P3, based on attack time - Sound

### Model Description

This notebook implements a Random Forest classifier (abbreviated RTF, for "Random Tree Forest", throughout this project) to predict the type of a racket (RB, RO, RR, RV) based on sound features extracted from audio files. The workflow involves reading `.wav` files from the P1, P2 and P3 folders, extracting frequency peaks using an FFT, and incorporating the attack time as an additional feature. The attack time is calculated as the duration it takes for the signal to reach a specified amplitude threshold. These features are used to train the model for several numbers of peaks and several Random Forest hyper-parameters, and its performance is evaluated using accuracy metrics and visualized through scatter plots and confusion matrices.

### Import libraries

In [43]:
import os
import sys
import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.io import wavfile
from scipy.fft import fft
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

### Import Tools Functions

In [44]:
# Locate the project root: three directory levels above the current working directory
notebook_path = os.path.abspath('')
project_root = os.path.abspath(
    os.path.join(notebook_path, os.pardir, os.pardir, os.pardir)
)
functions_path = os.path.join(project_root, 'Functions')

# Make the helper modules stored in Functions/ importable (registered only once)
if sys.path.count(functions_path) == 0:
    sys.path.append(functions_path)

In [45]:
# Import functions

from attack_time_from_signal import attack_time_from_signal
from extractPeakFromSignal import extractPeakFromSignal
from readWavFolder import readWavFolder
from spectrumFromSignal import spectrumFromSignal
from spectrumFromWav import spectrumFromWav

# Main

In [46]:
# List of racket types
raquetteTypeList = {"RB":0,"RO":1,"RR":2,"RV":3}

# List to store results
results = []

for n_peak in range(5, 101, 5):
    X_peaksHz = []
    X_peaksAmplitude = []
    X_attack_time = []
    Y_Label = []
    attack_times = []
    #print("Nbr_peak", n_peak)

    # Create a DataFrame to store details of each wav file
    wav_files_data = []

    # Process each folder (P1, P2, P3)
    for folder, folder_path in [("P1", "../../../Data/Sound/P1"), 
                                ("P2", "../../../Data/Sound/P2"), 
                                ("P3", "../../../Data/Sound/P3")]:
        sample_rates, wav_files, file_names = readWavFolder(folder_path)
        
        for sample_rate, wav_file, file_name in zip(sample_rates, wav_files, file_names):
            wav_files_data.append({
                "Folder": folder,
                "File_Name": file_name,
                "Sample_Rate": sample_rate,
                "Signal": wav_file
            })

    # Convert the list of dictionaries into a DataFrame
    wav_files_df = pd.DataFrame(wav_files_data)

    # Display the DataFrame
    #print(wav_files_df)

    spectrumVect = []

    # For each wav file, extract its spectrum, filter it between 150 and 1000 Hz, and take the top n peaks
    for i in range(len(wav_files_df["Signal"])):
        if "C" in wav_files_df["File_Name"][i]:
            if 'RB' in wav_files_df["File_Name"][i]:
                raquetteType = 'RB'
            elif 'RR' in wav_files_df["File_Name"][i]:
                raquetteType = 'RR'
            elif 'RO' in wav_files_df["File_Name"][i]:
                raquetteType = 'RO'
            elif 'RV' in wav_files_df["File_Name"][i]:
                raquetteType = 'RV'

            # Extract the spectrum from the wav file
            spectrum, freqs = spectrumFromSignal(wav_files_df["Signal"][i], wav_files_df["Sample_Rate"][i])
            spectrumVect.append(freqs)

            peaks, peak_values = extractPeakFromSignal(signal=spectrum, num_peaks=n_peak)

            X_peaksHz.append(peaks)
            X_peaksAmplitude.append(peak_values)
            Y_Label.append(raquetteType)

            # Calcul of attack time
            row = wav_files_df.iloc[i]
            attack_time = attack_time_from_signal(row["Signal"], row["Sample_Rate"])
            attack_times.append(attack_time)

    # Normalize the attack times
    X_attack_time = np.array(attack_times).reshape(-1, 1)
    X_attack_time = (X_attack_time - X_attack_time.min()) / (X_attack_time.max() - X_attack_time.min())

    # Ensure all arrays in X_peaksHz have the same length
    max_length = max(max(len(peaks) for peaks in X_peaksHz), max(len(amps) for amps in X_peaksAmplitude))
    X_peaksHz_padded = np.array([np.pad(peaks, (0, max_length - len(peaks)), constant_values=0) for peaks in X_peaksHz])
    X_peaksAmplitude_padded = np.array([np.pad(amps, (0, max_length - len(amps)), constant_values=0) for amps in X_peaksAmplitude])

    # Combine the frequencies and amplitudes into a single feature matrix
    X = np.hstack((X_peaksHz_padded, X_attack_time))

    # Encode string labels into integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(Y_Label)

    # Diviser les données en ensembles d'entraînement et de test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [47]:
# Random Forest algorithm parameters
n_estimators_range = range(10, 101, 10)  # Number of trees between 10 and 100
max_depth_range = [None, 10, 20, 30, 40]  # Different depths
min_samples_split_range = [2, 5, 10]  # Minimum number to divide a node
min_samples_leaf_range = [1, 2, 4]  # Minimum number of samples in a sheet
max_features_range = ['sqrt', 'log2', None]  # Number of features per tree

# Test all the combinations of parameters
for n_estimators in n_estimators_range:
    for max_depth in max_depth_range:
        for min_samples_split in min_samples_split_range:
            # Create and train the Random Forest model
            rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                        min_samples_split=min_samples_split, random_state=42)
            rf.fit(X_train, y_train)

            # Evaluate on the test set
            y_pred = rf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)

            # Evaluate on the training set
            y_train_pred = rf.predict(X_train)
            accuracy_train = accuracy_score(y_train, y_train_pred)

            # Add the results to the list
            results.append({
                'n_peak': n_peak,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'accuracy_train': accuracy_train,
                'accuracy_test': accuracy_test
            })

# Destination of the ranked grid-search table
results_csv_path = "S_RTF_Racket_P1.P2.P3_AttackT.csv"

# One row per evaluated configuration
results_df = pd.DataFrame(results)

# Rank the configurations, best test accuracy first
sorted_results_df = results_df.sort_values('accuracy_test', ascending=False)

print(sorted_results_df)

# Write the ranked configurations to the CSV file (without the index column)
sorted_results_df.to_csv(results_csv_path, index=False)

print("Results have been saved to 'S_RTF_Racket_P1.P2.P3_AttackT.csv'.")

     n_peak  n_estimators  max_depth  min_samples_split  accuracy_train  \
75      100            60        NaN                  2        1.000000   
13      100            10       40.0                  5        0.968619   
1       100            10        NaN                  5        0.968619   
69      100            50       30.0                  2        1.000000   
66      100            50       20.0                  2        1.000000   
..      ...           ...        ...                ...             ...   
122     100            90        NaN                 10        0.956067   
123     100            90       10.0                  2        0.997908   
33      100            30       10.0                  2        0.995816   
48      100            40       10.0                  2        0.997908   
3       100            10       10.0                  2        0.976987   

     accuracy_test  
75        0.675000  
13        0.675000  
1         0.675000  
69        0.675