# RTF Model for predicting racket type using P1, P2 and P3, based on MFCC features - Sound

### Model Description

This notebook implements a Random Tree Forest (RTF) model to predict the type of a racket (RB, RO, RR, RV) based on sound features extracted from audio files. The workflow involves reading `.wav` files, extracting frequency peaks using FFT, computing MFCCs, and training the model using these features. The model's performance is evaluated using accuracy metrics, and the results are saved in an Excel file for further analysis.

### Import libraries

In [21]:
import os
import sys
import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.io import wavfile
from scipy.fft import fft
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from scipy.signal import hilbert
from scipy.fftpack import dct

### Import Tools Functions

In [22]:
# Reach the project root
notebook_path = os.path.abspath('')
project_root = os.path.abspath(os.path.join(notebook_path, '../../../'))
functions_path = os.path.join(project_root, 'Functions')

# Add Functions folder
if functions_path not in sys.path:
    sys.path.append(functions_path)

In [23]:
# Import functions

from readWavFolder import readWavFolder
from spectrumFromSignal import spectrumFromSignal
import librosa # MFCC

## Main

In [24]:
# Liste des types de raquettes
raquetteTypeList = {"RB": 0, "RO": 1, "RR": 2, "RV": 3}

# Liste pour stocker les résultats
results = []

for n_mfcc in [13, 20, 30]:
    
    n_fft_range = [256, 512]
    hop_length_range = [128, 256]
    n_mels_range = range(20, 21)

    X = []
    Y_Label = []

    # Create a DataFrame to store details of each wav file
    wav_files_data = []

    # Process each folder (P1, P2, P3)
    for folder, folder_path in [("P1", "../../../Data/Sound/P1"), 
                                ("P2", "../../../Data/Sound/P2"), 
                                ("P3", "../../../Data/Sound/P3")]:
        sample_rates, wav_files, file_names = readWavFolder(folder_path)
        
        for sample_rate, wav_file, file_name in zip(sample_rates, wav_files, file_names):
            wav_files_data.append({
                "Folder": folder,
                "File_Name": file_name,
                "Sample_Rate": sample_rate,
                "Signal": wav_file
            })

    # Convert the list of dictionaries into a DataFrame
    wav_files_df = pd.DataFrame(wav_files_data)

    # Display the DataFrame
    #print(wav_files_df)

    spectrumVect = []

    # For each wav file, extract its spectrum, filter it between 150 and 1000 Hz, and take the top n peaks
    for i in range(len(wav_files_df["Signal"])):
        if "C" in wav_files_df["File_Name"][i]:
            if 'RB' in wav_files_df["File_Name"][i]:
                raquetteType = 'RB'
            elif 'RR' in wav_files_df["File_Name"][i]:
                raquetteType = 'RR'
            elif 'RO' in wav_files_df["File_Name"][i]:
                raquetteType = 'RO'
            elif 'RV' in wav_files_df["File_Name"][i]:
                raquetteType = 'RV'

            # Extract the spectrum from the wav file
            spectrum, freqs = spectrumFromSignal(wav_files_df["Signal"][i], wav_files_df["Sample_Rate"][i])
            spectrumVect.append(freqs)

            # Convert the signal to floating-point format
            signal_float = wav_files_df["Signal"][i][:, 0].astype(float) / np.max(np.abs(wav_files_df["Signal"][i][:, 0]))
            mfccs = librosa.feature.mfcc(y=signal_float, sr=wav_files_df["Sample_Rate"][i], n_mfcc=13)

            features = np.std(mfccs, axis=1) # Temporal mean of the MFCCs

            features /= np.max(np.abs(features)) # Normalize the features

            X.append(features)
            Y_Label.append(raquetteType)

In [25]:
# Encode string labels into integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(Y_Label)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True)

In [26]:
# Random Forest algorithm parameters
n_estimators_range = range(10, 101, 10)  # Number of trees between 10 and 100
max_depth_range = [None, 10, 20, 30, 40]  # Different depths
min_samples_split_range = [2, 5, 10]  # Minimum number to divide a node
min_samples_leaf_range = [1, 2, 4]  # Minimum number of samples in a sheet
max_features_range = ['sqrt', 'log2', None]  # Number of features per tree

# Test all the combinations of parameters
for n_estimators in n_estimators_range:
    for max_depth in max_depth_range:
        for min_samples_split in min_samples_split_range:
            # Create and train the Random Forest model
            rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                        min_samples_split=min_samples_split, random_state=42)
            rf.fit(X_train, y_train)

            # Evaluate on the test set
            y_pred = rf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)

            # Evaluate on the training set
            y_train_pred = rf.predict(X_train)
            accuracy_train = accuracy_score(y_train, y_train_pred)

            # Add the results to the list
            results.append({
                'n_mfcc': n_mfcc,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'accuracy_train': accuracy_train,
                'accuracy_test': accuracy_test
            })

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by accuracy_test in descending order
sorted_results_df = results_df.sort_values(by='accuracy_test', ascending=False)

print(sorted_results_df)

# Register the best parameters in a CSV file
sorted_results_df.to_csv("S_RTF_Racket_P1.P2.P3_MFCC.csv", index=False)

print("Results have been saved to 'S_RTF_Racket_P1.P2.P3_MFCC.csv'.")

     n_mfcc  n_estimators  max_depth  min_samples_split  accuracy_train  \
109      30            80       10.0                  5        0.993724   
101      30            70       30.0                 10        0.966527   
98       30            70       20.0                 10        0.966527   
92       30            70        NaN                 10        0.966527   
104      30            70       40.0                 10        0.966527   
..      ...           ...        ...                ...             ...   
16       30            20        NaN                  5        0.993724   
13       30            10       40.0                  5        0.962343   
1        30            10        NaN                  5        0.962343   
10       30            10       30.0                  5        0.962343   
7        30            10       20.0                  5        0.962343   

     accuracy_test  
109       0.658333  
101       0.650000  
98        0.650000  
92        0.650