# KNN Model for predicting raquette type

Import

In [14]:
import os
import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.io import wavfile, loadmat
from scipy.signal import hilbert
from scipy.fft import fft
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# import openpyxl


sr = 20000


Readwav function

In [15]:
def readCsvFolder(folderPath):
    fileFolder = []
    fileNames = []
    for dirpath, dirs, files in os.walk(folderPath):
        for file in files:
            if file.endswith('.csv') and "_C_" in file:
                base_name = os.path.splitext(file)[0]
                file_path = os.path.join(dirpath, file)  # Full path of the file
                data = pd.read_csv(file_path)  # Read the CSV file using pandas
                fileFolder.append(data)
                fileNames.append(file)  # Store the file name
    return fileFolder, fileNames

p1_csv_vect, p1_files = readCsvFolder("../Vibrations/P1_clean")
p2_csv_vect, p2_files = readCsvFolder("../Vibrations/P2_clean")
p3_csv_vect, p3_files = readCsvFolder("../Vibrations/P3_clean")



for i in range(len(p1_csv_vect)):
    plt.plot(p1_csv_vect[i].iloc[:, 0])  # Plot only the first column
plt.show()

for i in range(len(p2_csv_vect)):
    plt.plot(p2_csv_vect[i].iloc[:, 0])  # Plot only the first column
plt.show()

for i in range(len(p3_csv_vect)):
    plt.plot(p3_csv_vect[i].iloc[:, 0])  # Plot only the first column
plt.show()

Spectrum

In [16]:
def spectrumFromCsv(channel, sampling_rate=sr):
    signal = channel.iloc[:, 0].values  # vibration data
    N = len(signal)
    xf = np.fft.fftfreq(N, d=1/sampling_rate)[:N // 2]
    spectrum = fft(signal)
    magnitude = abs(spectrum[:N // 2])
    return xf, magnitude

# Data sources
file_groups = [p1_files, p2_files, p3_files]
csv_groups = [p1_csv_vect, p2_csv_vect, p3_csv_vect]
spectrum_groups = [[], [], []]  # Will hold p1_spectrum_vect, p2_spectrum_vect, p3_spectrum_vect

# Step 1: Compute and store filtered spectra for all groups
for group_idx in range(len(file_groups)):
    for i in range(len(file_groups[group_idx])):
        freqs, spectrum = spectrumFromCsv(csv_groups[group_idx][i])
        band_mask = (freqs >= 150) & (freqs <= 1000)
        filtered_freqs = freqs[band_mask]
        filtered_spectrum = spectrum[band_mask]
        spectrum_groups[group_idx].append((filtered_freqs, filtered_spectrum))

# Optional: Print lengths for verification
print(len(spectrum_groups[0]), len(spectrum_groups[1]), len(spectrum_groups[2]))

# Step 2: Plot one sample per racket type from each group
for group_idx in range(len(file_groups)):
    spectrum_vect = spectrum_groups[group_idx]
    files = file_groups[group_idx]
    shown_raquettes = {"RB": 0, "RO": 0, "RR": 0, "RV": 0}

    for i, (freqs, spectrum) in enumerate(spectrum_vect):
        for raquetteType in shown_raquettes:
            if raquetteType in files[i] and shown_raquettes[raquetteType] == 0:
                plt.figure(figsize=(6, 3))
                plt.plot(freqs, spectrum)
                plt.grid()
                plt.title(f"{files[i]}")
                plt.xlabel("Frequency (Hz)")
                plt.ylabel("Amplitude")
                plt.show()
                shown_raquettes[raquetteType] = 1



0 0 0


# Peaks

In [17]:
def extractNPeak(n_peak, signal_tuple):
    freqs, spectrum = signal_tuple

    peaks, _ = find_peaks(spectrum, distance=5)
    sorted_indices = np.argsort(spectrum[peaks])[::-1][:n_peak]
    peaks = peaks[sorted_indices]

    peak_frequencies = freqs[peaks]
    peak_values = spectrum[peaks]   

    return peak_frequencies, peak_values

peak_groups = [[],[],[]]
for group_idx in range(len(file_groups)):
    files = file_groups[group_idx]
    spectrum_vect = spectrum_groups[group_idx]

    # Limit to just the first sample from each group
    for i, (freqs, spectrum) in enumerate(spectrum_vect):
        peak_freq, peak_val = extractNPeak(8, (freqs, spectrum))
        peak_groups[group_idx].append((peak_freq, peak_val))
    peaks, peak_values = peak_groups[group_idx][0]
    plt.figure(figsize=(6, 3))
    plt.plot(spectrum_vect[0][0], spectrum_vect[0][1], label="Spectrum")
    plt.scatter(peaks, peak_values, color='red', label="Peaks")
    plt.title(files[i])
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Amplitude")
    plt.grid()
    plt.legend()
    plt.show()

IndexError: list index out of range

## Envelope

In [None]:
def envelope(signal_tuple, fs=sr):
    freqs, spectrum = signal_tuple
    analytic_signal = hilbert(spectrum)
    amplitude_envelope = np.abs(analytic_signal)
    return freqs, amplitude_envelope

envelope_groups = [[],[],[]]
for group_idx in range(len(file_groups)):
    files = file_groups[group_idx]
    spectrum_vect = spectrum_groups[group_idx]

    # Limit to just the first sample from each group
    for i, (freqs, spectrum) in enumerate(spectrum_vect):
        freq, val = envelope((freqs, spectrum))
        envelope_groups[group_idx].append((freq, val))
    freqs, vals = envelope_groups[group_idx][0]
    plt.figure(figsize=(6, 3))
    plt.plot(spectrum_vect[0][0], spectrum_vect[0][1], label="Spectrum")
    plt.plot(freqs, vals, color='red', label="Peaks")
    plt.title(files[i])
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Amplitude")
    plt.grid()
    plt.legend()
    plt.show()

IndexError: list index out of range

## energy per band


In [None]:
def energy_per_frequency_band_from_spectrum(spectrum, freqs, band_width):
    # Calculate energy per band
    band_energies = []
    band_frequencies = []
    for start_freq in range(0, int(freqs[-1]), band_width):
        end_freq = start_freq + band_width
        band_indices = np.where((freqs >= start_freq) & (freqs < end_freq))[0]
        band_energy = np.sum(spectrum[band_indices]**2)  # Sum of squared magnitudes
        band_energies.append(band_energy)
        band_frequencies.append((start_freq + end_freq) / 2)  # Center frequency of the band

    return band_energies, band_frequencies

band_width = 50  # Band width in Hz
energy_groups = [[], [], []]
for group_idx in range(len(file_groups)):
    files = file_groups[group_idx]
    spectrum_vect = spectrum_groups[group_idx]

    # Calculate energy per band for each spectrum
    for i, (freqs, spectrum) in enumerate(spectrum_vect):
        band_energies, band_frequencies = energy_per_frequency_band_from_spectrum(spectrum, freqs, band_width)
        energy_groups[group_idx].append((band_frequencies, band_energies))

    # Plot energy per band for the first sample in the group
    band_freqs, band_energies = energy_groups[group_idx][0]
    plt.figure(figsize=(6, 3))
    plt.bar(band_freqs, band_energies, width=band_width * 0.8, align='center', alpha=0.7)
    plt.title(f"Energy per Band - {files[0]}")
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Energy")
    plt.grid()
    plt.show()


IndexError: list index out of range

## mfcc's

In [None]:
import librosa

def mfccs_from_csv(channel, sr=sr, n_mfcc=13):
    signal = channel.iloc[:, 0].values  # vibration data
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    return mfccs

mfccs = mfccs_from_csv(p1_csv_vect[0])
print(mfccs)

IndexError: list index out of range

## Main 

In [18]:
raquetteTypeList = {"RB": 0, "RO": 1, "RR": 2, "RV": 3}
results = []

# Loop over different peak counts (e.g., 5, 10, 15, 20, 25)
for i in range(1, 10):
    bw = i * 10
    print("bw", bw)

    X_enerfyFreq = []
    X_energyValue = []
    Y_Label = []

    spectra = spectrum_groups[0]

    # Loop through each sample
    for j, (freqs, spectrum) in enumerate(spectra):
        for racketType in raquetteTypeList:
            if racketType in files[j]:
                # Extract N peaks
                energy_freq, energy_value = energy_per_frequency_band_from_spectrum(freqs, spectrum, bw)

                X_enerfyFreq.append(energy_freq)
                X_energyValue.append(energy_value)
                Y_Label.append(racketType)

    # Pad vectors to the same length
    max_length = max(max(len(p) for p in X_enerfyFreq), max(len(a) for a in X_energyValue))
    X_peaksFreq_padded = [np.pad(p, (0, max_length - len(p)), constant_values=0) for p in X_enerfyFreq]
    X_peaksAmplitude_padded = [np.pad(a, (0, max_length - len(a)), constant_values=0) for a in X_energyValue]

    # Combine features
    X = np.hstack((np.array(X_peaksFreq_padded), np.array(X_peaksAmplitude_padded)))

    # Encode labels
    label_encoder = LabelEncoder()
    Y = label_encoder.fit_transform(Y_Label)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=None, shuffle=True)

    # KNN hyperparameter grid search
    for n_neighbors in range(1, 21):
        for weights in ['uniform', 'distance']:
            for metric in ['euclidean', 'manhattan', 'chebyshev', 'minkowski']:
                knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)
                knn.fit(X_train, y_train)

                y_pred = knn.predict(X_test)
                accuracy_test = accuracy_score(y_test, y_pred)

                y_train_pred = knn.predict(X_train)
                accuracy_train = accuracy_score(y_train, y_train_pred)

                results.append({
                    'bandwidth': bw,
                    'n_neighbors': n_neighbors,
                    'weights': weights,
                    'metric': metric,
                    'accuracy_train': accuracy_train,
                    'accuracy_test': accuracy_test
                })

# Save results
results_df = pd.DataFrame(results)
if os.path.exists("results.xlsx"):
    os.remove("results.xlsx")
results_df.to_excel("results.xlsx", index=False)
print("Results have been saved to 'results.xlsx'.")


bw 10


ValueError: max() arg is an empty sequence

Visualisation des données fournis au model

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Define frequency bands
frequency_bands = [(0, 200), (200, 400), (400, 600), (600, 800), (800, 1000)]  # Example bands in Hz
colors = ['red', 'blue', 'green', 'orange']  # RB, RO, RR, RV
raquetteTypeList = {"RB": 0, "RO": 1, "RR": 2, "RV": 3}
label_names = list(raquetteTypeList.keys())

# Extract best configuration from results
best_params = results_df.loc[results_df['accuracy_test'].idxmax()]
n_neighbors_best = int(best_params['n_neighbors'])
weights_best = best_params['weights']
metric_best = best_params['metric']

print("\nBest KNN Parameters:")
print(f"bandwidth: {best_params['bandwidth']}")
print(f"n_neighbors: {n_neighbors_best}")
print(f"weights: {weights_best}")
print(f"metric: {metric_best}")
print(f"Train Accuracy: {best_params['accuracy_train']:.4f}")
print(f"Test Accuracy: {best_params['accuracy_test']:.4f}")

# Rebuild data with energy per band
X_energyBands = []
Y_Label = []

for group_idx in range(len(spectrum_groups)):
    files = file_groups[group_idx]
    spectra = spectrum_groups[group_idx]
    
    for j, (freqs, spectrum) in enumerate(spectra):
        for racketType in raquetteTypeList:
            if racketType in files[j]:
                energy_bands = []
                for band in frequency_bands:
                    band_energy = np.sum(spectrum[(freqs >= band[0]) & (freqs < band[1])])
                    energy_bands.append(band_energy)
                X_energyBands.append(energy_bands)
                Y_Label.append(racketType)

X_energyBands = np.array(X_energyBands)
X_energyBands = X_energyBands / np.max(X_energyBands, axis=0)  # Normalize per band
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y_Label)

X_train, X_test, y_train, y_test = train_test_split(X_energyBands, Y, test_size=0.2, random_state=None, shuffle=True)

# Train best KNN model
knn = KNeighborsClassifier(n_neighbors=n_neighbors_best, weights=weights_best, metric=metric_best)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# ---------- Confusion Matrix ----------
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# ---------- Classification Report ----------
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_names))


NameError: name 'results_df' is not defined