In [6]:
import pandas as pd
import librosa
import soundfile as sf
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [7]:
# Read the label sheet: one row per recording with its two scores
labels = pd.read_excel('label2.xlsx')

# Preview the top of the table as a quick sanity check
print(labels.head(n=5))

                                            audio  pronunciation  fluency
0  audio_beb45dff-f264-43c5-b051-61dab6a3b2a1.m4a             11       25
1  audio_f86da9b5-9ed2-4153-9508-e59448e7d8bc.m4a             53       64
2  audio_0e7cded9-1569-4202-98f8-59f88384437b.m4a             40       51
3  audio_08bf8a53-1e31-4d82-8979-b53a14baa196.m4a             14       36
4  audio_a3b474a9-3b8e-4533-aa95-0cda51f1395d.m4a             11       25


In [8]:
def extract_features(audio_path):
    """Summarise one audio file as a single 1-D feature vector.

    The file is loaded with ``sr=None`` and three frame-level feature
    matrices are computed from it: MFCCs (13 coefficients), a chromagram and
    spectral contrast. Each matrix is averaged along axis 1 and the three
    averaged vectors are joined end to end, in that order.

    Parameters
    ----------
    audio_path : path of the audio file handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        Concatenation of the per-row means of the MFCC, chroma and spectral
        contrast matrices.
    """
    signal, sample_rate = librosa.load(audio_path, sr=None)

    # Order matters: downstream models rely on a fixed column layout.
    feature_matrices = [
        librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.spectral_contrast(y=signal, sr=sample_rate),
    ]

    # Collapse axis 1 of every matrix, then join the resulting vectors.
    averaged = tuple(np.mean(matrix, axis=1) for matrix in feature_matrices)
    return np.hstack(averaged)

# Folder holding the recordings named in the label sheet
audio_dir = 'audio/'

# Accumulators: one feature vector and two target scores per usable recording
X = []
y_pronunciation = []
y_fluency = []

# Walk the label sheet column-wise. A recording contributes a sample only if
# its file is present on disk; otherwise it is reported and skipped.
for clip_name, pron_score, flu_score in zip(
        labels['audio'], labels['pronunciation'], labels['fluency']):
    audio_path = os.path.join(audio_dir, clip_name)
    if not os.path.exists(audio_path):
        print(f'File {audio_path} does not exist.')
        continue
    features = extract_features(audio_path)
    X.append(features)
    y_pronunciation.append(pron_score)
    y_fluency.append(flu_score)

# Freeze the accumulators into NumPy arrays
X, y_pronunciation, y_fluency = (
    np.array(X),
    np.array(y_pronunciation),
    np.array(y_fluency),
)

print(f'Feature matrix shape: {X.shape}')
print(f'Pronunciation scores shape: {y_pronunciation.shape}')
print(f'Fluency scores shape: {y_fluency.shape}')


  y, sr = librosa.load(audio_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


File audio/audio_fc49c5b8-0d30-4f5a-9cae-4ed06e68149d.m4a does not exist.
File audio/audio_f7c4471e-2910-4dc2-ba5a-7b425f220f95.m4a does not exist.
File audio/audio_e77aaa31-88e0-4678-9d95-5f560ca3b672.m4a does not exist.
File audio/audio_e5021a34-141c-407d-891a-b137bab4c6a5.m4a does not exist.
File audio/audio_b7fbbe97-2a32-4cde-9cb7-72e53478f068.m4a does not exist.
File audio/audio_af70b6ad-b311-4797-bb79-635785f13cf0.m4a does not exist.
File audio/audio_af5ea7f3-ca32-4cbe-ba69-67e39c75b418.m4a does not exist.
File audio/audio_81667da5-66c4-4267-a8ed-1a117c4700ad.m4a does not exist.
File audio/audio_7987ee3e-4a51-433d-b460-a6a1d994239f.m4a does not exist.
File audio/audio_62ceeb02-8b27-47f2-8ae3-ed3f0a2d02b1.m4a does not exist.
File audio/audio_55561999-7a53-49ac-abee-17002be412b9.m4a does not exist.
File audio/audio_3cab51c4-b75e-4878-aa91-58692b8f8a84.m4a does not exist.
File audio/audio_37b00798-0317-4557-a1b5-f11253d6e09f.m4a does not exist.
File audio/audio_3770eed3-b6de-42d0-8c

In [9]:
# Split the data ONCE, passing the features and both targets together, so each
# recording's feature row and its two scores always land on the same side of
# the split. (Two separate calls would only stay aligned as long as they share
# the same random_state and sample count, and the second call would overwrite
# X_train / X_test from the first.)
(X_train, X_test,
 y_pron_train, y_pron_test,
 y_flu_train, y_flu_test) = train_test_split(
    X, y_pronunciation, y_fluency, test_size=0.2, random_state=42)

# NOTE(review): the features are passed to the RBF-kernel SVR unscaled.
# SVMs are sensitive to feature scale, so standardising the inputs may lower
# the errors reported below -- confirm before relying on these models.

# Initialize and train the model for pronunciation score
model_pron = SVR(kernel='rbf')
model_pron.fit(X_train, y_pron_train)

# Initialize and train the model for fluency score
model_flu = SVR(kernel='rbf')
model_flu.fit(X_train, y_flu_train)

# Predict both scores for the held-out recordings
y_pron_pred = model_pron.predict(X_test)
y_flu_pred = model_flu.predict(X_test)

# Evaluate each model with mean squared error on the held-out set
mse_pron = mean_squared_error(y_pron_test, y_pron_pred)
mse_flu = mean_squared_error(y_flu_test, y_flu_pred)

print(f'Pronunciation MSE: {mse_pron}')
print(f'Fluency MSE: {mse_flu}')

Pronunciation MSE: 494.1627332296969
Fluency MSE: 335.44441098835097


In [10]:
# Score one unseen recording with both trained regressors, reusing the same
# feature extraction that produced the training data
new_audio_features = extract_features('don1.mp3')

# reshape(1, -1) turns the feature vector into a single-row matrix; it is the
# same input for both models, so build it once
single_sample = new_audio_features.reshape(1, -1)

# Pronunciation and fluency predictions (each is a one-element array)
pron_score_pred = model_pron.predict(single_sample)
fluency_score_pred = model_flu.predict(single_sample)

print(f'Predicted Pronunciation Score: {pron_score_pred[0]}')
print(f'Predicted Fluency Score: {fluency_score_pred[0]}')

Predicted Pronunciation Score: 52.64565503166643
Predicted Fluency Score: 62.733352677777205
