In [48]:
import os

import duckdb
import librosa
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

Define Dataset Paths

In [95]:
# Root of the Kaggle competition dataset; every input path below hangs off it.
basePath = '/kaggle/input/shl-intern-hiring-assessment/dataset'

# Directories holding the audio clips for the train and test splits.
trainAudios = basePath + "/audios_train"
testAudios = basePath + "/audios_test"

# CSV files listing the clip filenames (train.csv also carries the labels).
trainCSV = basePath + "/train.csv"
testCSV = basePath + "/test.csv"

# Where the test-set predictions are written (relative to the working directory).
resultCSV = "result.csv"


Function to extract the following features:

- MFCC (Mel-Frequency Cepstral Coefficients), averaged over time; only the first 5 coefficients are kept

- Spectral Centroid

- Zero Crossing Rate

In [18]:
def extractFeatures(path, sr=22050, nMfcc=5):
    """Summarise one audio file as a small fixed-length feature vector.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate requested from ``librosa.load``. The rate that
            ``librosa.load`` returns is the one used for the features.
        nMfcc: Number of MFCC coefficients to keep. The default of 5 matches
            the previously hard-coded ``mfcc[:5]``.

    Returns:
        numpy.ndarray: ``nMfcc`` time-averaged MFCCs, followed by the mean
        spectral centroid and the mean zero-crossing rate.
    """
    signal, sampleRate = librosa.load(path, sr=sr)

    # Ask librosa for exactly the coefficients we keep instead of computing
    # its default set and discarding the rest.
    mfcc = librosa.feature.mfcc(y=signal, sr=sampleRate, n_mfcc=nMfcc)
    mfccMeans = np.mean(mfcc, axis=1)  # average each coefficient over frames

    centroid = np.mean(librosa.feature.spectral_centroid(y=signal, sr=sampleRate))
    zcr = np.mean(librosa.feature.zero_crossing_rate(signal))
    return np.concatenate([mfccMeans, [centroid], [zcr]])


Test on sample audio.

In [19]:
# os.listdir() returns entries in arbitrary order, so sort first: the smoke
# test then runs on the same file every time the notebook is re-executed.
sampleFile = sorted(os.listdir(trainAudios))[0]
extractFeatures(os.path.join(trainAudios, sampleFile))


array([-3.69039062e+02,  9.56511765e+01,  8.35271835e+00,  1.35610476e+01,
        1.47757602e+00,  1.70971742e+03,  1.00861775e-01])

This function loops through audio files, extracts features, and returns a table of features along with column names.

In [20]:
def buildFeatureTable(audioDir, filenames, featureFunc=None):
    """Extract features for a list of audio files and return them as rows.

    Files whose extraction raises are reported on stdout and skipped, so the
    returned table may contain fewer rows than ``filenames``.

    Args:
        audioDir: Directory that contains the audio files.
        filenames: Iterable of file names relative to ``audioDir``.
        featureFunc: Callable taking a file path and returning an array-like
            with a ``tolist()`` method. Defaults to ``extractFeatures``.

    Returns:
        tuple: ``(table, columns)``. ``table`` is a list of rows shaped
        ``[filename, feature_0, feature_1, ...]``. ``columns`` is
        ``['filename', 'feature_0', ...]``. If no file could be processed,
        ``table`` is empty and ``columns`` is ``['filename']``.
    """
    if featureFunc is None:
        featureFunc = extractFeatures

    table = []
    for file in filenames:
        try:
            features = featureFunc(os.path.join(audioDir, file))
            table.append([file] + features.tolist())
        except Exception as e:
            # Best effort: one unreadable clip should not abort the whole run.
            print(f"Error with {file}: {e}")

    # Guard against an empty table; indexing table[0] would raise IndexError.
    featureCount = len(table[0]) - 1 if table else 0
    columns = ['filename'] + [f'feature_{i}' for i in range(featureCount)]
    return table, columns


In [23]:
# Dry run on the first two rows of the training CSV to check the table layout
# before processing every file.
sampleQuery = f"SELECT * FROM read_csv_auto('{trainCSV}') LIMIT 2"
trainSample = duckdb.sql(sampleQuery).df()
sampleNames = trainSample['filename'].tolist()
features, cols = buildFeatureTable(trainAudios, sampleNames)
features


[['audio_1261.wav',
  -491.02215576171875,
  50.90874481201172,
  5.2743120193481445,
  13.072772979736328,
  0.2889324426651001,
  1766.877440527962,
  0.10828712144996136],
 ['audio_942.wav',
  -429.4405517578125,
  32.125301361083984,
  -4.888644695281982,
  10.810473442077637,
  -12.059521675109863,
  2083.5324449857635,
  0.14573478403931608]]

Load the training CSV file, which contains the filenames and their labels.

In [36]:
# Read the whole training CSV into a DataFrame through DuckDB.
trainData = duckdb.sql(f"SELECT * FROM read_csv_auto('{trainCSV}')").df()


In [37]:
# Preview the first rows to confirm the filename and label columns loaded.
trainData.head()


Unnamed: 0,filename,label
0,audio_1261.wav,1.0
1,audio_942.wav,1.5
2,audio_1110.wav,1.5
3,audio_1024.wav,1.5
4,audio_538.wav,2.0


Extract Features for All Files and Merge with Labels

In [38]:
# Extract features for every file listed in the training CSV.
features, cols = buildFeatureTable(trainAudios, trainData['filename'].tolist())

# Build the frame straight from the Python rows. The previous version spliced
# repr() of every value into a SQL VALUES list, which breaks on NaN/inf
# features, on filenames containing quotes, and on an empty table.
featureTable = duckdb.from_df(pd.DataFrame(features, columns=cols))


In [39]:
# Attach each clip's label to its feature row. The query refers to the
# notebook variables `featureTable` and `trainData` by name. JOIN is an inner
# join on filename, so a clip without a row in featureTable (for example
# because feature extraction failed for it) does not appear in the result.
trainTable = duckdb.sql("""
    SELECT f.*, l.label
    FROM featureTable f
    JOIN trainData l
    ON f.filename = l.filename
""").df()


In [40]:
# Preview the joined table: filename, the feature columns, then the label.
trainTable.head()


Unnamed: 0,filename,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,label
0,audio_1261.wav,-491.022156,50.908745,5.274312,13.072773,0.288932,1766.877441,0.108287,1.0
1,audio_1110.wav,-484.911224,61.282734,-4.729174,8.597246,7.826371,2081.078022,0.21644,1.5
2,audio_1024.wav,-417.125244,92.675316,-7.478524,17.03348,11.358152,1276.24513,0.076038,1.5
3,audio_538.wav,-321.468414,114.073738,-14.03763,3.440585,-1.315021,1573.781942,0.101856,2.0
4,audio_350.wav,-365.578339,114.854919,15.065681,29.256744,7.596002,1251.110861,0.088828,2.5


Feature Matrix and Label Vector

In [41]:
# Every column except the identifier and the target is a model input.
featureColumns = [col for col in trainTable.columns if col not in ('filename', 'label')]
X = trainTable[featureColumns].values
y = trainTable['label'].values
print("Features shape:", X.shape)
print("Labels shape:", y.shape)


Features shape: (444, 7)
Labels shape: (444,)


    Trains a regression model, evaluates it using MSE, and plots predicted vs. actual values.

    Args:
        name (str): Name of the model (for labeling).
        model: A Scikit-learn regressor instance.
        XTrain: Training features (already scaled).
        yTrain: Training labels.
        XVal: Validation features (already scaled).
        yVal: Validation labels.

    Returns:
        float: Mean Squared Error (MSE) for this model.

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor


In [64]:
def evaluateModel(
    name,
    model,
    XTrain,
    yTrain,
    XVal,
    yVal,
    showScatter=True,
    showResiduals=True,
    showLinePlot=True
):
    """Fit a regressor, score it on the validation data, and optionally plot diagnostics.

    Args:
        name: Label used in the plot titles.
        model: Estimator with ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place on the training data.
        XTrain: Training features.
        yTrain: Training labels.
        XVal: Validation features.
        yVal: Validation labels.
        showScatter: Show actual-vs-predicted markers with a dashed y = x line.
        showResiduals: Show residuals (actual minus predicted) against predictions.
        showLinePlot: Show actual and predicted values by sample index.

    Returns:
        The mean squared error of the predictions on the validation data.
    """
    model.fit(XTrain, yTrain)
    yPred = model.predict(XVal)
    mse = mean_squared_error(yVal, yPred)

    if showScatter:
        predictedTrace = go.Scatter(
            x=yVal, y=yPred,
            mode='markers',
            marker=dict(color='blue', opacity=0.6),
            name='Predicted'
        )
        # Reference line: a perfect model would put every marker on it.
        diagonalTrace = go.Scatter(
            x=yVal, y=yVal,
            mode='lines',
            line=dict(color='red', dash='dash'),
            name='Diagonal Fit'
        )
        scatterFig = go.Figure()
        scatterFig.add_trace(predictedTrace)
        scatterFig.add_trace(diagonalTrace)
        scatterFig.update_layout(
            title=f"{name} — Scatter Plot (MSE: {mse:.3f})",
            xaxis_title="Actual",
            yaxis_title="Predicted",
            width=700, height=500
        )
        scatterFig.show()

    if showResiduals:
        residuals = yVal - yPred
        residualFig = px.scatter(
            x=yPred,
            y=residuals,
            labels={'x': 'Predicted', 'y': 'Residuals'},
            title=f"{name} — Residual Plot"
        )
        residualFig.add_hline(y=0, line_dash='dash', line_color='red')
        residualFig.show()

    if showLinePlot:
        lineFig = go.Figure()
        lineFig.add_trace(go.Scatter(y=yVal, name='Actual'))
        lineFig.add_trace(go.Scatter(y=yPred, name='Predicted'))
        lineFig.update_layout(
            title=f"{name} — Index-wise Prediction",
            xaxis_title="Sample Index",
            yaxis_title="Score"
        )
        lineFig.show()

    return mse


Models to test

In [60]:
# Settings shared by the three tree ensembles; the fixed seed keeps runs repeatable.
ensembleSettings = dict(n_estimators=100, random_state=42)

# Candidate regressors, in the order they are evaluated and reported.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("3NN", KNeighborsRegressor(n_neighbors=3)),
    ("4NN", KNeighborsRegressor(n_neighbors=4)),
    ("5NN", KNeighborsRegressor(n_neighbors=5)),
    ("SVR", SVR()),
    ("Random Forest", RandomForestRegressor(**ensembleSettings)),
    ("Extra Trees", ExtraTreesRegressor(**ensembleSettings)),
    ("Gradient Boosting", GradientBoostingRegressor(**ensembleSettings)),
])

In [65]:
# Hold out 20% of the labelled data for validation (fixed seed for repeatability).
XTrain, XVal, yTrain, yVal = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits, so
# no validation statistics leak into the scaling.
scaler = StandardScaler()
scaler.fit(XTrain)
XTrainScaled = scaler.transform(XTrain)
XValScaled = scaler.transform(XVal)

# Validation MSE per candidate, keyed by model name.
modelMSEs = {}
for name, model in models.items():
    print(f"🔍 Evaluating: {name}")
    mse = evaluateModel(name, model, XTrainScaled, yTrain, XValScaled, yVal)
    modelMSEs[name] = mse


🔍 Evaluating: Linear Regression


🔍 Evaluating: 3NN


🔍 Evaluating: 4NN


🔍 Evaluating: 5NN


🔍 Evaluating: SVR


🔍 Evaluating: Random Forest


🔍 Evaluating: Extra Trees


🔍 Evaluating: Gradient Boosting


Since 5NN and Extra Trees seem to fit best, retrain the chosen model (Extra Trees in the cell below) on the full training data and predict scores for the test audio files.


In [85]:
def predictTestAudio(model, scaler, featureFunc, trainPath, trainCSV, testPath, testCSV, outputCSV):
    """Train on the full training set, predict the test set, and save a CSV.

    Args:
        model: Estimator with ``fit(X, y)`` and ``predict(X)``; fitted in place.
        scaler: Object with ``fit_transform(X)`` and ``transform(X)``. It is
            re-fitted here on the full training features.
        featureFunc: Callable mapping an audio file path to a 1-D feature array.
        trainPath: Directory containing the training audio files.
        trainCSV: CSV file with ``filename`` and ``label`` columns.
        testPath: Directory containing the test audio files.
        testCSV: CSV file with a ``filename`` column.
        outputCSV: Destination path for the predictions.

    Returns:
        None. Writes a CSV with ``filename`` and ``label`` columns to
        ``outputCSV`` and prints the predictions.
    """
    dfTrain = duckdb.sql(f"SELECT * FROM read_csv_auto('{trainCSV}')").df()
    yFull = dfTrain['label'].values
    XFull = np.array([featureFunc(os.path.join(trainPath, f)) for f in dfTrain['filename']])

    XFullScaled = scaler.fit_transform(XFull)
    model.fit(XFullScaled, yFull)

    # Read the test CSV once and reuse the frame, so the filenames that were
    # featurised and the filenames written out come from the same read.
    dfTest = duckdb.sql(f"SELECT * FROM read_csv_auto('{testCSV}')").df()
    testFiles = dfTest['filename'].tolist()

    XTest = np.array([featureFunc(os.path.join(testPath, f)) for f in testFiles])
    # Reuse the statistics fitted on the training data.
    XTestScaled = scaler.transform(XTest)
    yPredTest = model.predict(XTestScaled)

    # Keep only the filename column so the output schema stays filename,label.
    resultDF = dfTest[['filename']].copy()
    resultDF['label'] = yPredTest

    resultDF.to_csv(outputCSV, index=False)
    print(resultDF)
    print(f"Predictions saved to {outputCSV}")


Generate CSV with predicted labels

In [96]:
# Final run: retrain Extra Trees on all labelled data and write the test predictions.
predictTestAudio(
    model=ExtraTreesRegressor(n_estimators=100, random_state=42),
    scaler=StandardScaler(),
    featureFunc=extractFeatures,
    trainPath=trainAudios,
    trainCSV=trainCSV,
    testPath=testAudios,
    testCSV=testCSV,
    outputCSV=resultCSV,
)


           filename  label
0     audio_706.wav  3.125
1     audio_800.wav  3.155
2      audio_68.wav  3.650
3    audio_1267.wav  3.695
4     audio_683.wav  3.550
..              ...    ...
190   audio_135.wav  4.250
191   audio_512.wav  4.250
192   audio_529.wav  4.245
193   audio_762.wav  4.250
194   audio_379.wav  3.490

[195 rows x 2 columns]
Predictions saved to result.csv
