In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the input tree and build the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
# Print one path per line, in the order os.walk yields them.
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-intern-hiring-assessment/Dataset/sample_submission.csv
/kaggle/input/shl-intern-hiring-assessment/Dataset/train.csv
/kaggle/input/shl-intern-hiring-assessment/Dataset/test.csv
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_885.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_1142.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_1006.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_817.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_765.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_508.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_257.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_330.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_72.wav
/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test/audio_328.wav
/kaggle/input/shl-intern-hiring-ass

In [2]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [3]:
# 1. Dataset locations
DATA_PATH = '/kaggle/input/shl-intern-hiring-assessment/Dataset'
# Training and test clips sit in sibling folders under <DATA_PATH>/audios.
TRAIN_AUDIO_PATH, TEST_AUDIO_PATH = (
    os.path.join(DATA_PATH, 'audios', split) for split in ('train', 'test')
)

# 2. Read the metadata tables (train.csv, then test.csv)
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)


In [4]:
# 3. Feature extraction function
def extract_features_with_librosa(file_path, n_mfcc=13):
    """Summarise an audio file as a fixed-length vector of mean MFCCs.

    The file is loaded at its native sampling rate (``sr=None``), ``n_mfcc``
    MFCCs are computed, and each coefficient is averaged over all frames.

    Parameters
    ----------
    file_path : str or path-like
        Location of the audio file to read.
    n_mfcc : int, default 13
        Number of MFCCs to compute. This is also the length of the returned
        vector, including the all-zero fallback, so the two cannot drift
        apart.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding the per-coefficient mean.
        If anything goes wrong while loading or processing the file, the
        error is printed and a zero vector of the same length is returned,
        so callers always get a row of consistent width.
    """
    try:
        # `signal` / `sample_rate` rather than `y` / `sr`: the notebook also
        # uses a global `y` for the label array, and the clash is confusing.
        signal, sample_rate = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Transpose so frames run along axis 0, then average them away.
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the pipeline
        # running with a placeholder row instead of aborting the whole run.
        print(f"Error processing {file_path}: {e}")
        return np.zeros(n_mfcc)

In [5]:
# 4. Build the training matrix and target vector
# One feature vector per row of train.csv, in file order.
X = np.array([
    extract_features_with_librosa(os.path.join(TRAIN_AUDIO_PATH, clip_name))
    for clip_name in train_df['filename']
])
# Targets come from the 'label' column, aligned row-for-row with X.
y = np.array(train_df['label'].tolist())

In [6]:
# 5. Fit a random-forest regressor on the training features
model = RandomForestRegressor(
    random_state=42,  # fixed seed for the forest's randomness
    n_estimators=100,
)
model.fit(X=X, y=y)


In [7]:
# 6. Build the feature matrix for the test clips
# Every filename listed in test.csv is kept, in file order (no filtering).
valid_filenames = test_df['filename'].tolist()
test_features = [
    extract_features_with_librosa(os.path.join(TEST_AUDIO_PATH, clip_name))
    for clip_name in valid_filenames
]

X_test = np.array(test_features)

In [8]:
# 7. Score every test clip (grammar score) with the fitted model
predictions = model.predict(X=X_test)


In [9]:
# 8. Assemble the submission table and write it to disk
# Columns are 'filename' then 'label'; the row index is not written out.
submission = pd.DataFrame(dict(filename=valid_filenames, label=predictions))

submission.to_csv('submission.csv', index=False)


In [10]:
# 9. Preview the first five rows of the submission
submission.head(n=5)

Unnamed: 0,filename,label
0,audio_804.wav,2.27
1,audio_1028.wav,3.67
2,audio_865.wav,3.685
3,audio_774.wav,2.845
4,audio_1138.wav,3.815


In [11]:
# Read the submission back from disk
submission_df = pd.read_csv('submission.csv')

# Report how many rows it contains
print("Number of entries in submission.csv:", submission_df.shape[0])

Number of entries in submission.csv: 204
