In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load 'submission.csv' from the current working directory and show it as the
# cell output (the bare expression on the last line triggers the display).
# NOTE(review): no visible code writes 'submission.csv' -- the pipeline below
# saves 'submission_whisper_xgb.csv' -- so confirm this file exists before
# running this cell.
df_submission = pd.read_csv('submission.csv')
df_submission

In [None]:
!pip install openai-whisper


In [None]:
# 1. Imports
import os
import numpy as np
import pandas as pd
import librosa
import whisper
import xgboost as xgb
from scipy.stats import pearsonr, skew
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# 2. Paths
# Single source of truth for the dataset location; every path below is derived
# from it, so relocating the dataset is a one-line change.
DATA_DIR = "/kaggle/input/shl-intern-hiring-assessment/dataset"

# Tabular inputs: training labels, test file list, and the submission template.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# Audio directories. The trailing empty component keeps the trailing "/" that
# the original hard-coded strings had.
train_path = os.path.join(DATA_DIR, "audios_train", "")
test_path = os.path.join(DATA_DIR, "audios_test", "")

# 3. Whisper Embeddings + MFCC Extraction
def extract_features(df, audio_path, model):
    """Build one feature vector per audio file listed in ``df``.

    Each vector concatenates per-coefficient MFCC statistics (mean, standard
    deviation and skewness of 13 MFCCs, i.e. 39 values) with the time-averaged
    output of the Whisper encoder.

    Args:
        df: DataFrame with a ``filename`` column naming files inside
            ``audio_path``.
        audio_path: Directory that contains the audio files.
        model: Loaded Whisper model; its ``encoder``, ``device`` and
            ``dims.n_mels`` attributes are used.

    Returns:
        ``np.ndarray`` built from the per-file vectors, one row per file in
        the order of ``df['filename']``.
    """
    # Imported locally so the function does not rely on a later statement in
    # the file having imported torch before the first call.
    import torch

    sample_rate = 16000  # 16 kHz, the rate whisper.load_audio resamples to by default
    n_mfcc = 13

    features = []
    for fname in tqdm(df['filename']):
        path = os.path.join(audio_path, fname)
        y, sr = librosa.load(path, sr=sample_rate)

        # MFCC stats, one value per coefficient (axis=1 reduces over frames).
        # skew() yields NaN for a constant coefficient row; the calling code
        # zeroes NaN/Inf entries after extraction.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_std = np.std(mfcc, axis=1)
        mfcc_skew = skew(mfcc, axis=1)

        # Whisper embeddings (encoder output of the supplied model).
        # NOTE: pad_or_trim pads or cuts the clip to Whisper's fixed 30-second
        # window, so longer recordings are truncated and the average below
        # includes any padded frames.
        audio = whisper.load_audio(path)
        audio = whisper.pad_or_trim(audio)
        # Use the model's own mel-bin count instead of the library default so
        # checkpoints with a different n_mels also work.
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
        with torch.no_grad():
            embedding = model.encoder(mel.unsqueeze(0))  # shape: (1, frames, dim)
            whisper_feat = embedding[0].mean(dim=0).cpu().numpy()

        # Combine acoustic statistics and the pooled embedding into one vector
        feature_vector = np.hstack([mfcc_mean, mfcc_std, mfcc_skew, whisper_feat])
        features.append(feature_vector)

    return np.array(features)

# 4. Load Whisper Model
import torch

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
if gpu_available:
    device = "cuda"
else:
    device = "cpu"

# "base" checkpoint; "small" or "medium" are drop-in alternatives to try.
whisper_model = whisper.load_model("base", device=device)

# 5. Extract Features
# Training features and targets first, then the test features, all with the
# same Whisper model.
X_train = extract_features(df=train_df, audio_path=train_path, model=whisper_model)
y_train = train_df["label"].values
X_test = extract_features(df=test_df, audio_path=test_path, model=whisper_model)

# 6. Handle NaN/Inf
# Overwrite every non-finite entry (NaN, +Inf, -Inf) with 0, in place.
for feature_matrix in (X_train, X_test):
    non_finite = ~np.isfinite(feature_matrix)
    feature_matrix[non_finite] = 0

# 7. XGBoost Regressor
# Hyperparameters collected in one place, then unpacked into the estimator.
xgb_params = {
    "n_estimators": 200,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the full training feature matrix.
xgb_model.fit(X_train, y_train)

# 8. Evaluation
# In-sample metrics only: predictions are made on the same rows the model was
# fit on, so these numbers describe fit to the training set.
y_pred_train = xgb_model.predict(X_train)

pearson_result = pearsonr(y_train, y_pred_train)
pearson_corr = pearson_result[0]  # first element is the correlation coefficient

train_mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(train_mse)

print(f"✅ Pearson Correlation (Train): {pearson_corr:.4f}")
print(f"✅ RMSE (Train): {rmse:.4f}")

# 9. Visualizations
# Scatter of predicted against actual training scores.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_train, y=y_pred_train, ax=scatter_ax)
scatter_ax.set_xlabel("Actual Grammar Score")
scatter_ax.set_ylabel("Predicted Score")
scatter_ax.set_title("Predicted vs Actual (Train)")
scatter_ax.grid(True)
plt.show()

# Histogram (with KDE overlay) of the training residuals, actual minus predicted.
residuals = y_train - y_pred_train
resid_fig, resid_ax = plt.subplots(figsize=(8, 4))
sns.histplot(residuals, kde=True, ax=resid_ax)
resid_ax.set_title("Residuals Distribution")
resid_ax.set_xlabel("Residuals")
resid_ax.grid(True)
plt.show()

# 10. Test Prediction
# Predict on the test features and write them into the submission template's
# "label" column.
# NOTE(review): assumes sample_submission rows are in the same order as
# test_df -- confirm against the dataset.
test_preds = xgb_model.predict(X_test)
sample_submission["label"] = test_preds

# Persist without the index column.
sample_submission.to_csv(path_or_buf="submission_whisper_xgb.csv", index=False)
print("📁 Submission saved as submission_whisper_xgb.csv")


Grammar Scoring from Audio using Whisper and XGBoost
🔍 Overview
This project aims to predict the grammar quality score of spoken English using audio recordings. The core idea is to extract meaningful features from audio—both low-level acoustic characteristics and high-level semantic content—and use a machine learning model to learn the relationship between these features and the grammar score.

📁 Dataset Summary
The dataset includes:

A training set with audio file names and corresponding grammar scores.

A test set with only audio file names (no labels).

Audio folders containing the actual recordings.

A submission format file showing how to structure predictions.

Each audio file represents a short clip of a person speaking. The grammar score (target) reflects the quality of spoken grammar, likely rated by human annotators or derived through linguistic analysis.

🎧 Feature Engineering
To build an effective model, we extracted two types of features from each audio file:

1. MFCC (Mel-Frequency Cepstral Coefficients)
MFCCs are standard acoustic features used in speech processing. For each audio:

We calculated the mean, standard deviation, and skewness of 13 MFCC coefficients.

These capture how energy and frequency components behave across time in the voice.

MFCCs reflect the pronunciation, rhythm, and articulation, indirectly signaling grammar proficiency.

2. Whisper Embeddings (from OpenAI Whisper model)
We used a pretrained Whisper model from OpenAI to extract deeper, semantic-level features:

The Whisper model listens to the audio and produces a numerical representation (embedding) of what it "understands" from it.

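These embeddings are learned from speech-recognition training, so they can carry cues related to word usage, sentence structure, clarity, and fluency, although they are not explicit linguistic features.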

We averaged the encoder outputs over time (after padding or trimming each clip to Whisper's fixed 30-second window) to create a fixed-size vector representing the entire audio's content and style.

Combining MFCC and Whisper features gave us a rich representation of both how something was said and what was said.

⚙️ Model Selection: XGBoost Regressor
We trained an XGBoost regression model to map the extracted features to grammar scores.

Why XGBoost?

It performs well with structured, high-dimensional features.

It handles non-linear relationships effectively.

It's fast to train, allows for regularization, and is robust to noise.

The model was trained on the training data and then evaluated to check how well it fits.

📊 Evaluation Results
To measure how good our model was, we used two key metrics:

Pearson Correlation
Tells us how well the predicted scores correlate with actual grammar scores.

A value close to 1 means our predictions follow the true trends very well.

RMSE (Root Mean Squared Error)
Shows the average prediction error.

Lower values indicate more accurate predictions.

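The model showed a high Pearson correlation and a low RMSE on the training data. Because these metrics are computed on the same data the model was fit to, they show that the model can fit the training set but do not by themselves measure how well it generalizes; a held-out validation split or cross-validation is needed to estimate performance on unseen audio.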

📈 Visual Insights
Predicted vs Actual Plot
A scatter plot of predicted vs actual scores showed that most points fell close to the diagonal line, indicating accurate predictions.

Residual Plot
The residuals (differences between actual and predicted scores) on the training data were approximately normally distributed around zero, which is a good sign. It suggests that the model does not systematically overestimate or underestimate scores on the data it was trained on.

📦 Final Submission
The model was applied to the test set, and the predictions were saved in the required format for Kaggle submission. This step completes the pipeline from raw audio to automated grammar score prediction.
