<a href="https://colab.research.google.com/github/jeslin09/Grammar-Scoring/blob/main/Hybrid_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in to Kaggle first: per the note at the top of this cell, some data
# sources are private and must be unlocked before they can be imported.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'shl-intern-hiring-assessment' competition data through kagglehub.
# The return value is kept in a module-level variable (presumably the local
# download location — confirm against the kagglehub documentation).
# NOTE(review): the later cells visible in this notebook do not reference this
# variable; they read from hard-coded /kaggle/input/... paths instead.
shl_intern_hiring_assessment_path = kagglehub.competition_download('shl-intern-hiring-assessment')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-intern-hiring-assessment/dataset/sample_submission.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_885.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_698.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1176.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1215.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_66.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_386.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1026.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_330.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_72.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_858.wav
/kaggle/input/shl-intern-hiring-ass

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa

# Use a small Whisper model for efficient transcription
model_id = "openai/whisper-small"
# The processor and the seq2seq model are loaded from the same checkpoint id;
# the processor supplies the tokenizer and feature extractor handed to the
# pipeline below.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

# Create a pipeline for transcription
# `transcriber` is a module-level object: transcribe_audio() calls it directly.
# NOTE(review): no language is pinned here. The warning captured in this
# notebook's output says a multilingual Whisper then defaults to language
# detection followed by transcription — confirm that is what this dataset needs.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,  # cap on generated tokens (presumably per chunk — confirm)
    chunk_length_s=30,  # chunk length in seconds for long audio
    batch_size=8,
    # Run on the GPU when CUDA is available, otherwise fall back to the CPU.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

def transcribe_audio(audio_path, asr_pipeline=None):
    """Transcribe a single audio file to text.

    Args:
        audio_path: Path of the audio file to load.
        asr_pipeline: Optional callable applied to the loaded waveform; it must
            return a mapping that has a "text" entry. When omitted, the
            module-level ``transcriber`` pipeline is used, so existing
            one-argument calls behave exactly as before. Accepting it
            positionally also makes two-argument calls such as
            ``transcribe_audio(path, transcriber)`` valid instead of raising
            ``TypeError``.

    Returns:
        The value stored under "text" in the pipeline result.
    """
    # Load at a 16 kHz sampling rate; the returned sample rate is not needed.
    speech_array, _ = librosa.load(audio_path, sr=16000)

    # Fall back to the shared pipeline only when the caller did not supply one.
    active_pipeline = transcriber if asr_pipeline is None else asr_pipeline
    result = active_pipeline(speech_array)
    return result["text"]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Device set to use cuda


In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
def prepare_training_data(csv_path, audio_folder, transcriber, label_column="label"):
    """Transcribe every training clip and collect its target score.

    Args:
        csv_path: CSV file with a 'filename' column and, optionally, a score
            column named by ``label_column``.
        audio_folder: Directory that contains the audio files named in the CSV.
        transcriber: Kept for interface compatibility. Transcription goes
            through ``transcribe_audio(path)``, which uses the module-level
            pipeline.
        label_column: Name of the column holding the target score. When the CSV
            has this column its values are used as the training targets; when
            it does not, the previous placeholder score of 1.0 is used.

    Returns:
        A ``(transcriptions, scores)`` pair of equal-length lists, in CSV row
        order. A row that fails to process contributes an empty transcription
        and a score of 0.0 (the error is printed, not raised).
    """
    df = pd.read_csv(csv_path)
    # Without real labels the model would be fitted to a constant target, so
    # prefer the CSV's own score column whenever it exists.
    has_labels = label_column in df.columns

    transcriptions = []
    scores = []

    for _, row in df.iterrows():
        audio_file = os.path.join(audio_folder, row['filename'])
        try:
            transcription = transcribe_audio(audio_file)
            score = float(row[label_column]) if has_labels else 1.0
        except Exception as e:
            # Best-effort: keep the row so outputs stay aligned with the CSV.
            print(f"Error processing {audio_file}: {e}")
            transcriptions.append("")
            scores.append(0.0)
            continue

        transcriptions.append(transcription)
        scores.append(score)

    return transcriptions, scores
def train_simple_grammar_model(transcriptions, scores):
    """Fit a TF-IDF + Ridge regressor on transcriptions and save it to disk.

    Args:
        transcriptions: Either a dict mapping a key (e.g. filename) to its
            transcription, or a sequence of transcriptions.
        scores: Target scores. A dict with the same keys when
            ``transcriptions`` is a dict; otherwise a sequence aligned
            position-by-position with ``transcriptions`` (the form returned by
            ``prepare_training_data``).

    Returns:
        A ``(model_path, vectorizer_path)`` tuple with the two joblib file
        names written to the current directory.

    Raises:
        ValueError: If sequence inputs have different lengths.
    """
    if isinstance(transcriptions, dict):
        # Keyed inputs: align scores to transcriptions through the shared keys.
        filenames = list(transcriptions.keys())
        text_list = [transcriptions[f] for f in filenames]
        score_list = [scores[f] for f in filenames]
    else:
        # Parallel sequences: already aligned by position.
        text_list = list(transcriptions)
        score_list = list(scores)
        if len(text_list) != len(score_list):
            raise ValueError(
                f"transcriptions and scores differ in length: "
                f"{len(text_list)} vs {len(score_list)}"
            )

    # Hold out 20% for validation; fixed seed keeps the split reproducible.
    train_texts, val_texts, train_scores, val_scores = train_test_split(
        text_list, score_list, test_size=0.2, random_state=42
    )

    vectorizer = TfidfVectorizer(
        max_features=5000,  # Limit features for memory
        ngram_range=(1, 2),  # Use unigrams and bigrams
        min_df=2  # Minimum document frequency
    )

    # Fit the vocabulary on the training split only, then reuse it for validation.
    X_train = vectorizer.fit_transform(train_texts)
    X_val = vectorizer.transform(val_texts)

    model = Ridge(alpha=1.0)
    model.fit(X_train, train_scores)

    # Convert explicitly so the subtraction never depends on list/array coercion.
    val_predictions = np.asarray(model.predict(X_val), dtype=float)
    val_targets = np.asarray(val_scores, dtype=float)
    val_mse = np.mean((val_predictions - val_targets) ** 2)
    print(f"Validation MSE: {val_mse}")

    model_path = "grammar_ridge_model.joblib"
    vectorizer_path = "grammar_tfidf_vectorizer.joblib"
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)

    return model_path, vectorizer_path

def predict_with_simple_model(model_path, vectorizer_path, test_csv_path, test_audio_folder, transcriber):
    """Score every file listed in the test CSV and build a submission frame.

    Args:
        model_path: Path of the joblib-saved regression model.
        vectorizer_path: Path of the joblib-saved TF-IDF vectorizer.
        test_csv_path: CSV file with a 'filename' column.
        test_audio_folder: Directory containing the test audio files.
        transcriber: Kept for interface compatibility. Transcription goes
            through ``transcribe_audio(path)``, the one-argument form used
            elsewhere in this notebook.

    Returns:
        A DataFrame with columns 'filename' and 'label', one row per CSV row
        and in the same order. Files that are missing on disk or that produce
        an empty transcription receive the default score 0.0 so the submission
        never has fewer rows than the test CSV.
    """
    test_df = pd.read_csv(test_csv_path)

    # NOTE: joblib.load unpickles the file; only load artifacts you produced.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    default_score = 0.0
    filenames = []
    predictions = []

    for position, filename in enumerate(test_df['filename']):  # Adjust column name if different
        audio_path = os.path.join(test_audio_folder, filename)
        filenames.append(filename)

        if not os.path.exists(audio_path):
            # Keep the row: a silently dropped file would make the submission incomplete.
            print(f"Warning: Missing audio file for {filename}")
            predictions.append(default_score)
            continue

        transcription = transcribe_audio(audio_path)

        # Whitespace-only text carries no features, so treat it like an empty result.
        if not transcription or not transcription.strip():
            print(f"Warning: Empty transcription for {filename}")
            predictions.append(default_score)
            continue

        X = vectorizer.transform([transcription])
        predictions.append(float(model.predict(X)[0]))

        # Optional: Print progress
        if position % 10 == 0:
            print(f"Predicted {position} files")

    return pd.DataFrame({
        'filename': filenames,
        'label': predictions
    })

def main():
    """Run the full pipeline: transcribe, train, predict, write the submission.

    Uses the module-level ``transcriber`` pipeline and writes
    "grammar_score_submission.csv" to the current directory.
    """
    # Setup paths
    # NOTE(review): these are Kaggle-mount paths; confirm they exist when the
    # notebook is run elsewhere (e.g. via the Colab badge).
    train_csv_path = "/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv"
    train_audio_folder = "/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train"
    test_csv_path = "/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv"
    test_audio_folder = "/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test"

    # Prepare training data
    print("Transcribing training audio...")
    transcriptions, scores = prepare_training_data(train_csv_path, train_audio_folder, transcriber)

    # prepare_training_data returns parallel lists, while the trainer looks its
    # inputs up by key; index-keyed mappings satisfy that without losing order.
    texts_by_index = dict(enumerate(transcriptions))
    scores_by_index = dict(enumerate(scores))

    # Train model (returns both the model path and the vectorizer path)
    print("Training grammar model...")
    model_path, vectorizer_path = train_simple_grammar_model(texts_by_index, scores_by_index)

    print("Generating predictions for test data...")
    submission_df = predict_with_simple_model(
        model_path, vectorizer_path, test_csv_path, test_audio_folder, transcriber
    )

    # Save submission
    submission_df.to_csv("grammar_score_submission.csv", index=False)

# Run the whole pipeline when this cell (or the exported script) is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

Transcribing training audio...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
def plot_test_predictions(predictions, title="Predicted Grammar Scores on Test Set"):
    """Display a 30-bin histogram of predicted scores.

    Args:
        predictions: Sequence of predicted score values to bin.
        title: Text shown as the figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # NOTE(review): no `matplotlib.pyplot as plt` import is visible in this part
    # of the notebook — confirm it is imported before this function is called.
    _fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(predictions, bins=30, color='lightgreen', edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel("Predicted Score")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    plt.show()