# **SHL Grammar Rating Assignment**


In [None]:
!pip install nbstripout

In [None]:
!nbstripout SHL_Grammar_Rating.ipynb

Mounting Google Drive to access the dataset.

In [None]:
# Colab-only step: mount Google Drive so that the paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

## Preprocessing the training audio files

Importing the necessary packages, preprocessing the training audio, and saving the processed files in a new folder.

In [None]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm.notebook import tqdm

# Paths
AUDIO_DIR = '/content/drive/MyDrive/Colab Notebooks/audios/train'
CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/train.csv'
PROCESSED_DIR = '/content/drive/MyDrive/Colab Notebooks/processed_audio'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Read the label file through CSV_PATH so its location is defined in exactly one place.
train_df = pd.read_csv(CSV_PATH)
# Assigning two names requires the CSV to contain exactly two columns.
train_df.columns = ['filename', 'label']

def preprocess_audio(file_path, save_path, sr=16000):
    """Resample, peak-normalise and silence-trim one audio file, then save it.

    Args:
        file_path: Path of the audio file to read.
        save_path: Path the processed audio is written to.
        sr: Target sample rate (Hz) of the saved file.
    """
    y, orig_sr = librosa.load(file_path, sr=None)
    if orig_sr != sr:
        # Keyword arguments are mandatory in librosa >= 0.10 and also accepted by older releases.
        y = librosa.resample(y, orig_sr=orig_sr, target_sr=sr)
    # Vectorised peak lookup; the builtin max() would iterate sample by sample in Python.
    peak = abs(y).max() if y.size else 0.0
    if peak > 0:
        # Silent clips are left un-normalised instead of dividing by zero.
        y = y / peak
    y, _ = librosa.effects.trim(y, top_db=25)  # Silence trimming
    sf.write(save_path, y, sr)

# Run the preprocessing over every training clip listed in the CSV;
# each output keeps the input's file name inside PROCESSED_DIR.
for filename in tqdm(train_df['filename']):
    preprocess_audio(
        os.path.join(AUDIO_DIR, filename),
        os.path.join(PROCESSED_DIR, filename),
    )

print("✅ Audio preprocessing completed. Files saved in:", PROCESSED_DIR)

Verifying the Preprocessed audio files.

In [None]:
# Folder written by the preprocessing cell
processed_dir = '/content/drive/MyDrive/Colab Notebooks/processed_audio'
files = os.listdir(processed_dir)
print(f"🔎 Found {len(files)} preprocessed audio files.\nExample files:\n", files[:5])

if files:
    # Checking sample rate and duration of the first listed file
    sample_file = os.path.join(processed_dir, files[0])
    y, sr = librosa.load(sample_file, sr=None)

    duration = librosa.get_duration(y=y, sr=sr)
    print(f"📁 Sample file: {files[0]}")
    print(f"🕒 Duration: {duration:.2f} seconds")
    print(f"🎧 Sample rate: {sr} Hz")
else:
    # Nothing to inspect; avoids an IndexError on files[0].
    print("⚠️ No preprocessed audio files found in:", processed_dir)

Importing PyTorch and checking whether a GPU is available.

In [None]:
import torch

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Using device: {device}")

Installing OpenAI's Whisper package for transcribing the audio files.

In [None]:
!pip install git+https://github.com/openai/whisper.git

Transcribing the audio files and exporting a CSV file of transcripts.

In [None]:
# Reload the label file so this section can run without the preprocessing cell's dataframe.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
# Assigning two names requires the CSV to contain exactly two columns.
train_df.columns = ['filename', 'label']

In [None]:
import whisper
from tqdm.notebook import tqdm
import pandas as pd
import os


# Create the output folder *before* the long transcription loop; otherwise
# to_csv() fails at the very end and the transcription work is lost.
TRANSCRIPTS_CSV = '/content/drive/MyDrive/Colab Notebooks/temp_storage/train_with_transcripts.csv'
os.makedirs(os.path.dirname(TRANSCRIPTS_CSV), exist_ok=True)

# Loading Whisper ASR model
model = whisper.load_model("base")

# Transcribing and collecting text (one transcript per row, in dataframe order)
transcripts = []

for fname in tqdm(train_df['filename']):
    audio_path = os.path.join('/content/drive/MyDrive/Colab Notebooks/processed_audio', fname)
    result = model.transcribe(audio_path, language='en')
    transcripts.append(result['text'])

# Adding transcripts to dataframe
train_df['transcript'] = transcripts

# Saving updated CSV
train_df.to_csv(TRANSCRIPTS_CSV, index=False)
print(f"✅ Transcriptions saved to: {TRANSCRIPTS_CSV}")


Verifying the Transcripts CSV file.

In [None]:
import pandas as pd

# Reload the saved transcripts and print a short summary of the file
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_with_transcripts.csv')
print("🧾 Columns:", list(df.columns))
print("✅ Total records:", df.shape[0])
print("🗣 Sample transcript:\n")
preview_columns = ['filename', 'label', 'transcript']
print(df[preview_columns].head(3))

Checking whether any audio file produced an empty transcript.

In [None]:
# read_csv loads empty cells as NaN, and NaN never compares equal to '',
# so missing values are mapped to '' before the whitespace-only check.
empty_transcripts = df['transcript'].fillna('').astype(str).str.strip().eq('').sum()
print(f"⚠️ Empty transcripts found: {empty_transcripts}")

Importing the regular expression package to remove filler words present in the audio transcripts, such as "uh", "like", "you know", etc.

In [None]:
import re

# Filler words/phrases that are stripped from transcripts
FILLERS = ['uh', 'um', 'erm', 'you know', 'like', 'i mean', 'hmm', 'ah', 'uhh', 'huh', 'duh', 'ohh', 'oh']

def clean_transcript(text):
    """Lower-case a transcript, drop filler words and tidy the spacing.

    Args:
        text: Transcript string to clean.

    Returns:
        The cleaned, stripped string.
    """
    # Whole-word match on any entry of FILLERS
    filler_pattern = r'\b(?:' + '|'.join(FILLERS) + r')\b'
    lowered = text.lower()
    without_fillers = re.sub(filler_pattern, '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_fillers)
    # Pull punctuation back against the preceding word
    tightened = re.sub(r'\s([?.!,"])', r'\1', single_spaced)
    return tightened.strip()

# Loading previous data
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_with_transcripts.csv')

# Cleaning all transcripts.
# Empty transcripts are loaded as NaN; without fillna('') the astype(str) cast
# would turn them into the literal text "nan" and hide that they are empty.
df['cleaned_transcript'] = df['transcript'].fillna('').astype(str).apply(clean_transcript)

# Saving new version
df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_cleaned.csv', index=False)
print("✅ Cleaned transcripts saved to: /content/drive/MyDrive/Colab Notebooks/temp_storage/train_cleaned.csv")


Sample Output for Cleaned Transcript.

In [None]:
# Fixed seed so the same three example rows are shown on every run.
print(df[['transcript', 'cleaned_transcript']].sample(3, random_state=42))

## Feature Extraction on Training Data

Grammar Feature Extraction for training data

In [None]:
import language_tool_python
import spacy
from tqdm.notebook import tqdm

# Grammar checker and NLP parser
tool = language_tool_python.LanguageTool('en-US')
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_cleaned.csv')

# One entry per transcript, in dataframe order
error_counts = []
avg_sent_lengths = []
pos_diversities = []

for text in tqdm(df['cleaned_transcript']):
    has_text = isinstance(text, str) and not pd.isna(text) and text.strip() != ""

    if has_text:
        # Number of issues reported by LanguageTool
        n_errors = len(tool.check(text))

        doc = nlp(text)
        sent_lengths = [len(sent) for sent in doc.sents]
        # Distinct part-of-speech tags, ignoring whitespace tokens
        distinct_pos = {token.pos_ for token in doc if token.pos_ != 'SPACE'}

        mean_sent_length = sum(sent_lengths) / len(sent_lengths) if sent_lengths else 0
        n_pos = len(distinct_pos)
    else:
        # Missing or blank transcript: all three features default to zero
        n_errors, mean_sent_length, n_pos = 0, 0, 0

    error_counts.append(n_errors)
    avg_sent_lengths.append(mean_sent_length)
    pos_diversities.append(n_pos)

# Appending new features
df['grammar_errors'] = error_counts
df['avg_sentence_length'] = avg_sent_lengths
df['pos_diversity'] = pos_diversities

# Saving the new file
df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_features.csv', index=False)
print("✅ Grammar features saved to: /content/drive/MyDrive/Colab Notebooks/temp_storage/train_features.csv")

Adding more features to dataset

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_features.csv')

# Empty transcripts are loaded as NaN and str(NaN) is "nan", which would be
# counted as one word; treat missing text as an empty string instead.
df['word_count'] = df['cleaned_transcript'].fillna('').astype(str).apply(lambda x: len(x.split()))

# replace(0, 1) keeps the division defined for transcripts without words
df['grammar_errors_per_word'] = df['grammar_errors'] / df['word_count'].replace(0, 1)

df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_features_enhanced.csv', index=False)
print("Added word_count and grammar_errors_per_word as features")


Installing the happytransformer package, used to add grammar error correction (GEC) features.

In [None]:
!pip install happytransformer

Adding the GEC features to final ML model dataset.

In [None]:
from happytransformer import HappyTextToText, TTSettings
import pandas as pd
from tqdm.notebook import tqdm

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_cleaned.csv')
# fillna('') first: astype(str) alone would turn missing transcripts into the literal word "nan".
texts = df['cleaned_transcript'].fillna('').astype(str).tolist()

happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)

edit_counts = []
edit_ratios = []

for text in tqdm(texts):
    if not text.strip():
        # Same convention as the grammar-feature extraction cell:
        # blank transcripts get zero-valued features and skip the model call.
        edit_counts.append(0)
        edit_ratios.append(0.0)
        continue

    result = happy_tt.generate_text("grammar: " + text, args=args)
    corrected = result.text

    original_words = text.split()
    corrected_words = corrected.split()
    # Position-wise word mismatches plus the difference in length
    edits = sum(1 for o, c in zip(original_words, corrected_words) if o != c)
    edits += abs(len(original_words) - len(corrected_words))

    edit_counts.append(edits)
    edit_ratios.append(edits / max(1, len(original_words)))

# Adding new features to dataframe
df['gec_edits'] = edit_counts
df['gec_edit_rate'] = edit_ratios

# Saving the updated CSV file
df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_gec_features.csv', index=False)
print("GEC features saved to /content/drive/MyDrive/Colab Notebooks/temp_storage/train_gec_features.csv")


Snippet of the earlier features incorporated into dataset.

In [None]:
# Load the grammar-feature CSV and show it as the cell output for a quick look.
archis=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_features.csv')
archis

Combining the earlier engineered general features and the GEC features into the final dataset, ready for ML model training.

In [None]:
import pandas as pd

df_main = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_features_enhanced.csv')
df_gec = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_gec_features.csv')

# The GEC columns are copied by row position, so both files must list the
# same clips in the same order; fail loudly instead of mixing up rows.
if not df_main['filename'].equals(df_gec['filename']):
    raise ValueError("train_features_enhanced.csv and train_gec_features.csv are not row-aligned")

df_combined = df_main.copy()
df_combined['gec_edits'] = df_gec['gec_edits']
df_combined['gec_edit_rate'] = df_gec['gec_edit_rate']

df_combined.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_all_features.csv', index=False)
print("✅ Combined feature set saved.")

### ML Model Training

Applying various Machine Learning Algorithms.

In [None]:
import json
import os

# Look for notebook files in the working directory and try to parse the first one as JSON
notebook_name = [entry for entry in os.listdir('.') if entry.endswith('.ipynb')]

if not notebook_name:
    print("Error: Could not determine the name of the current notebook file.")
else:
    notebook_name = notebook_name[0]
    print(f"Attempting to load notebook: {notebook_name}")
    try:
        with open(notebook_name, 'r') as handle:
            notebook_data = json.load(handle)
        print("Notebook data loaded successfully.")
    except FileNotFoundError:
        print(f"Error: File '{notebook_name}' not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding notebook JSON: {e}")
        print("There might be an issue with the notebook file's structure.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/train_all_features.csv')

# Rows without a target cannot be used for supervised training
df = df.dropna(subset=['label'])

feature_cols = [
    'grammar_errors',
    'avg_sentence_length',
    'pos_diversity',
    'word_count',
    'grammar_errors_per_word',
    'gec_edits',
    'gec_edit_rate'
]
X = df[feature_cols]
y = df['label']

# Split first, then fit the scaler on the training rows only, so that the
# validation rows do not leak into the preprocessing statistics.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42)
}

# Fit every candidate on the training split and score it on the validation split
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    corr, _ = pearsonr(y_val, preds)
    results[name] = {'RMSE': rmse, 'Pearson': corr}
    print(f"{name} -> RMSE: {rmse:.4f}, Pearson: {corr:.4f}")


def _pearson_or_neg_inf(item):
    """Ranking key: pearsonr gives NaN for constant predictions; rank those last."""
    score = item[1]['Pearson']
    return -np.inf if np.isnan(score) else score


best_model = max(results.items(), key=_pearson_or_neg_inf)
print(f"\n✅ Best model: {best_model[0]} with Pearson correlation: {best_model[1]['Pearson']:.4f}")

The best model was GradientBoosting, with a Pearson correlation of 0.3167. Hence, GradientBoosting is tuned with a hyperparameter search to try to improve the Pearson correlation.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import numpy as np


param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinking
    'max_depth': [3, 5, 7],  # Depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'subsample': [0.8, 0.9, 1.0],  # Fraction of samples used for fitting each tree
    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider at each split
}

gb_model = GradientBoostingRegressor(random_state=42)

# 5-fold cross-validated search over the full grid, scored by negative MSE
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

# Keep the refitted winner under the name the later cells (prediction, export, plots) refer to.
best_gb = grid_search.best_estimator_

# Evaluate on the held-out validation split from the model-comparison cell.
y_pred = best_gb.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
pearson_corr, _ = pearsonr(y_val, y_pred)

print(f"Optimized GradientBoosting -> RMSE: {rmse:.4f}, Pearson: {pearson_corr:.4f}")


## Pre-Processing the test audio files

Preprocessing the test audio files, following the same procedure as for the training audio files.

In [None]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm import tqdm

# Locations of the raw test audio, the test file list, and the folder for processed test audio
TEST_AUDIO_DIR = '/content/drive/MyDrive/Colab Notebooks/audios/test'
TEST_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/test.csv'
TEST_PROCESSED_DIR = '/content/drive/MyDrive/Colab Notebooks/processed_audio_test'
os.makedirs(TEST_PROCESSED_DIR, exist_ok=True)

# The later cells read the 'filename' column of this dataframe.
test_df = pd.read_csv(TEST_CSV_PATH)

def preprocess_audio(file_path, save_path, sr=16000):
    """Resample, peak-normalise and silence-trim one audio file, then save it.

    Args:
        file_path: Path of the audio file to read.
        save_path: Path the processed audio is written to.
        sr: Target sample rate (Hz) of the saved file.
    """
    y, orig_sr = librosa.load(file_path, sr=None)
    if orig_sr != sr:
        # Keyword arguments are mandatory in librosa >= 0.10 and also accepted by older releases.
        y = librosa.resample(y, orig_sr=orig_sr, target_sr=sr)
    # Vectorised peak lookup; the builtin max() would iterate sample by sample in Python.
    peak = abs(y).max() if y.size else 0.0
    if peak > 0:
        # Silent clips are left un-normalised instead of dividing by zero.
        y = y / peak
    y, _ = librosa.effects.trim(y, top_db=25)  # Silence trimming
    sf.write(save_path, y, sr)

# Process every test clip; each output keeps the input's file name inside TEST_PROCESSED_DIR
for filename in tqdm(test_df['filename']):
    preprocess_audio(
        os.path.join(TEST_AUDIO_DIR, filename),
        os.path.join(TEST_PROCESSED_DIR, filename),
    )


Transcribing the test audio files.

In [None]:
import whisper

# Whisper "base" model used for the test-set transcription
model_whisper = whisper.load_model("base")

# One English transcript per processed test clip, in dataframe order
transcripts = [
    model_whisper.transcribe(os.path.join(TEST_PROCESSED_DIR, fname), language='en')['text']
    for fname in tqdm(test_df['filename'])
]

test_df['transcript'] = transcripts

Removing filler words from the transcripts of the test audio files.

In [None]:
import re

# Must stay identical to the filler list applied to the training transcripts,
# so that train and test text go through exactly the same cleaning.
FILLERS = ['uh', 'um', 'erm', 'you know', 'like', 'i mean', 'hmm', 'ah', 'uhh', 'huh', 'duh', 'ohh', 'oh']

def clean_transcript(text):
    """Lower-case a transcript, drop filler words and tidy the spacing.

    Args:
        text: Transcript string to clean.

    Returns:
        The cleaned, stripped string.
    """
    text = text.lower()
    # Remove whole-word occurrences of any filler
    text = re.sub(r'\b(?:' + '|'.join(FILLERS) + r')\b', '', text)
    # Collapse the whitespace left behind by the removals
    text = re.sub(r'\s+', ' ', text)
    # Pull punctuation back against the preceding word
    text = re.sub(r'\s([?.!,"])', r'\1', text)
    return text.strip()

# Apply the cleaning function to every test transcript
test_df['cleaned_transcript'] = test_df['transcript'].apply(clean_transcript)

# Persist the cleaned transcripts; the feature-extraction cell later overwrites this same file
test_df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/test_cleaned.csv', index=False)
print("Cleaned test transcripts saved.")


Test Data Feature Extraction

In [None]:
import language_tool_python
import spacy
from happytransformer import HappyTextToText, TTSettings

tool = language_tool_python.LanguageTool('en-US')
nlp = spacy.load("en_core_web_sm")
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)

# One entry per test transcript, in dataframe order
error_counts = []
avg_sent_lengths = []
pos_diversities = []
gec_edits = []
gec_rates = []
word_counts = []

for text in tqdm(test_df['cleaned_transcript']):
    # Mirror the training feature extraction: missing or blank transcripts
    # get zero-valued features instead of being sent to the NLP tools.
    if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
        error_counts.append(0)
        avg_sent_lengths.append(0)
        pos_diversities.append(0)
        word_counts.append(0)
        gec_edits.append(0)
        gec_rates.append(0.0)
        continue

    matches = tool.check(text)
    error_counts.append(len(matches))

    doc = nlp(text)
    sent_lens = [len(sent) for sent in doc.sents]
    pos_tags = [token.pos_ for token in doc if token.pos_ != 'SPACE']
    avg_sent_lengths.append(sum(sent_lens) / len(sent_lens) if sent_lens else 0)
    pos_diversities.append(len(set(pos_tags)))

    words = text.split()
    word_counts.append(len(words))

    corrected = happy_tt.generate_text("grammar: " + text, args=args).text
    corrected_words = corrected.split()  # split once, reused below
    # Position-wise word mismatches plus the difference in length
    edits = sum(1 for o, c in zip(words, corrected_words) if o != c)
    edits += abs(len(words) - len(corrected_words))
    gec_edits.append(edits)
    gec_rates.append(edits / max(1, len(words)))

# Adding the same features as train dataset features
test_df['grammar_errors'] = error_counts
test_df['avg_sentence_length'] = avg_sent_lengths
test_df['pos_diversity'] = pos_diversities
test_df['word_count'] = word_counts
# replace(0, 1) keeps the division defined for transcripts without words
test_df['grammar_errors_per_word'] = test_df['grammar_errors'] / test_df['word_count'].replace(0, 1)
test_df['gec_edits'] = gec_edits
test_df['gec_edit_rate'] = gec_rates

test_df.to_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/test_cleaned.csv', index=False)
print("✅ Updated test_cleaned.csv with new features.")

Running the trained ML model on the test data and saving the predictions to the submission.csv file.

In [None]:
import pandas as pd
import numpy as np

test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/temp_storage/test_cleaned.csv')

# Same columns, in the same order, as the training feature matrix
test_features = test_df[['grammar_errors', 'avg_sentence_length', 'pos_diversity',
                         'word_count', 'grammar_errors_per_word', 'gec_edits', 'gec_edit_rate']]

# The model was fitted on StandardScaler-transformed features, so the test
# features must pass through the same fitted scaler before prediction.
test_features_scaled = scaler.transform(test_features)

# Predict with the tuned GradientBoosting estimator selected by the grid search.
y_pred = grid_search.best_estimator_.predict(test_features_scaled)

# Predictions are rounded to the nearest integer label for the submission file
y_pred_rounded = np.round(y_pred).astype(int)

submission_df = pd.DataFrame({
    'filename': test_df['filename'],
    'label': y_pred_rounded
    })

submission_df.to_csv('/content/drive/MyDrive/Colab Notebooks/submission.csv', index=False)

print("Submission file created: submission.csv")


## Final Outcome

The best Pearson correlation was achieved by the GradientBoosting model, with a value of 0.3167. This is a weak-to-moderate positive correlation, so the predicted grammar labels only partially track the true labels.

Future ML pipeline:
  Add more features to the dataset using more capable NLP libraries, and perform more thorough hyperparameter tuning.

Exporting the Model for future use.

In [None]:
import joblib

# Export the tuned GradientBoosting estimator selected by the grid search.
# Only the model is saved: it was fitted on StandardScaler-transformed features,
# so future inputs must be scaled the same way before calling predict().
joblib.dump(grid_search.best_estimator_, '/content/drive/MyDrive/Colab Notebooks/best_gradient_boosting_model.pkl')

print("Model saved as best_gradient_boosting_model.pkl")

## Some Visualizations to understand the Model better

Feature Importance using BarPlot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence FutureWarning messages raised while plotting
warnings.filterwarnings("ignore", category=FutureWarning)

# Importances of the tuned GradientBoosting estimator selected by the grid search
feature_importances = grid_search.best_estimator_.feature_importances_
# Same order as the columns the model was trained on
features = ['grammar_errors', 'avg_sentence_length', 'pos_diversity',
            'word_count', 'grammar_errors_per_word', 'gec_edits', 'gec_edit_rate']

feature_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
})

# Most important feature first
feature_df = feature_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_df, palette='viridis')
plt.title('Feature Importance (GradientBoosting)')
plt.show()

Histogram of Model Predictions

In [None]:
# Histogram (with KDE overlay) of the predictions currently held in y_pred
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(y_pred, bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Model Predictions')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Frequency')
plt.show()

RMSE vs Pearson Correlation Comparison

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Best model scores, entered by hand rather than read from the evaluation cells.
# NOTE(review): the narrative text quotes a Pearson correlation of 0.3167 while
# 0.3176 is used here — confirm which value is correct.
rmse_best = 1.1311
pearson_best = 0.3176

# Side-by-side bars for the two metrics
plt.figure(figsize=(8, 6))
sns.barplot(x=['RMSE', 'Pearson Correlation'], y=[rmse_best, pearson_best], palette='muted')
plt.title('Best Model Performance: RMSE & Pearson Correlation')
plt.ylabel('Score')
plt.ylim(0, max(rmse_best, pearson_best) + 0.5)  # Adjust y-axis for clarity
plt.tight_layout()
plt.show()


Correlation Matrix

In [None]:
# Pairwise correlations between the engineered test-set features
heatmap_columns = ['grammar_errors', 'avg_sentence_length', 'pos_diversity',
                   'word_count', 'grammar_errors_per_word', 'gec_edits', 'gec_edit_rate']
corr_matrix = test_df[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()