In [1]:
import librosa
import numpy as np

def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    
    features = {
        "duration": librosa.get_duration(y=y, sr=sr),
        "tempo": librosa.beat.tempo(y=y, sr=sr)[0],
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y)),
        "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "spectral_rolloff": np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)),
        "rmse": np.mean(librosa.feature.rms(y=y))
    }
    
    return features


In [2]:
import language_tool_python
import pandas as pd

tool = language_tool_python.LanguageTool('en-US')

def extract_text_features(text):
    matches = tool.check(text)
    num_errors = len(matches)
    
    return {
        "num_grammar_errors": num_errors,
        "num_words": len(text.split()),
        "avg_word_len": sum(len(w) for w in text.split()) / max(len(text.split()), 1),
        "text_length": len(text)
    }


In [3]:
def extract_combined_features(audio_path, transcript):
    audio_feats = extract_audio_features(audio_path)
    text_feats = extract_text_features(transcript)
    
    combined = {**audio_feats, **text_feats}
    return combined


In [4]:
import os

def build_feature_dataframe(csv_path, audio_folder):
    df = pd.read_csv(csv_path)
    feature_rows = []

    for idx, row in df.iterrows():
        filename = row['file']
        transcript = row['transcript']  # Update if column name differs
        audio_path = os.path.join(audio_folder, filename)

        try:
            features = extract_combined_features(audio_path, transcript)
            features['file'] = filename
            features['score'] = row['label']  # Only in train.csv
            feature_rows.append(features)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
    
    return pd.DataFrame(feature_rows)


In [7]:
train_features_df = build_feature_dataframe("outputs/transcripts.csv", "dataset/audios_train")
train_features_df.to_csv("hybrid_train_features.csv", index=False)


KeyError: 'file'

In [9]:
import pandas as pd 
df = pd.read_csv('outputs/hybrid_train_features.csv')
df.columns

Index(['duration', 'zero_crossing_rate', 'spectral_centroid', 'rmse',
       'grammar_score_model', 'original_text', 'corrected_text', 'num_words',
       'avg_word_len', 'text_length', 'filename', 'label'],
      dtype='object')

In [11]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Load your CSV
df = pd.read_csv("outputs/hybrid_train_features.csv")

# Drop non-feature columns (keep only numerical/text-based features relevant for modeling)
X = df.drop(columns=["original_text", "corrected_text", "filename", "label"])
y = df["label"]

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost regressor
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Save model as .pkl
with open("hybrid_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Evaluate
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation R²: {r2:.4f}")


Validation MSE: 1.0924
Validation R²: 0.2065


In [14]:
import librosa
import numpy as np
import pandas as pd
import language_tool_python
import pickle
import os

# ✅ Load both models
with open("outputs/grammar_scorer.pkl", "rb") as f:
    basic_model = pickle.load(f)

with open("hybrid_model.pkl", "rb") as f:
    hybrid_model = pickle.load(f)

# ✅ Setup LanguageTool
tool = language_tool_python.LanguageTool('en-US')

# ✅ Audio feature extractor
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)

    features = {
        "duration": librosa.get_duration(y=y, sr=sr),
        "tempo": librosa.beat.tempo(y=y, sr=sr)[0],
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y)),
        "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "spectral_rolloff": np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr)),
        "rmse": np.mean(librosa.feature.rms(y=y))
    }

    return features

# ✅ Text feature extractor
def extract_text_features(text):
    matches = tool.check(text)
    num_errors = len(matches)

    return {
        "num_grammar_errors": num_errors,
        "num_words": len(text.split()),
        "avg_word_len": sum(len(w) for w in text.split()) / max(len(text.split()), 1),
        "text_length": len(text)
    }

# ✅ Combine audio + text features
def extract_combined_features(audio_path, transcript):
    audio_feats = extract_audio_features(audio_path)
    text_feats = extract_text_features(transcript)
    combined = {**audio_feats, **text_feats}
    return audio_feats, text_feats, combined

# ✅ Input test case
audio_path = "dataset/audios_train/audio_2.wav"  # Change if needed
transcript = """People in the market are selling just about anything and everything. 
You can hear everyone screaming and talking over each other, making offers. 
The crowded market scene makes me want to run out of the door as soon as possible, 
and I picture this happening midday."""

# ✅ Extract all feature sets
audio_feats, text_feats, combined_feats = extract_combined_features(audio_path, transcript)

# ✅ Prepare DataFrames for both models
df_basic = pd.DataFrame([text_feats])         # if your basic model was trained only on text
# df_basic = pd.DataFrame([audio_feats])      # ← use this instead if it was trained on audio only
df_hybrid = pd.DataFrame([combined_feats])    # hybrid needs both

# ✅ Predict
basic_pred = basic_model.predict(df_basic)[0]
hybrid_pred = hybrid_model.predict(df_hybrid)[0]

# ✅ Show result
print("\n🎯 Grammar Scoring Results:")
print(f"📦 Basic Model Score:   {basic_pred:.2f}")
print(f"🧠 Hybrid Model Score:  {hybrid_pred:.2f}")


	This function was moved to 'librosa.feature.rhythm.tempo' in librosa version 0.10.0.
	This alias will be removed in librosa version 1.0.
  "tempo": librosa.beat.tempo(y=y, sr=sr)[0],


AttributeError: 'numpy.ndarray' object has no attribute 'predict'

In [15]:
# Correct way to save the model
with open("outputs/grammar_scorer.pkl", "wb") as f:
    pickle.dump(model, f)


In [16]:
import pickle

with open("outputs/grammar_scorer.pkl", "rb") as f:
    obj = pickle.load(f)

print(type(obj))


<class 'xgboost.sklearn.XGBRegressor'>


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Load train.csv
train_df = pd.read_csv('outputs/train_features_full.csv')  # this should include 'emb_0' to 'emb_383'

# Define embedding columns
embedding_cols = [f'emb_{i}' for i in range(384)]

# Define X and y
X = train_df[embedding_cols]
y = train_df['grammar_score']

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = XGBRegressor(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Evaluate
val_preds = model.predict(X_val)
mse = mean_squared_error(y_val, val_preds)
print("Validation MSE:", mse)

# Predict on test set
test_df = pd.read_csv('test_with_embeddings.csv')  # this should include 'emb_0' to 'emb_383'
X_test = test_df[embedding_cols]
test_df['predicted_score'] = model.predict(X_test)

# Save for submission
submission = test_df[['file_name', 'predicted_score']]
submission.to_csv('submission.csv', index=False)


XGBoostError: [03:12:35] C:\actions-runner\_work\xgboost\xgboost\src\data\data.cc:550: Check failed: valid: Label contains NaN, infinity or a value too large.

In [25]:
import os
import librosa
import numpy as np
import pandas as pd
import language_tool_python
from xgboost import XGBRegressor
import pickle
from tqdm import tqdm
import xgboost as xgb

# === Enable GPU logging (optional)
xgb.set_config(verbosity=2)

# === Configuration ===
AUDIO_DIR = "dataset/audios_train"               # Path to audio files
CSV_PATH = "outputs/grammar_scores.csv"          # CSV with transcript and grammar score
MODEL_SAVE_PATH = "hybrid_model.pkl"             # Model output path

# === Initialize grammar checker
tool = language_tool_python.LanguageTool('en-US')

# === Audio feature extraction
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    return {
        "duration": librosa.get_duration(y=y, sr=sr),
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y)),
        "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "rmse": np.mean(librosa.feature.rms(y=y))
    }

# === Text feature extraction
def extract_text_features(text):
    words = text.split()
    return {
        "num_words": len(words),
        "avg_word_len": sum(len(w) for w in words) / max(len(words), 1),
        "text_length": len(text)
    }

# === Combine features
def extract_combined_features(audio_path, transcript):
    audio_feats = extract_audio_features(audio_path)
    text_feats = extract_text_features(transcript)
    return {**audio_feats, **text_feats}

# === Load CSV
df = pd.read_csv(CSV_PATH)

# Validate required columns
assert "filename" in df.columns and "label" in df.columns and "transcript" in df.columns, \
       "CSV must contain 'filename', 'label', and 'transcript' columns"

# === Feature extraction
features_list = []
print("🔍 Extracting features...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    audio_path = os.path.join(AUDIO_DIR, row["filename"])
    text = row["transcript"]
    label = row["label"]

    try:
        feats = extract_combined_features(audio_path, text)
        feats["label"] = label
        features_list.append(feats)
    except Exception as e:
        print(f"⚠️ Skipping {row['filename']} due to error: {e}")

# === Create DataFrame
features_df = pd.DataFrame(features_list)
X = features_df.drop(columns=["label"])
y = features_df["label"]

# === Train model with GPU support
print("⚡ Training XGBoost model on GPU...")
model = XGBRegressor(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=42
)

model = xgb.XGBRegressor(
    tree_method='hist',  # instead of gpu_hist
    device='cpu',
    n_estimators=100,
    max_depth=6
)

model.fit(X, y)

# === Save the model
with open(MODEL_SAVE_PATH, "wb") as f:
    pickle.dump(model, f)

print(f"\n✅ Model trained and saved at: {MODEL_SAVE_PATH}")


🔍 Extracting features...


 44%|████▍     | 195/444 [00:19<00:27,  9.09it/s]

⚠️ Skipping audio_147.wav due to error: 'float' object has no attribute 'split'


100%|██████████| 444/444 [00:48<00:00,  9.15it/s]

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


⚡ Training XGBoost model on GPU...
[03:41:41] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (443, 7, 3101).
[03:41:42] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\ellpack_page.cu:167: Ellpack is dense.

✅ Model trained and saved at: hybrid_model.pkl



    E.g. tree_method = "hist", device = "cuda"

  pickle.dump(model, f)


In [26]:
import pickle
import pandas as pd

# Load the model
with open("hybrid_model.pkl", "rb") as f:
    model = pickle.load(f)

# Example test input (replace with real audio path and transcript)
test_features = {
    "duration": 3.2,
    "zero_crossing_rate": 0.052,
    "spectral_centroid": 3000.5,
    "rmse": 0.015,
    "num_words": 22,
    "avg_word_len": 4.1,
    "text_length": 105
}

# Convert to DataFrame
df = pd.DataFrame([test_features])

# Predict
predicted_score = model.predict(df)[0]
print(f"🎯 Predicted Grammar Score: {predicted_score:.2f}")


🎯 Predicted Grammar Score: 2.76



    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


## for cpu

In [27]:
import os
import librosa
import numpy as np
import pandas as pd
import language_tool_python
from xgboost import XGBRegressor
import pickle
from tqdm import tqdm

# === Configuration ===
AUDIO_DIR = "dataset/audios_train"               # Path to audio files
CSV_PATH = "outputs/grammar_scores.csv"          # CSV with transcript and grammar score
MODEL_SAVE_PATH = "hybrid_model_cpu.pkl"         # CPU-trained model save path

# === Initialize grammar checker
tool = language_tool_python.LanguageTool('en-US')

# === Audio feature extraction
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    return {
        "duration": librosa.get_duration(y=y, sr=sr),
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y)),
        "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "rmse": np.mean(librosa.feature.rms(y=y))
    }

# === Text feature extraction
def extract_text_features(text):
    words = text.split()
    return {
        "num_words": len(words),
        "avg_word_len": sum(len(w) for w in words) / max(len(words), 1),
        "text_length": len(text)
    }

# === Combine features
def extract_combined_features(audio_path, transcript):
    audio_feats = extract_audio_features(audio_path)
    text_feats = extract_text_features(transcript)
    return {**audio_feats, **text_feats}

# === Load CSV
df = pd.read_csv(CSV_PATH)

# Validate required columns
assert "filename" in df.columns and "label" in df.columns and "transcript" in df.columns, \
       "CSV must contain 'filename', 'label', and 'transcript' columns"

# === Feature extraction
features_list = []
print("🔍 Extracting features...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    audio_path = os.path.join(AUDIO_DIR, row["filename"])
    text = row["transcript"]
    label = row["label"]

    try:
        feats = extract_combined_features(audio_path, text)
        feats["label"] = label
        features_list.append(feats)
    except Exception as e:
        print(f"⚠️ Skipping {row['filename']} due to error: {e}")

# === Create DataFrame
features_df = pd.DataFrame(features_list)
X = features_df.drop(columns=["label"])
y = features_df["label"]

# === Train model on CPU
print("🧠 Training XGBoost model on CPU...")
model = XGBRegressor(
    tree_method='hist',   # CPU-compatible tree method
    device='cpu',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

model.fit(X, y)

# === Save the model
with open(MODEL_SAVE_PATH, "wb") as f:
    pickle.dump(model, f)

print(f"\n✅ CPU model trained and saved at: {MODEL_SAVE_PATH}")


🔍 Extracting features...


 44%|████▍     | 195/444 [00:18<00:23, 10.44it/s]

⚠️ Skipping audio_147.wav due to error: 'float' object has no attribute 'split'


100%|██████████| 444/444 [00:50<00:00,  8.74it/s]


🧠 Training XGBoost model on CPU...
[03:51:54] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (443, 7, 3101).

✅ CPU model trained and saved at: hybrid_model_cpu.pkl


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Zero", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Zero", trust_remote_code=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-R1-Zero:
- modeling_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ValueError: FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)

In [5]:
import gradio as gr
import torch
import whisper
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import language_tool_python
import pickle
import librosa
import numpy as np
import pandas as pd
import pyttsx3
import tempfile
import os

# Load Whisper model for transcription
whisper_model = whisper.load_model("base")

# Load 4-bit quantized DeepSeek grammar correction model
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
correction_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat')
correction_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat",
    device_map="auto",
    quantization_config=bnb_config
)
corrector = pipeline("text-generation", model=correction_model, tokenizer=correction_tokenizer)

# Load grammar scoring models
with open("outputs/grammar_scorer.pkl", "rb") as f:
    scorer_model = pickle.load(f)

with open("hybrid_model_cpu.pkl", "rb") as f:
    hybrid_model = pickle.load(f)

# Grammar checker
tool = language_tool_python.LanguageTool('en-US')

# Audio feature extractor
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    features = {
        "duration": librosa.get_duration(y=y, sr=sr),
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(y)),
        "spectral_centroid": np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        "rmse": np.mean(librosa.feature.rms(y=y))
    }
    return features

# Text feature extractor
def extract_text_features(text):
    matches = tool.check(text)
    num_errors = len(matches)
    return {
        "num_grammar_errors": num_errors,
        "num_words": len(text.split()),
        "avg_word_len": sum(len(w) for w in text.split()) / max(len(text.split()), 1),
        "text_length": len(text)
    }

# Combine features
def extract_combined_features(audio_path, transcript):
    audio_feats = extract_audio_features(audio_path)
    text_feats = extract_text_features(transcript)
    hybrid_features = {**audio_feats, **text_feats}
    df = pd.DataFrame([hybrid_features])
    hybrid_score = hybrid_model.predict(df)[0]
    df['grammar_score_model'] = hybrid_score
    reduced_features = df[[  # final input for full scorer
        "duration", "zero_crossing_rate", "spectral_centroid", "rmse",
        "grammar_score_model", "num_words", "avg_word_len", "text_length"
    ]]
    return reduced_features, hybrid_score

# TTS engine setup
def speak(text):
    engine = pyttsx3.init()
    engine.say(text)
    engine.runAndWait()

# Main processing function
def process_input(audio=None, text_input=""):
    transcript = text_input

    # Step 1: Transcribe audio if available
    if audio:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
        os.rename(audio, tmp_path)
        result = whisper_model.transcribe(tmp_path)
        transcript = result['text']
    else:
        tmp_path = None  # No audio

    # Step 2: Grammar Correction
    correction = corrector(transcript, max_new_tokens=128)[0]['generated_text']

    # Step 3: Grammar Scoring
    if tmp_path:
        features, hybrid_score = extract_combined_features(tmp_path, transcript)
        final_score = scorer_model.predict(features)[0]
    else:
        # Use only text features for scoring
        text_feats = extract_text_features(transcript)
        df = pd.DataFrame([text_feats])
        final_score = hybrid_model.predict(df)[0]

    # Step 4: TTS
    speak(correction)

    return transcript, correction, final_score, correction

# Gradio UI
demo = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Audio(source="microphone", type="filepath", optional=True, label="Record or Upload Audio"),
        gr.Textbox(label="Or Enter Text", placeholder="You can leave this blank if using audio.")
    ],
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Corrected Text"),
        gr.Number(label="Grammar Score (0-5)"),
        gr.Textbox(label="Text Spoken Aloud")
    ],
    title="🗣️ AI Grammar Assistant (Audio/Text) with 4-bit DeepSeek",
    description="Upload or record audio, or enter text. The app transcribes (if needed), corrects grammar using a quantized LLM, scores grammar quality (0-5), and speaks the corrected text aloud."
)

demo.launch()


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previ

OSError: meta-llama/Llama-2-7b-chat does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.