In [1]:
import pandas as pd
from nltk.metrics import edit_distance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Load the data
df = pd.read_csv(r'../Data/Version_03/articulation_errors_dataset.csv', encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,Child ID,Age,Intended Word,Produced Word,Error Type,Target Sound,Error Context,Activity Recommended
0,1,6,rabbit,wabbit,Substitution,r,Substituted 'r' with 'w',Sound discrimination game
1,2,7,school,kool,Omission,s,Omitted 's' sound,Sound awareness game
2,3,5,sun,thun,Distortion,s,Distorted 's' to 'th',Tongue twisters
3,4,6,blue,buhlue,Addition,uh,Added extra sound 'uh',Sound identification
4,5,7,cat,cab,Substitution,t,Substituted 't' with 'b',Sound matching activity


In [4]:
df['Error Type'].value_counts()

Error Type
Substitution    1276
Omission         981
Addition         189
Distortion       137
No Error          48
Correct           44
No error           3
Repetition         1
Name: count, dtype: int64

In [5]:
#pip install soundex

In [6]:
import pandas as pd
from nltk.metrics import edit_distance
from collections import Counter
from difflib import SequenceMatcher
from soundex import Soundex  

# Initialize Soundex instance
soundex = Soundex()

def extract_features(row):
    correct_word = row['Intended Word']
    error_word = row['Produced Word']
    
    # Edit Distance
    edit_dist = edit_distance(correct_word, error_word)
    
    # Length Difference
    length_diff = abs(len(correct_word) - len(error_word))
    
    # First Character Difference (1 if different, 0 if the same)
    first_char_diff = int(correct_word[0] != error_word[0])
    
    # Last Character Difference (1 if different, 0 if the same)
    last_char_diff = int(correct_word[-1] != error_word[-1]) if len(correct_word) > 0 and len(error_word) > 0 else 0
    
    # Vowel Count Difference
    vowels = "AEIOUaeiou"
    correct_vowel_count = sum(1 for char in correct_word if char in vowels)
    error_vowel_count = sum(1 for char in error_word if char in vowels)
    vowel_count_diff = abs(correct_vowel_count - error_vowel_count)
    
    # Consonant Count Difference
    correct_consonant_count = len(correct_word) - correct_vowel_count
    error_consonant_count = len(error_word) - error_vowel_count
    consonant_count_diff = abs(correct_consonant_count - error_consonant_count)
    
    # Position of First Mismatch
    first_mismatch_pos = next((i for i, (c1, c2) in enumerate(zip(correct_word, error_word)) if c1 != c2), -1)
    
    # Character Frequency Difference
    correct_word_counter = Counter(correct_word)
    error_word_counter = Counter(error_word)
    char_freq_diff = sum(abs(correct_word_counter[char] - error_word_counter.get(char, 0)) for char in correct_word_counter)
    
    # Positional Difference (Sum of positional mismatches)
    positional_diff = sum(i for i, (c1, c2) in enumerate(zip(correct_word, error_word)) if c1 != c2)
    
    # Longest Common Substring Length
    common_substring_len = len(longest_common_substring(correct_word, error_word))
    
    # Vowel Position Difference (Total mismatched positions of vowels)
    vowel_diff = sum(1 for i, (c1, c2) in enumerate(zip(correct_word, error_word)) if c1 in vowels and c2 not in vowels)
    
    # Soundex Difference
    soundex_diff = int(soundex.soundex(correct_word) != soundex.soundex(error_word))
    
    # Prefix Difference (Checking similarity of the first 3 characters)
    prefix_diff = int(correct_word[:3] != error_word[:3]) if len(correct_word) >= 3 and len(error_word) >= 3 else 0
    
    # Suffix Difference (Checking similarity of the last 3 characters)
    suffix_diff = int(correct_word[-3:] != error_word[-3:]) if len(correct_word) >= 3 and len(error_word) >= 3 else 0

    return pd.Series([
        edit_dist, length_diff, first_char_diff, last_char_diff,
        vowel_count_diff, consonant_count_diff, first_mismatch_pos,
        char_freq_diff, positional_diff, common_substring_len,
        vowel_diff, soundex_diff, prefix_diff, suffix_diff
    ])

def longest_common_substring(str1, str2):
    """ Helper function to find the longest common substring. """
    seq_match = SequenceMatcher(None, str1, str2)
    match = seq_match.find_longest_match(0, len(str1), 0, len(str2))
    return str1[match.a: match.a + match.size]


In [7]:
# Step 4.1: Identify unmapped values
print("Unique values in df['Error Type']: ", df['Error Type'].unique())

Unique values in df['Error Type']:  ['Substitution' 'Omission' 'Distortion' 'Addition' 'No Error' 'No error'
 'Correct' nan 'Repetition']


In [8]:
df[['edit_dist', 'length_diff', 'first_char_diff', 'last_char_diff',
     'vowel_count_diff', 'consonant_count_diff', 'first_mismatch_pos',
     'char_freq_diff', 'positional_diff', 'common_substring_len',
     'vowel_diff', 'soundex_diff', 'prefix_diff', 'suffix_diff']] = df.apply(extract_features, axis=1)

In [9]:
# Map error_type to numerical labels for training
df['error_type_label'] = df['Error Type'].map({'Substitution': 0, 'Omission': 1, 'Distortion': 2, 'Addition': 3, 'No Error': 4, 'No error': 4, 'Correct': 4, 'None': 4, 'Repetition':5})

In [10]:
df = df.dropna(subset=['error_type_label'])

In [11]:
df.head()

Unnamed: 0,Child ID,Age,Intended Word,Produced Word,Error Type,Target Sound,Error Context,Activity Recommended,edit_dist,length_diff,...,consonant_count_diff,first_mismatch_pos,char_freq_diff,positional_diff,common_substring_len,vowel_diff,soundex_diff,prefix_diff,suffix_diff,error_type_label
0,1,6,rabbit,wabbit,Substitution,r,Substituted 'r' with 'w',Sound discrimination game,1,0,...,0,0,1,0,5,0,1,1,0,0.0
1,2,7,school,kool,Omission,s,Omitted 's' sound,Sound awareness game,3,2,...,2,0,3,6,3,1,1,1,0,1.0
2,3,5,sun,thun,Distortion,s,Distorted 's' to 'th',Tongue twisters,2,1,...,1,0,1,3,2,1,1,1,1,2.0
3,4,6,blue,buhlue,Addition,uh,Added extra sound 'uh',Sound identification,2,2,...,1,1,1,6,3,2,0,1,0,3.0
4,5,7,cat,cab,Substitution,t,Substituted 't' with 'b',Sound matching activity,1,0,...,0,2,1,2,2,0,1,1,1,0.0


In [12]:
# Step 2: Split Data into Train and Test Sets
X = df[['edit_dist','length_diff','first_char_diff','last_char_diff','vowel_count_diff','consonant_count_diff','first_mismatch_pos','char_freq_diff','positional_diff','common_substring_len','vowel_diff','soundex_diff','prefix_diff','suffix_diff']]
y = df['error_type_label']

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
from joblib import dump

# Save the scaler
dump(scaler, 'scaler.joblib')
print("Scaler saved as 'scaler.joblib'")

Scaler saved as 'scaler.joblib'


In [15]:
import pickle

# Save the scaler
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print("Scaler saved as 'scaler.pkl'")

Scaler saved as 'scaler.pkl'


In [16]:
print(y.isnull().sum())

0


In [17]:
# Step 3: Train a Classifier (Random Forest)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2',None]
}

# Initialize RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters:", grid_search.best_params_)
best_rf_clf = grid_search.best_estimator_

# Train the optimized Random Forest model
best_rf_clf.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits




Best parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [18]:
# Step 4: Make Predictions and Evaluate
y_pred = best_rf_clf.predict(X_test)

In [19]:
# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

In [20]:
# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.87      0.45      0.60        44
  Distortion       0.50      0.30      0.38        23
    No Error       0.77      1.00      0.87        17
    Omission       0.83      0.74      0.78       197
Substitution       0.76      0.91      0.83       255

    accuracy                           0.79       536
   macro avg       0.75      0.68      0.69       536
weighted avg       0.79      0.79      0.78       536

Accuracy: 0.7854477611940298


In [21]:
from joblib import dump

# Save the model
dump(best_rf_clf, 'best_rf_clf.joblib')
print("Model saved as 'best_rf_clf.joblib'")


Model saved as 'best_rf_clf.joblib'


In [22]:
import pickle

# Save the model
with open('best_rf_clf.pkl', 'wb') as file:
    pickle.dump(best_rf_clf, file)
print("Model saved as 'best_rf_clf.pkl'")


Model saved as 'best_rf_clf.pkl'


In [23]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

In [25]:
# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.77      0.45      0.57        44
  Distortion       0.50      0.30      0.38        23
    No Error       0.77      1.00      0.87        17
    Omission       0.83      0.73      0.78       197
Substitution       0.77      0.91      0.83       255

    accuracy                           0.78       536
   macro avg       0.73      0.68      0.69       536
weighted avg       0.78      0.78      0.77       536

Accuracy: 0.7817164179104478


In [26]:
from sklearn.svm import SVC
svc_clf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)

In [27]:
# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.82      0.20      0.33        44
  Distortion       1.00      0.09      0.16        23
    No Error       0.77      1.00      0.87        17
    Omission       0.78      0.74      0.76       197
Substitution       0.74      0.91      0.82       255

    accuracy                           0.76       536
   macro avg       0.82      0.59      0.59       536
weighted avg       0.77      0.76      0.73       536

Accuracy: 0.7574626865671642


In [28]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
y_pred = knn_clf.predict(X_test)

# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.77      0.45      0.57        44
  Distortion       0.36      0.17      0.24        23
    No Error       0.80      0.94      0.86        17
    Omission       0.81      0.73      0.77       197
Substitution       0.75      0.89      0.82       255

    accuracy                           0.77       536
   macro avg       0.70      0.64      0.65       536
weighted avg       0.76      0.77      0.76       536

Accuracy: 0.7667910447761194


In [29]:
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=300, random_state=42)
mlp_clf.fit(X_train, y_train)
y_pred = mlp_clf.predict(X_test)

# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.83      0.45      0.59        44
  Distortion       0.35      0.26      0.30        23
    No Error       0.76      0.94      0.84        17
    Omission       0.81      0.76      0.78       197
Substitution       0.77      0.87      0.82       255

    accuracy                           0.77       536
   macro avg       0.71      0.66      0.67       536
weighted avg       0.77      0.77      0.76       536

Accuracy: 0.7723880597014925


In [30]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(penalty='l2', max_iter=200, random_state=42)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Convert numeric labels back to original error_type for interpretation
error_type = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}
y_test_labels = y_test.map(error_type)
y_pred_labels = pd.Series(y_pred).map(error_type)

# Print Classification Report
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

    Addition       0.55      0.25      0.34        44
  Distortion       0.00      0.00      0.00        23
    No Error       0.77      1.00      0.87        17
    Omission       0.69      0.76      0.72       197
Substitution       0.74      0.80      0.76       255

    accuracy                           0.71       536
   macro avg       0.55      0.56      0.54       536
weighted avg       0.67      0.71      0.68       536

Accuracy: 0.7089552238805971
