In [1]:
import pickle

# Restore the trained classifier from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load artifacts that
# come from a trusted source.
with open('best_rf_clf.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [2]:
import pickle

# Restore the feature scaler from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load artifacts that
# come from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [3]:
import pandas as pd
from nltk.metrics import edit_distance
from collections import Counter
from difflib import SequenceMatcher
from soundex import Soundex  # Make sure to install the 'soundex' package

# Module-level Soundex encoder, created once and reused by extract_features
# to compare the phonetic codes of the intended and produced words.
soundex = Soundex()

def extract_features(row):
    """Compute the 14 word-pair features for a single row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide string values under 'Intended Word' and 'Produced Word'.

    Returns
    -------
    pd.Series
        Fourteen values in this positional order: edit_dist, length_diff,
        first_char_diff, last_char_diff, vowel_count_diff,
        consonant_count_diff, first_mismatch_pos, char_freq_diff,
        positional_diff, common_substring_len, vowel_diff, soundex_diff,
        prefix_diff, suffix_diff.

    Notes
    -----
    Position-based features compare the words through ``zip`` and therefore
    only look at the overlapping prefix of the two words; ``char_freq_diff``
    only considers characters that occur in the intended word.
    NOTE(review): presumably the pickled model was trained on exactly these
    definitions - confirm before changing any of them.
    """
    correct_word = row['Intended Word']
    error_word = row['Produced Word']

    # Shared guard: indexing [0] / [-1] on an empty string raises IndexError.
    both_non_empty = len(correct_word) > 0 and len(error_word) > 0

    # Edit Distance
    edit_dist = edit_distance(correct_word, error_word)

    # Length Difference
    length_diff = abs(len(correct_word) - len(error_word))

    # First / Last Character Difference (1 if different, 0 if the same).
    # Both fall back to 0 when either word is empty.
    first_char_diff = int(correct_word[0] != error_word[0]) if both_non_empty else 0
    last_char_diff = int(correct_word[-1] != error_word[-1]) if both_non_empty else 0

    # Vowel Count Difference
    vowels = "AEIOUaeiou"
    correct_vowel_count = sum(1 for char in correct_word if char in vowels)
    error_vowel_count = sum(1 for char in error_word if char in vowels)
    vowel_count_diff = abs(correct_vowel_count - error_vowel_count)

    # Consonant Count Difference (every non-vowel character counts as a consonant)
    correct_consonant_count = len(correct_word) - correct_vowel_count
    error_consonant_count = len(error_word) - error_vowel_count
    consonant_count_diff = abs(correct_consonant_count - error_consonant_count)

    # Indices (within the overlapping prefix) where the two words disagree.
    mismatch_positions = [
        i for i, (c1, c2) in enumerate(zip(correct_word, error_word)) if c1 != c2
    ]

    # Position of First Mismatch (-1 when the overlapping part is identical)
    first_mismatch_pos = mismatch_positions[0] if mismatch_positions else -1

    # Character Frequency Difference, over characters of the intended word only
    correct_word_counter = Counter(correct_word)
    error_word_counter = Counter(error_word)
    char_freq_diff = sum(
        abs(count - error_word_counter.get(char, 0))
        for char, count in correct_word_counter.items()
    )

    # Positional Difference (sum of the mismatching indices)
    positional_diff = sum(mismatch_positions)

    # Longest Common Substring Length
    common_substring_len = len(longest_common_substring(correct_word, error_word))

    # Vowel Position Difference: positions where the intended word has a vowel
    # and the produced word does not.
    vowel_diff = sum(
        1 for c1, c2 in zip(correct_word, error_word)
        if c1 in vowels and c2 not in vowels
    )

    # Soundex Difference (1 if the phonetic codes differ)
    soundex_diff = int(soundex.soundex(correct_word) != soundex.soundex(error_word))

    # Prefix / Suffix Difference on the first / last 3 characters; 0 when
    # either word is shorter than 3 characters.
    both_at_least_three = len(correct_word) >= 3 and len(error_word) >= 3
    prefix_diff = int(correct_word[:3] != error_word[:3]) if both_at_least_three else 0
    suffix_diff = int(correct_word[-3:] != error_word[-3:]) if both_at_least_three else 0

    return pd.Series([
        edit_dist, length_diff, first_char_diff, last_char_diff,
        vowel_count_diff, consonant_count_diff, first_mismatch_pos,
        char_freq_diff, positional_diff, common_substring_len,
        vowel_diff, soundex_diff, prefix_diff, suffix_diff
    ])

def longest_common_substring(str1, str2):
    """Return the longest contiguous substring shared by ``str1`` and ``str2``.

    When several common substrings have the same maximal length, the one
    reported by ``SequenceMatcher.find_longest_match`` is returned. An empty
    string is returned when the inputs share no characters.
    """
    # autojunk=False: the default heuristic treats frequently occurring
    # characters as junk once the second sequence has 200+ items, which would
    # silently shorten (or drop) the match for long inputs.
    matcher = SequenceMatcher(None, str1, str2, autojunk=False)
    match = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return str1[match.a:match.a + match.size]


In [4]:
# Word pairs to classify. The column names must be exactly 'Intended Word'
# and 'Produced Word', because extract_features reads those keys.
#
# A larger sample for manual experimentation:
#   'Intended Word': ['window','apple','window','rabbit','tiger','star','key','cake']
#   'Produced Word': ['window','appo','win-dow','wabbit','tiger','tar','kee','take']
new_data = pd.DataFrame({
    'Intended Word': ['School'],
    'Produced Word': ['School'],
})

In [5]:
# Names for the 14 values returned (positionally) by extract_features.
feature_columns = [
    'edit_dist', 'length_diff', 'first_char_diff', 'last_char_diff',
    'vowel_count_diff', 'consonant_count_diff', 'first_mismatch_pos',
    'char_freq_diff', 'positional_diff', 'common_substring_len',
    'vowel_diff', 'soundex_diff', 'prefix_diff', 'suffix_diff',
]

# Compute the features row by row and attach them to new_data as new columns.
new_data[feature_columns] = new_data.apply(extract_features, axis=1)

In [6]:
# Preview the first rows of new_data, including the added feature columns.
new_data.head()

Unnamed: 0,Intended Word,Produced Word,edit_dist,length_diff,first_char_diff,last_char_diff,vowel_count_diff,consonant_count_diff,first_mismatch_pos,char_freq_diff,positional_diff,common_substring_len,vowel_diff,soundex_diff,prefix_diff,suffix_diff
0,School,School,0,0,0,0,0,0,-1,0,0,6,0,0,0,0


In [7]:
# Select the feature columns, in this order, as the input matrix for the
# scaler and the model.
model_input_columns = [
    'edit_dist',
    'length_diff',
    'first_char_diff',
    'last_char_diff',
    'vowel_count_diff',
    'consonant_count_diff',
    'first_mismatch_pos',
    'char_freq_diff',
    'positional_diff',
    'common_substring_len',
    'vowel_diff',
    'soundex_diff',
    'prefix_diff',
    'suffix_diff',
]
X = new_data[model_input_columns]

In [8]:
# Scale the features with the unpickled scaler before prediction.
# NOTE(review): assumes this scaler was fitted on the training features in
# the same column order as X - confirm.
X_scaled = loaded_scaler.transform(X)

In [9]:
# Predict the error class for each word pair and report it both as the raw
# numeric label and as a readable name.
# NOTE(review): mapping copied from the note at the end of this notebook -
# confirm it matches the label encoding used at training time.
label_names = {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}

y_pred = loaded_model.predict(X_scaled)
print("Predicted labels:", y_pred)
# The model returns float labels (e.g. 4.0), hence the int() before lookup.
print("Predicted error types:", [label_names.get(int(label), 'Unknown') for label in y_pred])

Predicted labels: [4.]


 {0: 'Substitution', 1: 'Omission', 2: 'Distortion', 3: 'Addition', 4: 'No Error', 5: 'Repetition'}