<a href="https://colab.research.google.com/github/jullazarovych/DL_math_misunderstandings/blob/main/math_misunderstandings_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab environment setup: installs the spell-checking stack (TextBlob, NLTK,
# pyenchant plus the libenchant system package) and sentence-transformers / faiss-cpu.
# NOTE(review): no versions are pinned, and sentence-transformers / faiss-cpu are
# not imported in the cells visible here -- confirm they are still required.
!pip install --upgrade --force-reinstall textblob nltk
!apt-get update
!apt-get install -y libenchant-2-2
!pip install pyenchant --force-reinstall
!pip install sentence-transformers faiss-cpu


In [None]:
import pandas as pd
import re
from textblob import TextBlob
import enchant
import concurrent.futures
from tqdm import tqdm
import enchant
from google.colab import drive

In [None]:
# Mount Google Drive so the dataset folder is reachable from the Colab VM.
drive.mount('/content/drive')
BASE_DATA_PATH = '/content/drive/My Drive/nlp_math_misunderstanding/data'

# Load the raw training split and remember its size for the cleaning report.
train = pd.read_csv(BASE_DATA_PATH + '/raw/train.csv')
total_rows = train.shape[0]
print("Total rows:", total_rows)

# Keep only explanations of at most `max_length` characters. A missing
# explanation has a NaN length, which compares False, so such rows are dropped too.
max_length = 250
explanation_lengths = train["StudentExplanation"].str.len()
train = train.loc[explanation_lengths <= max_length].copy()
rows_after_length_filter = train.shape[0]
print(f"Rows after length filter: {rows_after_length_filter} "
      f"(cleaned {total_rows - rows_after_length_filter} rows)")

In [None]:
# Hand-curated replacement table for student explanations: each key is a literal
# snippet (contraction, misspelling, or number/letter run-on such as "6x2=12")
# and each value is the text that should replace it.
# NOTE(review): several keys are repeated with different values (e.g. "15a",
# "12s", "2y=24"); in a dict literal the last value silently wins, so the
# earlier entries are dead. Some short keys are also contained in longer ones
# (e.g. "x3" / "x3=72"), which matters whenever replacements are applied one
# key at a time.
contractions_dict = {
    "1 3rd": "1/3", "3 6th": "3/6", "isn't": "is not", "divisible": "divide", "thee": "the",
    "3 9ths": "3/9", "aren't": "are not", "isn’t": "is not", "it's": "it is", "cancell": "cancel",
    "1 over3": "1/3", "aren’t": "are not", "coulered": "coloured", "srent": "are not",
    "denominater": "denominator", "numerater": "numerator", "simplifys": "simplifies", "to1/3": "to 1/3",
    "3/9is": "3/9", "and3": "and 3", "3fd": "3", "3over9": "3 over 9", "3÷3=1": "3/3=1", "9÷3=3": "9/3=3",
    "wasn’t": "was not", "it’s": "it is", "donometer": "denominator", "denomantor": "denominator",
    "denominter": "denominator", "equalatiral": "equivalent", "evquivlant": "equivalent", "simplafide": "simplified",
    "get1/3": "get 1/3", "simlifyed": "simplified", "3s": "3", "3\\9": "3/9", "1\\3": "1/3", "isnt": "is not",
    "3 9th": "3/9", "simplflied": "simplified", "sipmpilist": "simplest", "simplifed": "simplified",
    "numeriiator": "numerator", "simpflyed": "simplified", "3/9'is": "3/9 is", "÷": "/", "wich": "which",
    "weren't": "were not", "that's": "that is", "kink": "know", "you'll": "you will", "1out": "1 out", "is1/3": "is 1/3",
    "wouldnt": "would not", "don't": "do not", "wouldn’t": "would not", "wouldn't": "would not", "3/9s": "3/9",
    "1third": "1/3", "3rd": "3", "itmwould": "it would", "aremshaded": "are shaded", "6/9ths": "6/9", "get1/3": "get 1/3",
    "numeriiator": "numerator", "didvideequally": "divide equally", "9th": "9", "⅓": "1/3", "and³/9": "and 3/9",
    "arnt": "are not", "3/6th": "3/6", "3-6ths": "3-6", "coulerded": "coloured", "splitt": "split", "9m": "9",
    "3are": "3 are", "3/9ths": "3/9", "8th": "8", "[[": "", "52m=1102pm": "52 m = 1102 pm", "1/12i": "1/12 i",
    "6x1": "6 x 1", "1x1 ": "1 x 1", "6x2": "6 x 2", "2x6": "2 x 6", "2x6=12": "2 x 6 = 12", "1x1=1": "1 x 1 = 1",
    "1/2x1/6": "1/2 x 1/6", "got1/12": "got 1/12", "1/12th": "1/12", "6times": "6 times", "is1/12": "is 1/12",
    "6x2=12": "6 x 2 = 12", "6x1=6": "6 x 1 = 6", "6th": "6", "1x6": "1 x 6", "3x2": "3 x 2", "2x6=12:1/12": "2 x 6 = 12:1/12",
    "x3": "x 3", "x3=72": "x 3 = 72", "2x24=48": "2 x 24 = 48", "72left": "72 left", "72and": "72 and", "3 5ths": "3/5",
    "3/5s": "3/5", "red2/5": "red 2/5", "20s": "20", "times23": "times 23", "5ths": "5", "24x": "24 x", "5th": "5",
    "because1/5": "because 1/5", "by3": "by 3", "divid": "divide", "5divide": "5 divide", "xby": "x by", "5s": "x 5",
    "24s": "x 24", "4x6": "4 x 6", "12s": "x 12", "fith": "fifth", "fithis": "fifth", "10ths": "10", "the1/5": "the 1/5",
    "put24": "put 24", "70s": "70", "x24": "x 24", "x6": "x 6", "get72": "get 72", "360/5divide": "360/5 divide",
    "1/10=3/15times": "1/10=3/15 times", "pkus": "plus", "5x3": "5 x 3", "ax3": "a x 3", "3x3": "3 x 3", "3x5": "3 x 5",
    "of10": "of 10", "3x": "3 x", "5x3=15": "5 x 3 = 15", "3x5=15": "3 x 5 = 15", "3x3=9": "3 x 3 = 9", "3by3": "3 by 3",
    "a=b4": "a=b 4", "wirth": "worth", "is9": "is 9", "took5": "took 5", "to15": "to 15", "15ths": "15", "9-4which": "9-4 which",
    "5is": "5 is", "9x3": "9 x 3", "3x6": "3 x 6", "15s": "15", "5x2=10": "5 x 2 = 10", "15a": "15", "15to": "15 to",
    "9by3": "9 by 3", "9x2=18": "9 x 2 = 18", "15a=90": "15 a = 90", "6x3": "6 x 3", "3a=18": "3 a = 18", "3/5this": "3/5 this",
    "o6": "6", "3/5but": "3/5 but", "3/5x2=6/10": "3/5 x 2 = 6/10", "9x2": "9 x 2", "9/15s": "9/15", "2x3=6": "2 x 3 = 6",
    "2x3": "2 x 3", "x2": "x 2", "2x5": "2 x 5", "6x3=18and": "6 x 3 = 18 and", "9 15ths": "9/15", "6 10ths": "6/10",
    "3=30so": "3 = 30 so", "x2=6/10": "x 2 = 6/10", "30ths": "30", "6x3=18": "6 x 3 = 18", "15a": "15 a", "3a": "3 a",
    "5x": "5  x", "3rds": "3", "15i": "15 i", "3/5i": "3/5 i", "6/10s": "6/10", "by15": "by 15", "9/15s": "9/15", "3/5s": "3/5",
    "9x2": "9 x 2", "18/30th": "18/30", "simplify9/15": "simplify 9/15", "18divide": "18 divide", "10x": "10 x", "30and": "30 and",
    "15which": "15 which", "3then": "3 then", "3which": "3 which", "15now": "15 now", "15a=90": "15 a = 90", "=15a=90": "= 15 a = 90",
    "3a/30": "3 a / 30", "6x3=18": "6 x 3 = 18", "x2=10": "x 2 = 10", "2over5": "2 over 5", "a10": "a 10", "3/5so": "3/5 so",
    "3times": "3 times", "x2/3": "x 2/3", "3x": "3 x", "2x5": "2 x 5", "ax15": "a x 15", "so10": "so 10", "6h": "6 h", "2y=24": "2y=24",
    "2y=": "2 y =", "1y": "1 y", "12s": "12 s", "2x": "2 x", "y2": "y 2", "2xy": "2 x y", "is12": "is 12", "2xy=24": "2 x y = 24",
    "1y=12": "1 y = 12", "x12=24": "x 12 = 24", "2y=2y": "2 y = 2 y", "24and": "24 and", "2it": "2 it", "2times": "2 times",
    "2so": "2 so", "24so": "24 so", "1w": "1 w", "2xtables": "2 x tables", "24=2y": "24 = 2 y", "2ys": "2 y s", "2which": "2 which",
    "24divided": "24 divided", "2y=24": "2 y = 24", "2and": "2 and", "2*12is": "2 * 12 is", "yx2=24": "y x 2 = 24", "10x": "10 x",
    "12x": "12 x", "it2": "it 2", "2divided": "2 divided", "2ys": "2 y s", "x12=24": "x 12 = 24", "2y=24it": "2 y = 24 it", "24cm": "24 cm",
    "=24that": "= 24 that", "12the": "12", "12ves": "12", "(2y=24)": "( 2 y = 24 )", "24i": "24 i", "multiply24": "multiply 24",
    "2y": "2 y", "2times": "2 times", "its------2y=24-------48": "its ------ 2 y = 24 ------- 48", "3x1=3": "3 x 1 = 3", "1/3x5/1": "1/3 x 5/1",
    "by2": "by 2", "2x5=10": "2 x 5 = 10", "get10/3": "get 10/3", "and1/5": "and 1/5", "(5x2)3which": "( 5 x 2 ) 3 which", "1x3": "1 x 3",
    "3r1": "3/1", "2/3x5": "2/3 x 5", "2/3x5/1=10/3": "2/3 x 5/1 = 10/3", "2x5=1o": "2 x 5 = 10", "2/3x5/1=10/33": "2/3 x 5/1 = 10/33",
    "3x1": "3 x 1", "1/3 10/3=or": "1/3 10/3 = or", "2times5=": "2 times 5 =", "(2x5)": "( 2 x 5 )", "10it": "10 it",
    "10/3simplified": "10/3 simplified", "5/1x2/3": "5/1 x 2/3", "3s": "3", "1s": "1", "by2/3": "by 2/3", "1x3": "1 x 3", "to3": "to 3",
    "3and1/3": "3 and 1/3", "3x1=1": "3 x 1 = 1", "2x5=103x1=3": "2 x 5 = 10 ; 3 x 1 = 3", "over1": "over 1", "5x2": "5 x 2", "3x1=": "3 x 1 =",
    "1/3x": "1/3 x", "5/1and": "5/1 and", "x5": "x 5", "10/3=3r1": "10/3 = 3 r 1", "1so": "1 so", "3r3": "3/3", "1x3": "1 x 3", "1then": "1 then",
    "10/3if": "10/3 if", "5/1to": "5/1 to", "2/3x": "2/3 x", "2/3x5=": "2/3 x 5 =", "o66": "66", "3x1==3": "3 x 1 == 3", "10 15ths": "10/15",
    "10 fifteenths": "10/15", "5times": "5 times", "3is": "3 is", "2times5": "2 times 5", "3times5": "3 times 5", "2/3times": "2/3 times",
    "1/5th": "1/5", "2/3x5": "2/3 x 5", "2/15s": "2/15 s", "1/5x2/3": "1/5 x 2/3", "3and": "3 and", "1x2=2": "1 x 2 = 2", "3x5=15": "3 x 5 = 15",
    "2/3x1/5=2/15": "2/3 x 1/5 = 2/15", "2x1": "2 x 1", "2x1=2": "2 x 1 = 2", "5x3=15": "5 x 3 = 15", "3x5": "3 x 5", "1x2=2": "1 x 2 = 2",
    "3x5=15": "3 x 5 = 15", "its": "it is", "0s": "0", "hole": "whole", "desemel": "decimal", "collum": "column", "devided": "divided",
    "devis": "divide", "bythe": "by the", "drnominators": "denominators", "numorators": "numerators", "demoniter": "denominator",
    "demonanaterr": "denominator", "dinominator": "denominator", "nermoratater": "numerator", ":)": "", "know(::(:(:(:(:(:(:((:(:(:((:(:": "know",
    "4s": "4", "closr": "close", "stilll": "still", "itll": "it will", "0ne third": "1/3"
}

In [None]:
# Compiled contraction patterns, keyed by the mapping's items so that a
# different (or edited) mapping is compiled afresh instead of reusing stale patterns.
_CONTRACTION_PATTERN_CACHE = {}


def _compile_contractions(contractions):
    """Return ``[(compiled_pattern, replacement), ...]`` ordered longest key first."""
    cache_key = tuple(contractions.items())
    compiled = _CONTRACTION_PATTERN_CACHE.get(cache_key)
    if compiled is None:
        compiled = []
        # Longest keys first: otherwise a short key such as "x3" rewrites the
        # text before the more specific "x3=72" ever gets a chance to match.
        # sorted() is stable, so keys of equal length keep their dict order.
        by_length = sorted(contractions.items(), key=lambda item: len(item[0]), reverse=True)
        for contraction, full_form in by_length:
            if not contraction:
                continue  # an empty key would match at every position
            # A word boundary is only meaningful next to a word character.
            # Forcing \b around keys like "÷", ":)" or "(2y=24)" would make
            # them match only when glued to letters/digits.
            prefix = r"\b" if re.match(r"\w", contraction[0]) else ""
            suffix = r"\b" if re.match(r"\w", contraction[-1]) else ""
            pattern = re.compile(prefix + re.escape(contraction) + suffix, flags=re.IGNORECASE)
            compiled.append((pattern, full_form))
        _CONTRACTION_PATTERN_CACHE[cache_key] = compiled
    return compiled


def normalize_text_watch(text, contractions=None):
    """Normalise one student explanation and return ``(original, cleaned)``.

    Steps applied to the lower-cased text: collapse whitespace, expand the
    entries of the contraction table, split ``word.word`` / ``word,word``,
    tighten fractions (``3 / 9`` -> ``3/9``), drop every character outside the
    allowed set, remove punctuation dots and commas, and finally collapse
    whitespace again.

    Args:
        text: Raw explanation. Anything that is not a ``str`` (e.g. NaN)
            yields ``("", "")``.
        contractions: Optional mapping ``snippet -> replacement``. Defaults to
            the module-level ``contractions_dict``.

    Returns:
        tuple[str, str]: the stripped original text and its cleaned form.
    """
    if not isinstance(text, str):
        return "", ""

    if contractions is None:
        contractions = contractions_dict

    original = text.strip()
    # Collapse whitespace up front so multi-word keys ("1 3rd") also match
    # when the student typed several spaces.
    text_expanded = re.sub(r"\s+", " ", original.lower())

    for pattern, full_form in _compile_contractions(contractions):
        # A callable replacement inserts the value literally; a plain string
        # would have backslashes / group references interpreted by re.sub.
        text_expanded = pattern.sub(lambda _match, _literal=full_form: _literal, text_expanded)

    text_expanded = re.sub(r"(?<=[a-z])\.(?=[a-z])", " ", text_expanded)
    text_expanded = re.sub(r"(?<=[a-z]),(?=[a-z])", " ", text_expanded)
    text_expanded = re.sub(r"(\d)\s*/\s*(\d)", r"\1/\2", text_expanded)
    text_expanded = re.sub(r"[^a-z0-9\s/=\*\+\-%\.\,\(\)\[\]\:;]", "", text_expanded)

    # Remove sentence punctuation but keep decimal points: deleting every "."
    # would turn "0.5" into "05". A dot survives only when a digit follows it
    # and it is not attached to a preceding letter or punctuation mark.
    text_expanded = re.sub(r"(?<=[^\d\s])\.|\.(?!\d)", "", text_expanded)
    text_expanded = text_expanded.replace(",", "")

    # The steps above can leave double or trailing spaces behind.
    text_expanded = re.sub(r"\s+", " ", text_expanded).strip()

    return original, text_expanded


# Normalise every explanation; each cell becomes an (original, cleaned) tuple.
# Cells that already hold such a tuple are passed through untouched, because
# normalize_text_watch returns ("", "") for any non-string input and re-running
# this cell would otherwise blank out the whole column.
train["StudentExplanation"] = train["StudentExplanation"].apply(
    lambda cell: cell if isinstance(cell, tuple) else normalize_text_watch(cell)
)
print("Cleaning completed.")
rows_after_cleaning = len(train)
print(f"Rows after cleaning: {rows_after_cleaning} "
      f"(cleaned {rows_after_length_filter - rows_after_cleaning} rows from previous step)")

# Duplicates are detected on the whole (original, cleaned) tuple here.
train = train.drop_duplicates(subset=["StudentExplanation"]).copy()
rows_after_dedup = len(train)
print(f"Rows after deduplication: {rows_after_dedup} "
      f"(removed {rows_after_cleaning - rows_after_dedup} duplicates, "
      f"total cleaned: {total_rows - rows_after_dedup})")

In [None]:
# Enchant dictionary for US English, used to check whether a word is spelled correctly.
dict_en = enchant.Dict("en_US")

# Math vocabulary that the spelling step accepts as-is (membership is tested on
# the lower-cased word) instead of handing it to the automatic corrector.
WHITELIST = {
    "decimal", "denominator", "numerator", "equivalent",
    "fraction", "fractions", "simplify", "simplified",
    "multiplying", "dividing", "multiply", "divide",
    "factor", "factors", "percent", "percentage",
    "expression", "equation", "variable", "coefficient", "lcm"
}

# Regex (matched against the whole word) -> canonical spelling of a math term.
SPECIAL_FIX = {
    r"d[eo]nom[a-z]*": "denominator",
    r"numer[a-z]*": "numerator",
    r"equiv[a-z]*": "equivalent",
}


def apply_special_fix(word):
    """Return the canonical term whose pattern fully matches ``word``, else ``None``.

    Patterns are tried in the order they appear in ``SPECIAL_FIX``; matching is
    case-sensitive, so callers are expected to pass a lower-cased word.
    """
    matches = (
        canonical
        for regex, canonical in SPECIAL_FIX.items()
        if re.fullmatch(regex, word)
    )
    return next(matches, None)


def smart_correct_spelling(text):
    """Spell-correct the purely alphabetic words of ``text``.

    The text is split into tokens (decimal numbers, alphanumeric/fraction runs,
    and single punctuation characters). Only tokens made of letters are
    examined, in this order:

    1. whitelisted or dictionary-valid words are kept unchanged;
    2. misspellings of key math terms are mapped by ``apply_special_fix``;
    3. everything else goes through TextBlob, whose suggestion is rejected
       when it changes the word length by more than 3 characters.

    Valid words are checked *before* the special-fix patterns so that real
    words such as "numerous", "numerical" or "denominators" are not collapsed
    into "numerator" / "denominator".

    Args:
        text: Text to correct. Non-string input yields ``""``.

    Returns:
        str: tokens re-joined with single spaces, except that no space is
        inserted next to a token starting or ending with ``-``.
    """
    if not isinstance(text, str):
        return ""

    # Decimal numbers come first so "0.5" stays one token instead of "0 . 5".
    tokens = re.findall(r"\d*(?:\.\d+)+|[a-zA-Z0-9/]+|\S", text)
    corrected_words = []

    for token in tokens:
        # Numbers, fractions, mixed alphanumerics and punctuation pass through.
        if not re.fullmatch(r"[a-zA-Z]+", token):
            corrected_words.append(token)
            continue

        lowered = token.lower()

        if lowered in WHITELIST or dict_en.check(token):
            corrected_words.append(token)
            continue

        fixed = apply_special_fix(lowered)
        if fixed:
            corrected_words.append(fixed)
            continue

        suggestion = str(TextBlob(token).correct())
        # A big length change usually means the corrector picked an unrelated word.
        if abs(len(suggestion) - len(token)) > 3:
            corrected_words.append(token)
        else:
            corrected_words.append(suggestion)

    pieces = []
    for index, word in enumerate(corrected_words):
        glued = word.startswith("-") or (index > 0 and corrected_words[index - 1].endswith("-"))
        if index > 0 and not glued:
            pieces.append(" ")
        pieces.append(word)
    return "".join(pieces)

def correct_spelling(text):
    """Return ``text`` spell-corrected; thin alias that delegates to ``smart_correct_spelling``."""
    return smart_correct_spelling(text)


def process_in_parallel(texts, func, num_workers=4, chunksize=None):
    """Apply ``func`` to every item of ``texts`` in worker processes.

    Args:
        texts: Iterable of inputs; it is materialised into a list so that any
            iterable (not only sized sequences) is accepted.
        func: Callable applied to each item. It runs in a
            ``ProcessPoolExecutor``, so it must be usable from worker processes.
        num_workers: Maximum number of worker processes.
        chunksize: Number of items shipped to a worker per task. ``None``
            (default) picks roughly four chunks per worker; sending one tiny
            string per task makes inter-process overhead dominate.

    Returns:
        list: results in the same order as ``texts``.
    """
    texts = list(texts)
    if not texts:
        # Nothing to do: avoid spawning a process pool for an empty workload.
        return []

    if chunksize is None:
        chunksize = max(1, len(texts) // ((num_workers or 1) * 4))

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        mapped = executor.map(func, texts, chunksize=chunksize)
        return list(tqdm(mapped, total=len(texts), desc="Correcting"))

# Pull the cleaned text out of the (original, cleaned) tuples; plain values pass through.
train["clean_exp"] = [
    cell[1] if isinstance(cell, tuple) else cell
    for cell in train["StudentExplanation"]
]
# Row count taken before correction; it is the baseline for the final report below.
rows_after_correction = len(train)
train["clean_exp"] = process_in_parallel(train["clean_exp"].tolist(), smart_correct_spelling, num_workers=4)

# Deduplicate on the corrected text, then let it replace the explanation column.
train = train.drop_duplicates(subset=["clean_exp"]).copy()
train["StudentExplanation"] = train.pop("clean_exp")
print(f"Rows after spelling correction and deduplication: {len(train)}")

train = train.drop_duplicates(subset=["StudentExplanation"]).copy()
rows_after_final_dedup = len(train)
print(f"Rows after final deduplication: {rows_after_final_dedup} "
      f"(removed {rows_after_correction - rows_after_final_dedup} duplicates)")

In [None]:
def clean_latex(text):
    """Strip LaTeX delimiters and backslashes from ``text`` and tidy its spacing.

    In order: remove ``\\(``, ``\\)``, ``\\[``, ``\\]``; remove remaining
    backslashes; drop whitespace before ``, . ! ? ; :``; glue a lone digit to
    a following lone letter and a lone letter to a following lone digit;
    collapse whitespace runs and trim the ends.
    """
    without_delimiters = re.sub(r'\\[()\[\]]', '', text)
    cleaned = without_delimiters.replace("\\", "")

    spacing_rules = (
        (r'\s+([,.!?;:])', r'\1'),
        (r'(?<=\b\d)\s+(?=[a-zA-Z]\b)', ''),
        (r'(?<=\b[a-zA-Z])\s+(?=\d\b)', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in spacing_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Strip LaTeX markup from the question text and from the multiple-choice answer.
for _latex_column in ("QuestionText", "MC_Answer"):
    train[_latex_column] = train[_latex_column].apply(clean_latex)

In [None]:
# Missing misconception labels are replaced by the literal string "NA".
train["Misconception"] = train["Misconception"].fillna("NA")
def normalize_label(label):
    """Canonicalise a misconception label to lower ``snake_case``.

    Missing values and the "NA" sentinel (ignoring surrounding whitespace and
    letter case) map to ``"NA"``. The sentinel is tested *after* stripping so
    that a padded " NA " does not turn into a separate "na" class. Every run
    of whitespace, hyphens or underscores becomes a single underscore.

    Args:
        label: Raw label; non-string values are converted with ``str()``.

    Returns:
        str: ``"NA"`` or the normalised label.
    """
    if pd.isna(label):
        return "NA"

    label = str(label).strip()
    if label.upper() == "NA":
        return "NA"

    # One pass handles spaces, tabs/newlines, hyphens and repeated underscores.
    return re.sub(r"[\s\-_]+", "_", label.lower())

# Canonicalise every label, then fold known aliases into a single class name.
mapping = { "inversion": "inverse_operation" }
normalized_labels = train["Misconception"].apply(normalize_label)
train["Misconception"] = normalized_labels.apply(lambda label: mapping.get(label, label))

# Report the resulting label set.
misconception_labels = train["Misconception"]
print("Unique Misconception groups:")
print(misconception_labels.unique())
print("Number of unique Misconception:", misconception_labels.nunique())

In [None]:
# Join question, chosen answer and explanation into one " || "-separated string per row.
# NOTE(review): a missing value in any of the three columns presumably makes the
# combined value missing as well -- confirm none remain at this point.
train["Combined"] = train['QuestionText'] + " || " + train['MC_Answer'] + " || " + train['StudentExplanation']

In [None]:
# Write the processed frame under the data folder; the path is built once and reused.
processed_csv_path = BASE_DATA_PATH + '/processed/train_v2.csv'
train.to_csv(processed_csv_path, index=False, encoding="utf-8")
print("Saved processed DataFrame to CSV:", processed_csv_path)