In [1]:
import pandas as pd
import re
from typing import List

In [2]:
# Location of the raw input CSV.
# NOTE(review): absolute, machine-specific path; the notebook cannot run on
# another machine without editing this line.
path_in = "C:\\Users\\87738\\Desktop\\ERP\\math_anxiety_2020_2024.csv"
# Load the whole file with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=path_in)

In [3]:
# Keep only rows whose "lang" value is "en" (case-insensitive). The filter is
# skipped entirely when the frame has no "lang" column.
if "lang" in df.columns:
    df = df.loc[df["lang"].astype(str).str.lower() == "en"].copy()
# NOTE(review): a "matches_query" column may also be present; no filtering is
# applied on it here. Confirm whether a filter on that column was intended.

In [4]:
# Pre-compiled patterns used by the text-cleaning step.
# http(s):// links and bare "www." links, up to the next whitespace.
url_re = re.compile(r"https?://\S+|www\.\S+")
# "@" followed by word characters (user mentions).
mention_re = re.compile(r"@\w+")
# "#" followed by word characters; the whole hashtag matches, word included.
hashtag_re = re.compile(r"#\w+")
# Any run of characters outside the 7-bit ASCII range.
non_ascii_re = re.compile(r"[^\x00-\x7F]+")
# Any run of whitespace (spaces, tabs, newlines).
multi_space_re = re.compile(r"\s+")

In [5]:
def basic_clean(text: str) -> str:
    """Normalize one raw text value into lowercase ASCII text.

    Steps, in order: normalize typographic apostrophes, blank out URLs,
    @mentions and #hashtags, blank out remaining non-ASCII characters, blank
    out every symbol other than letters, digits, ``. , ! ? ' -`` and
    whitespace, lowercase, then collapse whitespace runs and trim.

    Args:
        text: Raw text. Any non-``str`` value is treated as missing.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a ``str`` or when
        nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""
    # Map curly apostrophes (U+2018 / U+2019) to ASCII "'" first. Otherwise the
    # non-ASCII pass below turns "don’t" into "don t" and the contraction is lost.
    t = text.replace("\u2019", "'").replace("\u2018", "'")
    t = url_re.sub(" ", t)
    t = mention_re.sub(" ", t)
    t = hashtag_re.sub(" ", t)
    t = non_ascii_re.sub(" ", t)  # remove non-ascii (non-English chars)
    # remove special symbols except common punctuation
    t = re.sub(r"[^A-Za-z0-9\.\,\!\?\'\-\s]", " ", t)
    t = t.lower()
    # Substitutions above insert spaces; squeeze them and trim the ends.
    t = multi_space_re.sub(" ", t).strip()
    return t

# Use the "text" column when present; otherwise fall back to the first column.
text_col = "text" if "text" in df.columns else df.columns[0]
# astype(str) keeps non-string values cleanable, but it also turns missing
# values into the literal text "nan"/"None". Blank those rows explicitly so
# they are removed by the empty-row filter below instead of surviving as text.
df["clean_text"] = (
    df[text_col].astype(str).map(basic_clean).where(df[text_col].notna(), "")
)

# Drop rows that became empty after cleaning
df = df[df["clean_text"].str.len() > 0].copy()

In [6]:
# A token is a run of letters (optionally followed by an apostrophe and more
# letters, e.g. "don't") or a run of digits.
token_re = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?|\d+")


def tokenize(s: str) -> List[str]:
    """Return every word/number token found in ``s``, in order of appearance."""
    return [match.group(0) for match in token_re.finditer(s)]

# Tokenize each cleaned text; every row receives a list of string tokens.
df["tokens"] = df["clean_text"].apply(tokenize)

In [7]:
# Optional spaCy lemmatization (best effort).
# After this cell:
#   lemmas_method -- "spacy_lemmatize_en_core_web_sm" on success, else None
#   lemmas        -- one list of lemma strings per row of df, or [] on failure
#   nlp           -- the loaded pipeline, or None when spaCy is unusable
lemmas_method = None
lemmas = []
nlp = None
try:
    # spaCy is an optional dependency, so it is imported here under a guard.
    import spacy

    # A blank English pipeline carries no lemmatizer component, so there is no
    # useful spaCy fallback when the trained model cannot be loaded.
    nlp = spacy.load("en_core_web_sm")
except Exception:
    # spaCy or its model is unavailable; leave lemmas_method as None.
    nlp = None

if nlp is not None:
    try:
        # spaCy re-tokenizes the joined text, so its token boundaries may
        # differ from the regex tokens stored in df["tokens"].
        joined = df["tokens"].map(" ".join).tolist()
        # Consume the pipe lazily instead of materializing every Doc at once.
        lemmas = [
            [tok.lemma_ or tok.text for tok in doc]
            for doc in nlp.pipe(joined, batch_size=500)
        ]
        lemmas_method = "spacy_lemmatize_en_core_web_sm"
    except Exception:
        # Discard partial results so nothing half-built is left behind.
        nlp = None
        lemmas = []
        lemmas_method = None

In [8]:
# Fallback lemmatization: runs only when lemmas_method is still None, i.e. no
# earlier step produced lemmas.
if lemmas_method is None:
    try:
        import nltk
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        def nltk_lemmatize(tokens):
            """Lemmatize each token using lemmatize()'s default part of speech."""
            return [lemmatizer.lemmatize(tok) for tok in tokens]
        df["lemmas"] = df["tokens"].map(nltk_lemmatize)
        lemmas_method = "nltk_wordnet_lemmatizer"
    except Exception:
        # Any failure above (for example an import error or a failed WordNet
        # lookup) falls back to stemming. PorterStemmer must be imported here:
        # it is not imported anywhere else. If NLTK itself is missing, this
        # import raises ImportError.
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        def porter_stem(tokens):
            """Stem each token with the Porter stemmer."""
            return [stemmer.stem(tok) for tok in tokens]
        df["lemmas"] = df["tokens"].map(porter_stem)
        lemmas_method = "nltk_porter_stemmer_fallback"
# Attach lemmas computed outside this cell. The test is on lemmas_method, not
# on the truthiness of `lemmas`, so an empty frame still gets its (empty)
# "lemmas" column. The column check keeps the cell safe to re-run.
if lemmas_method is not None and "lemmas" not in df.columns:
    df["lemmas"] = lemmas
# Space-joined lemmas, one string per row.
df["processed_text"] = df["lemmas"].map(" ".join)

# NOTE(review): absolute, machine-specific output path.
out_path = r"C:\Users\87738\Documents\math_anxiety_tweets_preprocessed.csv"
df.to_csv(out_path, index=False)
