In [2]:
import re
import json
import spacy
import unicodedata
import pandas as pd
from datasets import load_dataset
from nltk.stem import SnowballStemmer
import openpyxl

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Load Excel and clean column names
src_file = "Task Statements.xlsx"
df = pd.read_excel(src_file)
# Re-bind the renamed frame instead of mutating in place, and strip only string
# headers so a non-string header (e.g. a number) cannot raise AttributeError.
df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

# Set relevant columns
task_col = "Task"
title_col = "Title"

# Fail early and name every missing column, rather than on the first lookup below.
missing_cols = [col for col in (task_col, title_col) if col not in df.columns]
if missing_cols:
    raise KeyError(f"{src_file!r} is missing expected column(s): {missing_cols}")

# Show one example per column; an empty sheet has no row 0 to show.
if df.empty:
    print(f"{src_file} contains no rows")
else:
    print("Task column:", task_col, "example ->", df[task_col].iloc[0])
    print("Title column:", title_col, "example ->", df[title_col].iloc[0])

# NLP Preparation
nlp = spacy.load("en_core_web_sm")
stemmer = SnowballStemmer("english")

def normalize(text: str) -> str:
    """Return a lowercased, accent-free, whitespace-collapsed copy of ``text``.

    Processing order:
      1. ``None`` and float NaN are treated as missing and yield ``""``.
      2. The value is converted with ``str()`` and NFKD-decomposed; combining
         marks (accents) are dropped.
      3. The text is lowercased.
      4. URLs (``http://``, ``https://``, ``www.``) and e-mail addresses are removed.
      5. ``.``, ``!`` and ``?`` are replaced by spaces.
      6. Every standalone run of digits is prefixed with ``NUM_``.
      7. Whitespace runs are collapsed to one space and the ends are trimmed.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.

    Returns:
        The normalized string, or ``""`` for a missing value.
    """
    # Without this guard str() would turn a missing value into the literal
    # word "none" / "nan", which would pass through as if it were real text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = unicodedata.normalize("NFKD", str(text))
    text = "".join(c for c in text if not unicodedata.combining(c))
    text = text.lower()
    text = re.sub(r"https?://\S+|www\.\S+|\b[\w\.-]+@[\w\.-]+\.\w+\b", "", text)
    # NOTE(review): this also splits decimals ("2.5" -> "2 5"), so the next step
    # tags them as two separate numbers — confirm that is intended.
    text = re.sub(r"[.!?]", " ", text)
    # The tag is inserted after lowercasing, so it stays upper-case here.
    text = re.sub(r"\b(\d+)\b", r"NUM_\1", text)
    return re.sub(r"\s+", " ", text).strip()

def standardize(text: str) -> str:
    """Normalize ``text``, run it through ``nlp`` and return its kept lemmas.

    A token is kept when it is neither a stop word nor punctuation and its
    lemma is not empty or whitespace-only. Kept lemmas are lowercased and
    joined with single spaces.

    Args:
        text: Raw text; it is passed through ``normalize`` first.

    Returns:
        The space-separated, lowercased lemmas of the kept tokens.
    """
    kept_lemmas = []
    for token in nlp(normalize(text)):
        # Drop stop words and punctuation outright.
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        # Skip tokens whose lemma carries no visible characters.
        if not lemma.strip():
            continue
        kept_lemmas.append(lemma.lower())
    return " ".join(kept_lemmas)

# Apply standardization
# Map missing tasks to "" first: a plain astype(str) would turn NaN into the
# literal word "nan", which would then be standardized as if it were real text.
task_text = df[task_col].map(lambda value: "" if pd.isna(value) else str(value))
df["task_standardized"] = task_text.apply(standardize)

# Select columns to save
out_file = "Task_Statements_standardized.csv"  # single source for the path used below
out_cols = ["O*NET-SOC Code", title_col, task_col, "task_standardized"]
df[out_cols].to_csv(out_file, index=False, encoding="utf-8")

print(f"Saved to {out_file}  (total {len(df)} rows)")


Task column: Task example -> Direct or coordinate an organization's financial or budget activities to fund operations, maximize investments, or increase efficiency.
Title column: Title example -> Chief Executives
Saved to Task_Statements_standardized.csv  (total 18796 rows)
