In [1]:
import os
import re
import json
import gzip
import pickle
import fasttext
import spacy
from pathlib import Path
from typing import Optional, Dict, Any, List

from datasets import load_dataset
from tqdm import tqdm

from dolma import BaseTagger, add_tagger
from dolma.core.data_types import DocResult, Document, Span
from collections import Counter, defaultdict


OUTPUT_TOKENIZED_DIR = Path("data/output/tokenized_c4")
OUTPUT_TOKENIZED_DIR.mkdir(parents=True, exist_ok=True)

PROGRESS_PATH = Path("data/state/c4_progress.json")
PROGRESS_PATH.parent.mkdir(parents=True, exist_ok=True)

MODEL_PATH = "utils/autotuned_fasttext_model.bin"
LOCAL_C4_FOLDER = Path("/Users/kaionamartinson/Desktop/Cultural-Analytics/dolma/c4")


SHARD_SIZE = 500_000

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "utils/autotuned_fasttext_model.bin"

try:
    # Load the FastText model
    model = fasttext.load_model(model_path)
    print(f"FastText model loaded successfully from {model_path}")
except ValueError as e:
    print(
        f"Error loading model: {e}. It might be that the file is corrupted or not a valid FastText model."
    )
except FileNotFoundError:
    print(f"Error: Model file not found at {model_path}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

FastText model loaded successfully from utils/autotuned_fasttext_model.bin


### Utility Functions & Packages

In [3]:
import pickle
from pathlib import Path

AAPI_KEYWORDS_PATH = Path(
    "/Users/kaionamartinson/Desktop/Cultural-Analytics/Cultural-Analytics-AAPI/data/aapiGroups.pkl"
)

with AAPI_KEYWORDS_PATH.open("rb") as f:
    AAPI_KEYWORDS = pickle.load(f)

# Safety check
if not isinstance(AAPI_KEYWORDS, set):
    AAPI_KEYWORDS = set(AAPI_KEYWORDS)

print(f"Loaded {len(AAPI_KEYWORDS)} AAPI keywords")

Loaded 66 AAPI keywords


In [4]:
def load_model():
    try:
        # Load the FastText model
        model = fasttext.load_model(MODEL_PATH)
        return model
        print(f"FastText model loaded successfully from {MODEL_PATH}")
    except ValueError as e:
        print(
            f"Error loading model: {e}. It might be that the file is corrupted or not a valid FastText model."
        )
    except FileNotFoundError:
        print(f"Error: Model file not found at {MODEL_PATH}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [5]:
def _safe_here() -> Path:
    """Return a reasonable 'here' directory, even in notebooks/REPL."""
    if "__file__" in globals():
        return Path(__file__).resolve()
    return Path.cwd()


def _find_pickle(explicit: Optional[str]) -> Path:
    """
    Find the aapiGroups.pkl file.

    Search order:
      1. Explicit path argument
      2. Env var: AAPI_KEYWORDS_PICKLE
      3. Common repo-relative locations
    """
    tried: List[str] = []

    def _check(path: Path) -> Optional[Path]:
        tried.append(str(path))
        if path.exists():
            return path.resolve()
        return None

    if explicit:
        p = Path(explicit).expanduser().resolve()
        found = _check(p)
        if found:
            return found

    env_path = os.environ.get("AAPI_KEYWORDS_PICKLE")
    if env_path:
        p = Path(env_path).expanduser().resolve()
        found = _check(p)
        if found:
            return found

    here = _safe_here()
    repo_root = here.parents[2] if len(here.parents) >= 3 else here

    candidates = [
        repo_root / "data" / "aapiGroups.pkl",
        here.parent / "data" / "aapiGroups.pkl",
        repo_root / "utils" / "data" / "aapiGroups.pkl",
        Path.cwd() / "data" / "aapiGroups.pkl",
    ]

    for cand in candidates:
        found = _check(cand)
        if found:
            return found

    raise FileNotFoundError(
        "Could not locate aapiGroups.pkl. Searched:\n  - "
        + "\n  - ".join(tried)
        + "\nTip: set env var AAPI_KEYWORDS_PICKLE=/abs/path/to/aapiGroups.pkl "
        "or pass keyword_pickle=... when constructing AAPIKeywordsTagger."
    )

### Dolma Tagger

In [6]:
@add_tagger("aapi_keywords_v1")
class AAPIKeywordsTagger(BaseTagger):
    """
    Tags documents that contain any AAPI-related keyword loaded from a pickle.
    """

    def __init__(self, keyword_pickle: Optional[str] = None) -> None:
        super().__init__()

        self.keyword_pickle = _find_pickle(keyword_pickle)

        with self.keyword_pickle.open("rb") as f:
            raw_terms = pickle.load(f)

        terms = [str(t).strip().lower() for t in list(raw_terms)]
        if not terms:
            raise ValueError(f"No terms found in {self.keyword_pickle}")

        pattern = r"\b(" + "|".join(re.escape(t) for t in terms) + r")\b"
        self.regex = re.compile(pattern, flags=re.IGNORECASE)

    def predict(self, doc: Document) -> DocResult:
        text = doc.text or ""
        matches = self.regex.findall(text)
        if not matches:
            # no matches → score 0
            span = Span(start=0, end=0, type="aapi_keyword", score=0.0)
            return DocResult(doc=doc, spans=[span])

        # unique matches → score is count of unique AAPI terms present
        unique_matches = {m.lower() for m in matches}
        score = float(len(unique_matches))

        span = Span(
            start=0,
            end=len(text),
            type="aapi_keyword",
            score=score,
        )
        return DocResult(doc=doc, spans=[span])

### Dolma Mixer

In [7]:
def mix_aapi_doc(result: DocResult) -> Optional[Dict[str, Any]]:
    """
    Given a DocResult from AAPIKeywordsTagger, return a JSON-serializable dict
    if score > 0, else return None (filter out the doc).
    """
    doc = result.doc
    spans = result.spans or []

    score = 0.0
    if spans:
        score = float(spans[0].score or 0.0)

    if score <= 0.0:
        return None

    return {
        "id": doc.id,
        "text": doc.text,
        # "source": getattr(doc, "source", None),
        # "aapi_score": score,
        # "aapi_spans": [
        #     {
        #         "start": s.start,
        #         "end": s.end,
        #         "type": s.type,
        #     }
        #     for s in spans
        # ],
    }

### Tokenizer

In [8]:
class AAPITokenizer:

    def __init__(self, keyword_pickle: Optional[str] = None) -> None:
        super().__init__()

        self.keyword_pickle = _find_pickle(keyword_pickle)
        with self.keyword_pickle.open("rb") as f:
            raw_terms = pickle.load(f)
        terms = [str(t).strip().lower() for t in list(raw_terms)]
        if not terms:
            raise ValueError(f"No terms found in {self.keyword_pickle}")

        self.aapi_groups_set = set(terms)

        self.nlp = spacy.load("en_core_web_sm")
        for token_text in self.aapi_groups_set:
            self.nlp.tokenizer.add_special_case(token_text, [{"ORTH": token_text}])

    def tokenize(self, data: Dict[str, Any]) -> Dict[str, Any]:
        return self.nlp(data["text"])  # .to_json() changed this

### Process Helpers

In [9]:
def save_progress(count: int) -> None:
    """
    Save how many C4 docs we've fully traversed/processed (for resume).
    """
    PROGRESS_PATH.parent.mkdir(parents=True, exist_ok=True)
    with PROGRESS_PATH.open("w", encoding="utf-8") as f:
        json.dump({"c4_docs_done": count}, f)


def load_progress() -> int:
    """
    Load how many C4 docs we've previously processed. Returns 0 if none.
    """
    if PROGRESS_PATH.exists():
        with PROGRESS_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
        return int(data.get("c4_docs_done", 0))
    return 0


def open_new_shard(out_dir: Path, shard_idx: int) -> gzip.GzipFile:
    """
    Open a new gzip'd JSONL shard for writing mixed docs.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    shard_path = out_dir / f"mixed.{shard_idx:09d}.jsonl.gz"
    return gzip.open(shard_path, "wt", encoding="utf-8")

### Verbs & Adjs

In [10]:
def build_group_lexicon(groups):
    """
    Map surface forms -> canonical ethnicity.
    Handles simple plurals like 'filipinos' -> 'filipino'.
    """
    lex = {}
    for g in groups:
        g_l = g.lower()
        lex[g_l] = g_l
        lex[g_l + "s"] = g_l  # naive plural
    return lex

Adjectives

In [11]:
STOP_ADJ = {"north", "south", "east", "west", "central"}


def is_directional(adj_lemma: str) -> bool:
    return adj_lemma in STOP_ADJ or any(s in adj_lemma for s in STOP_ADJ)


def get_ethnicity_cache(sentence, group_lexicon):
    """
    Precompute ethnicity tokens in the sentence.
    Returns:
      - eth_by_i: token index -> group
      - groups_in_sentence: set of group strings
    """
    eth_by_i = {}
    groups_in_sentence = set()
    for tok in sentence:
        form = tok.text.lower()
        eth = group_lexicon.get(form)
        if eth:
            eth_by_i[tok.i] = eth
            groups_in_sentence.add(eth)
    return eth_by_i, groups_in_sentence


def get_subjects_for_pred(pred):
    """
    Find subjects (nsubj/nsubjpass) associated with this predicate
    (either the ADJ itself or its governing AUX/VERB).
    """
    subs = [c for c in pred.children if c.dep_ in {"nsubj", "nsubjpass"}]
    if subs:
        return subs

    head = pred.head
    visited = set()
    while head is not None and head not in visited:
        visited.add(head)
        subs = [c for c in head.children if c.dep_ in {"nsubj", "nsubjpass"}]
        if subs:
            return subs
        if head.dep_ in {"conj", "acomp", "attr"} or head.pos_ == "AUX":
            head = head.head
        else:
            break
    return []


# -------------------------
# 3. Main collector
# -------------------------
def collect_adj(sentence, group_lexicon, ethnicity_dict):
    # Precompute ethnicity info for this sentence
    eth_by_i, groups_in_sentence = get_ethnicity_cache(sentence, group_lexicon)
    if not groups_in_sentence:
        return  # no relevant groups, skip

    for adj in sentence:
        if adj.pos_ != "ADJ":
            continue

        lemma = adj.lemma_.lower()
        if not lemma or len(lemma) <= 2:
            continue
        if is_directional(lemma):
            continue
        if adj.ent_type_ == "NORP":  # don't count ethnicity adjectives themselves
            continue

        subjects = get_subjects_for_pred(adj)
        attached = False

        # 1) Attach via subject subtrees
        for subj_tok in subjects:
            for tok in subj_tok.subtree:
                eth = eth_by_i.get(tok.i)
                if eth is None:
                    continue
                counter = ethnicity_dict.get(eth)
                if counter is None:
                    counter = ethnicity_dict[eth] = Counter()
                counter[lemma] += 1
                attached = True

        # 2) Fallback: no subject found, but groups exist in sentence
        # Useful for weird fragments like "Filipinos hardworking."
        if not attached:
            for eth in groups_in_sentence:
                counter = ethnicity_dict.get(eth)
                if counter is None:
                    counter = ethnicity_dict[eth] = Counter()
                counter[lemma] += 1

verbs

In [12]:
def get_subjects_for_pred(pred):
    subs = [c for c in pred.children if c.dep_ in {"nsubj", "nsubjpass"}]
    if subs:
        return subs

    head = pred.head
    visited = set()
    while head is not None and head not in visited:
        visited.add(head)
        subs = [c for c in head.children if c.dep_ in {"nsubj", "nsubjpass"}]
        if subs:
            return subs
        if head.dep_ in {"conj", "acomp", "attr"} or head.pos_ == "AUX":
            head = head.head
        else:
            break
    return []


BAD_VERBS = {"be", "have", "do"}
BAD_LEMMAS = {"orient"}  # add more if needed


def collect_verb(sentence, group_lexicon, ethnicity_dict, use_fallback=False):
    eth_by_i = {}
    groups_in_sentence = set()

    for tok in sentence:
        eth = group_lexicon.get(tok.text.lower())
        if eth:
            eth_by_i[tok.i] = eth
            groups_in_sentence.add(eth)

    if not groups_in_sentence:
        return

    for token in sentence:
        if token.pos_ not in {"VERB", "AUX"}:
            continue

        lemma = token.lemma_.lower()

        if lemma in BAD_VERBS or lemma in BAD_LEMMAS:
            continue

        if token.tag_ not in {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}:
            continue

        # ---- NEW: relative clause verbs ----
        if token.dep_ == "relcl":
            head = token.head
            for tok in head.subtree:
                eth = group_lexicon.get(tok.text.lower())
                if eth:
                    ethnicity_dict.setdefault(eth, Counter())[lemma] += 1
            continue

        # ---- NEW: participle / reduced clause verbs ----
        if token.dep_ in {"acl", "acl:relcl"}:
            head = token.head
            for tok in head.subtree:
                eth = group_lexicon.get(tok.text.lower())
                if eth:
                    ethnicity_dict.setdefault(eth, Counter())[lemma] += 1
            continue

        subjects = get_subjects_for_pred(token)
        attached = False

        for subj in subjects:
            for tok in subj.subtree:
                eth = eth_by_i.get(tok.i)
                if eth:
                    ethnicity_dict.setdefault(eth, Counter())[lemma] += 1
                    attached = True

        if not attached and use_fallback and len(groups_in_sentence) == 1:
            eth = next(iter(groups_in_sentence))
            ethnicity_dict.setdefault(eth, Counter())[lemma] += 1

### Windows and nouns

window

In [13]:
def window_mask_sentence(ethnicity_term, sent_tokens, tokens_lower, window=5):
    """
    sent_tokens: list of token strings (in order)
    aapi_groups_set: set of lowercase group terms
    window: number of tokens to keep on each side

    Returns: a masked window string or None if no ethnicity term found.
    """

    # Find its index in the ordered list
    idx = tokens_lower.index(ethnicity_term)

    # Mask token
    tokens_masked = sent_tokens[:]  # copy list
    tokens_masked[idx] = "[ETHNICITY]"

    # Window boundaries
    start = max(0, idx - window)
    end = min(len(tokens_masked), idx + window + 1)
    # final join
    return " ".join(tokens_masked[start:end])

nouns

In [14]:
def ethnicity_modified_nouns(sentence, aapi_groups_set=None):
    """
    Find nouns modified by an ethnicity term.
    Returns: list of (ethnicity, noun) pairs.

    aapi_groups_set: optional set of lowercase ethnicity strings to restrict to.
                     If None, we use spaCy's NORP tag only.
    """
    for token in sentence:
        # we want the head noun: "girls", "women", "doctors"
        if token.pos_ != "NOUN":
            continue

        noun_lemma = token.lemma_.lower()

        for child in token.children:
            # modifier must be attached to the noun
            if child.dep_ not in {"amod", "compound"}:
                continue

            child_lower = child.text.lower()

            if not (child.ent_type_ == "NORP") or (
                aapi_groups_set is not None and child_lower in aapi_groups_set
            ):
                continue

            return noun_lemma

### Actual Process

In [15]:
# packages
nlp = spacy.load("en_core_web_sm")
#

In [16]:
model = load_model()

model.predict("new cat")

(('__label__0',), array([1.00000882]))

In [17]:
def save_artifacts_safely(
    noun_heads_counter, aapi_counter_pass, verb_ethnicity_dict, adj_ethnicity_dict
):
    """
    Safely save artifacts so crashes don't corrupt your file.
    Uses atomic replace: either old file or new file, never half-written.
    """

    data = {
        "noun_heads_counter": noun_heads_counter,
        "aapi_counter_pass": aapi_counter_pass,
        "verb_ethnicity_dict": verb_ethnicity_dict,
        "adj_ethnicity_dict": adj_ethnicity_dict,
    }

    tmp_path = "ethnicity_artifacts.pkl.tmp"
    final_path = "ethnicity_artifacts.pkl"

    # Write to a temp file first
    with open(tmp_path, "wb") as f:
        pickle.dump(data, f)
        f.flush()
        os.fsync(f.fileno())  # force write to disk

    # Atomic replace: either old file or the new one, never half-written
    os.replace(tmp_path, final_path)

In [18]:
from datasets import load_dataset

ARTIFACTS_PATH = Path("ethnicity_artifacts.pkl")

In [19]:
import gzip
import json
from pathlib import Path


def iter_local_c4_files(folder_path):
    """get c4 and it"""
    folder_path = Path(folder_path)

    for gz_file in sorted(folder_path.glob("*.json.gz")):
        print(f"Reading: {gz_file}")

        with gzip.open(gz_file, "rt", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                    yield data
                except Exception as e:
                    print(f"Skipping bad line: {e}")
                    continue
            print("ending reading this file, no more lines")

In [20]:
from tqdm import tqdm

In [21]:
def run_loop(
    noun_heads_counter,
    aapi_counter_pass,
    verb_ethnicity_dict,
    adj_ethnicity_dict,
    out_dirname: Path = OUTPUT_TOKENIZED_DIR,
) -> None:
    """
    Stream Dolma C4 docs, tag them with AAPIKeywordsTagger, filter them via
    mix_aapi_doc, and write the kept ones into sharded .jsonl.gz files.

    Resumable via PROGRESS_PATH (counts how many C4 docs have been processed).
    """
    traversed = 0

    tagger = AAPIKeywordsTagger()
    tokenizer = AAPITokenizer()

    ds = iter_local_c4_files(LOCAL_C4_FOLDER)

    pbar = tqdm(
        total=None,
        desc="Processing C4 docs",
        mininterval=0.5,
        dynamic_ncols=True,
        leave=True,
    )

    docs_in_shard = 0

    for data in ds:
        pbar.update(1)
        doc = Document(
            id=data["id"],
            text=data["text"],
            source=data.get("source"),
        )

        tagged = tagger.predict(doc)
        mixed = mix_aapi_doc(tagged)

        if mixed is not None:
            tokenized = tokenizer.tokenize(mixed)

            for sentence in tokenized.sents:
                # tokenize and lower
                tokens = [t.text for t in sentence]
                tokens_lower = [t.lower() for t in tokens]

                # checking if ethnicity is in sentence and taking those ethnicity if true
                overlap = set(tokens_lower) & AAPI_KEYWORDS
                if not overlap:
                    continue  # skip if no ethnicity term
                overlap_eth = list(overlap)[0]

                # some collection before filtering
                noun_heads_counter[ethnicity_modified_nouns(sentence, overlap)] += 1
                aapi_counter_pass[overlap_eth] += 1

                # running through my model
                window_text = (
                    window_mask_sentence(overlap_eth, tokens, tokens_lower)
                    .replace("\n", " ")
                    .strip()
                )
                if model.predict(window_text)[0][0] == "__label__0":
                    continue

                aapi_counter_pass[
                    overlap_eth
                ] += 1  # stating that the ethnicity passes model

                lex = build_group_lexicon(overlap)
                collect_verb(sentence, lex, verb_ethnicity_dict)
                collect_adj(sentence, lex, adj_ethnicity_dict)

            if docs_in_shard % 1000 == 0:
                save_artifacts_safely(
                    noun_heads_counter,
                    aapi_counter_pass,
                    verb_ethnicity_dict,
                    adj_ethnicity_dict,
                )

            docs_in_shard += 1

            traversed += 1
            if traversed % 1000 == 0:
                save_progress(traversed)

    pbar.close()
    save_artifacts_safely(
        noun_heads_counter, aapi_counter_pass, verb_ethnicity_dict, adj_ethnicity_dict
    )

    save_progress(traversed)

In [22]:
noun_heads_counter = Counter()
aapi_counter_pass = Counter()
verb_ethnicity_dict = {}
adj_ethnicity_dict = {}

### Running the code

In [23]:
run_loop(noun_heads_counter, aapi_counter_pass, verb_ethnicity_dict, adj_ethnicity_dict)

Processing C4 docs: 0it [00:00, ?it/s]

Reading: /Users/kaionamartinson/Desktop/Cultural-Analytics/dolma/c4/c4-0000.json.gz


Processing C4 docs: 1551317it [3:14:24, 98.11it/s] 

ending reading this file, no more lines
Reading: /Users/kaionamartinson/Desktop/Cultural-Analytics/dolma/c4/c4-0001.json.gz


Processing C4 docs: 1553843it [3:14:51, 101.24it/s]

KeyboardInterrupt: 

Processing C4 docs: 1553941it [3:15:08, 101.24it/s]

In [None]:
import pickle

with open("ethnicity_artifacts.pkl", "rb") as f:
    data = pickle.load(f)

# unpack
noun_heads_counter = data["noun_heads_counter"]
aapi_counter_pass = data["aapi_counter_pass"]
verb_ethnicity_dict = data["verb_ethnicity_dict"]
adj_ethnicity_dict = data["adj_ethnicity_dict"]

In [None]:
adj_ethnicity_dict

Counter({'chinese': 2})



In [None]:
noun_heads_counter

Counter({None: 1091,
         'community': 4,
         'agreement': 3,
         'dish': 2,
         'official': 2,
         'tribe': 2,
         'immigrant': 2,
         'rule': 2,
         'newspaper': 2,
         'delegation': 2,
         'culture': 1,
         'center': 1,
         'attorney': 1,
         'population': 1,
         'wing': 1,
         '~.': 1,
         'audience': 1,
         'ally': 1,
         'card': 1,
         'phalai': 1,
         'baht': 1,
         'petroglyph': 1,
         'frontier': 1,
         'bandit': 1,
         'symbol': 1,
         'citizen': 1,
         'weave': 1,
         'tourist': 1,
         'airline': 1,
         'capital': 1,
         'emperor': 1,
         'front': 1,
         'corporation': 1,
         'government': 1,
         'uranium': 1,
         'city': 1,
         'man': 1,
         'army': 1,
         'country': 1,
         'church': 1,
         'principal': 1,
         'girl': 1,
         'dynasty': 1,
         'coast': 1,
         