In [1]:
import nltk
from datasets import load_dataset
import pandas as pd
import spacy
import ahocorasick

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def sentence_split(batch):
    examples = {
        "sentence": [],
        "id": [],
    }
    for i, text in enumerate(batch["text"]):
        date = int(batch["dump"][i][8:12])
        for idx, sent in enumerate(nltk.sent_tokenize(text)):
            examples["sentence"].append(sent)
            examples["id"].append(batch["id"][i] + f"-{idx}")
    return examples

In [4]:
corpus = "HuggingFaceFW/fineweb-edu"
subset = "sample-10BT"

In [5]:
automaton = ahocorasick.Automaton()
with open("adverbs.txt") as adverbs:
    for idx, line in enumerate(adverbs.readlines()):
        line = line.strip()
        if not line.endswith("ly"):
            continue
        automaton.add_word(line, idx)

automaton.make_automaton()

FileNotFoundError: [Errno 2] No such file or directory: 'adverbs.txt'

In [None]:
data = load_dataset(corpus, name=subset, split="train")
columns = data.column_names
data = data.filter(lambda x: x["language"] == "en", num_proc=4)
data = data.map(sentence_split, remove_columns=columns, batched=True, num_proc=4)
data = data.filter(lambda x: len(x["sentence"]) > 10 and\
                   sum([1 for end_index, val in automaton.iter(x["sentence"].lower())]) == 1, num_proc=4)

Using the latest cached version of the dataset since HuggingFaceFW/fineweb-edu couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'sample-10BT' at /home/martins_32048/.cache/huggingface/datasets/HuggingFaceFW___fineweb-edu/sample-10BT/0.0.0/4863ab07d7520451e6f73e2912ad8bfee7d97c11 (last modified on Thu May  8 04:44:09 2025).


Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

Filter (num_proc=4):   0%|          | 0/372422720 [00:00<?, ? examples/s]

In [None]:
print(data)

Dataset({
    features: ['id', 'sentence'],
    num_rows: 56830279
})


In [None]:
data.to_parquet("all_adverbs.parquet")

Creating parquet from Arrow format:   0%|          | 0/56831 [00:00<?, ?ba/s]

12112775938

In [None]:
from datasets import Dataset

# Load the Parquet file back into a Hugging Face Dataset
data = Dataset.from_parquet("all_adverbs.parquet")

Loading dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
def adverb_position(token):
    """
    Classify adverb position relative to its verbal head.
    Categories: 'sentence-initial', 'pre-verbal', 'post-verbal'.
    """
    # climb to a verbal head if possible
    head = token.head
    while head.pos_ not in {"VERB", "AUX"} and head.head != head:
        head = head.head

    if head.pos_ not in {"VERB", "AUX"}:
        # fallback: just return 'pre-verbal' if not initial
        return "sentence-initial" if token.i == 0 else "pre-verbal"

    if token.i == 0:
        return "sentence-initial"
    elif token.i < head.i:
        return "pre-verbal"
    else:
        return "post-verbal"

In [None]:
# Assumes spaCy Doc, with the *target ADV token passed in*.
# Returns one of the class labels above.

VERB_CLASSES = {
    "motion": {
        "go","come","walk","run","move","travel","drive","fly","ride","swim",
        "jump","climb","crawl","slide","roll","march","leap","hurry","stroll","wander"
    },
    "contact_caused_motion": {
        "hit","strike","kick","push","pull","throw","toss","shove","drag","carry","lift","drop",
        "put","place","set","lay","press","attach","install","insert","stick","fix","raise","lower","load","unload"
    },
    "change_of_state": {
        "break","open","close","shut","melt","freeze","grow","shrink","dry","heal","crack","burn",
        "cool","warm","improve","worsen","increase","decrease","change","become","turn"
    },
    "creation_consumption": {
        "make","build","create","cook","write","compose","draw","paint","produce","generate",
        "eat","drink","read","watch","bake","brew","prepare","design","craft","cook"
    },
    "perception": {
        "see","hear","feel","smell","taste","notice","recognize","perceive","observe",
        "watch","listen","spot","glance","gaze","stare","scan","look"
    },
    "cognition": {
        "think","know","believe","understand","realize","decide","remember","forget",
        "consider","assume","infer","plan","guess","doubt","suppose","figure","learn"
    },
    "psychological": {
        "like","love","hate","enjoy","prefer","fear","miss","appreciate","desire","regret",
        "resent","admire","envy","worry","hope","wish","dread","cherish","value","loathe"
    },
    "communication": {
        "say","tell","ask","speak","talk","mention","report","state","argue","explain",
        "describe","suggest","claim","shout","reply","answer","admit","warn","announce","discuss",
        "note","remark","comment","declare","confess","insist"
    },
    "transfer_possession": {
        "give","get","take","bring","send","offer","receive","buy","sell","pay","lend","borrow",
        "deliver","return","hand","pass","grant","present","allocate","assign","ship","supply"
    },
    "posture_location_existence": {
        "be","remain","stay","sit","stand","lie","exist","occur","happen","persist","live","reside","dwell","remain"
    },
    "aspectual_phase": {
        "begin","start","continue","stop","finish","cease","keep","resume","proceed","remain","persist","try","attempt"
    },
    "causation_manipulation": {
        "make","let","allow","help","force","compel","cause","enable","keep","hold","prevent","stop","block","drive","lead"
    },
    "light_support": {
        "do","have","take","give","make"  # support-verb uses; coarse but helpful
    },
    "emission_weather": {
        "shine","glow","flash","sparkle","smell","stink","reek","ring","buzz","hum","beep","rain","snow","hail","thunder"
    }
}

VERB_POS = {"VERB", "AUX"}

def _lexical_verbal_head(token):
    """Climb from an ADV to its nearest verbal ancestor; prefer lexical VERB over AUX."""
    head = token.head
    # climb until we hit a verbal node or root
    while head.pos_ not in VERB_POS and head.head != head:
        head = head.head
    # if AUX, climb one more step to try to reach the lexical verb (e.g., 'has eaten' -> 'eat')
    if head.pos_ == "AUX" and head.head != head and head.head.pos_ == "VERB":
        head = head.head
    return head

def classify_verb_lemma(lemma: str) -> str:
    l = lemma.lower()
    # fast path exact match
    for cls, lemmas in VERB_CLASSES.items():
        if l in lemmas:
            return cls
    # simple normalization/backoff for frequent surface forms
    # map 'putting','puts' -> 'put', etc., but spaCy's lemma usually handles this.
    return "other"

def adverb_governing_verb_class(adv_token):
    """
    adv_token: spaCy Token with pos_ == 'ADV' (the *target* adverb).
    Returns one of the class labels above or 'other' if not found.
    """
    head = _lexical_verbal_head(adv_token)
    if head.pos_ in VERB_POS:
        return classify_verb_lemma(head.lemma_)
    return "other"

In [None]:
def get_sentence_mood(doc):
    """
    Returns one of: 'interrogative', 'imperative', 'exclamative', 'declarative', 'fragment'
    Heuristics tuned for English UD/spaCy parses.
    """

    text = doc.text.strip()

    # 0) No finite verb? Treat as fragment (headlines, ellipses, etc.)
    has_finite = any(t.morph.get("VerbForm") in (["Fin"],) or t.tag_ in {"VBD","VBP","VBZ","MD"} for t in doc if t.pos_ in {"VERB","AUX"})
    if not has_finite and not text.endswith((".", "?", "!")):
        return "fragment"

    # 1) Interrogative: punctuation or structure (wh-fronting, subject–aux inversion)
    if text.endswith("?"):
        return "interrogative"
    wh_tags = {"WDT","WP","WP$","WRB"}
    wh_fronted = any(t.tag_ in wh_tags and t.i < doc.root.i for t in doc)
    # subject–aux inversion: an AUX precedes the subject and is close to the left edge
    subjects = [t for t in doc if t.dep_ in {"nsubj","nsubjpass","expl"}]
    aux_left = any(aux.dep_ in {"aux","auxpass"} and (not subjects or aux.i < min(s.i for s in subjects)) for aux in doc if aux.pos_ == "AUX")
    if wh_fronted or aux_left:
        return "interrogative"

    # 2) Imperative:
    root = doc[:].root  # same as doc.root
    has_subject = any(t.dep_ in {"nsubj","nsubjpass","expl"} for t in doc)
    # Bare imperative: root in base form, no subject (Go home.)
    bare_imp = (root.pos_ in {"VERB","AUX"} and root.tag_ == "VB" and not has_subject)
    # Negative imperative with do-support: Don't move.  (neg + aux 'do', root often VB)
    neg_imp = any(t.dep_ == "neg" for t in doc) and any(t.lemma_ == "do" and t.dep_ in {"aux","auxpass"} for t in doc) and root.tag_ == "VB"
    # Let's imperative: Let's go / let us go
    lets_imp = any(t.lemma_ == "let" and t.head == root for t in doc) and any(t.text.lower() in {"'s","us"} for t in doc)
    # Polite marker as weak cue (please/ kindly) near left edge + VB root, no subject
    polite_imp = any(t.lower_ in {"please","kindly"} and t.i <= 2 for t in doc) and root.tag_ == "VB" and not has_subject

    if bare_imp or neg_imp or lets_imp or polite_imp:
        return "imperative"

    # 3) Exclamative:
    # prototypical 'How/What' exclamatives or ending '!'
    if text.endswith("!"):
        return "exclamative"
    exclamative_how_what = any(t.lower_ in {"how","what"} and t.i < doc.root.i for t in doc)
    if exclamative_how_what:
        return "exclamative"

    # 4) Default
    return "declarative"

In [None]:
def comma_intonation_feature(adv_token):
    """
    adv_token: the target spaCy Token (pos_ == 'ADV').
    Returns one of:
      - 'comma-delimited' (comma before and after)
      - 'comma-after'     (comma immediately after)
      - 'comma-before'    (comma immediately before)
      - 'none'
    """
    i = adv_token.i
    doc = adv_token.doc

    before = (i-1 >= 0 and doc[i-1].text == ",")
    after = (i+1 < len(doc) and doc[i+1].text == ",")

    if before and after:
        return "comma-delimited"
    elif after:
        return "comma-after"
    elif before:
        return "comma-before"
    else:
        return "none"

In [None]:
def get_adverb_text(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return token.text

In [None]:
def get_adverb_count(doc):
    count = 0
    for token in doc:
        if token.pos_ == "ADV":
            count += 1
    return count

In [None]:
def get_subject_animacy(doc):
    """
    Returns 'animate' if the subject is a pronoun or an entity/noun referring to humans,
    'inanimate' otherwise.
    """
    animate_pronouns = {"he","she","they","i","we","you","him","her","us","them","me"}
    animate_ents = {"PERSON","ORG","NORP"}  # treat ORG/NORP as animate-like for agentivity

    for token in doc:
        if token.dep_ == "nsubj":
            # pronoun subject
            if token.text.lower() in animate_pronouns:
                return "animate"
            # entity-labeled subject
            if token.ent_type_ in animate_ents:
                return "animate"
            # common nouns with human semantics (coarse heuristic)
            if token.lemma_.lower() in {"man","woman","child","person","student","teacher","guy","girl","boy","lady","gentleman"}:
                return "animate"
            # otherwise inanimate
            return "inanimate"

    return "none"  # no subject found

In [None]:
def has_negation(doc, adv_token):
    """
    Returns True if the clause containing the target adverb 
    has a negation marker (dep_ == 'neg'), else False.
    """
    # get the sentence or subtree containing the adverb
    sent = adv_token.sent
    for token in sent:
        if token.dep_ == "neg":
            return True
    return False

In [None]:
"""
def classify_adverb(token):
    lemma = token.lemma_.lower()

    temporal_adverbs = {
        "briefly", "soon", "later", "now", "yesterday", "today", "tomorrow", "already",
        "still", "recently", "eventually", "formerly", "instantly", "meanwhile", "afterward",
        "immediately", "lately", "presently", "shortly", "once", "then", "beforehand", "simultaneously"
    }
    locative_adverbs = {
        "here", "there", "everywhere", "somewhere", "anywhere", "abroad", "indoors", "outside", "home",
        "nearby", "above", "below", "upstairs", "downstairs", "overseas", "within", "underneath"
    }
    manner_adverbs = {
        "quickly", "carefully", "badly", "well", "happily", "sadly", "easily", "stupidly", "loudly", "safely",
        "calmly", "ardently", "correctly", "gracefully", "silently", "poorly", "angrily", "bravely", "clumsily",
        "gently", "neatly", "randomly"
    }
    degree_adverbs = {
        "very", "extremely", "somewhat", "too", "so", "barely", "highly", "totally", "completely", "exactly",
        "perfectly", "absolutely", "entirely", "deeply", "greatly", "moderately", "thoroughly", "almost", "hardly",
        "intensely"
    }
    frequency_adverbs = {
        "often", "sometimes", "rarely", "seldom", "always", "never", "occasionally", "frequently",
        "regularly", "usually", "hardly ever", "annually", "daily", "monthly", "hourly", "weekly"
    }
    epistemic_adverbs = {
        "possibly", "probably", "certainly", "surely", "undoubtedly", "maybe", "definitely", "clearly",
        "evidently", "presumably", "apparently", "conceivably", "seemingly", "likely", "arguably"
    }
    evaluative_adverbs = {
        "fortunately", "unfortunately", "surprisingly", "stupidly", "interestingly", "sadly", "hopefully", "honestly",
        "kindly", "politely", "quite", "frankly", "regrettably", "mercifully", "remarkably", "disappointingly",
        "amazingly", "tragically"
    }
    contrastive_adverbs = {
        "only", "just", "even", "also", "mainly", "mostly", "especially", "particularly",
        "however", "nevertheless", "nonetheless", "instead", "though", "still", "alternatively", "conversely"
    }

    if lemma in temporal_adverbs:
        return "temporal"
    elif lemma in locative_adverbs:
        return "locative"
    elif lemma in manner_adverbs:
        return "manner"
    elif lemma in degree_adverbs:
        return "degree"
    elif lemma in frequency_adverbs:
        return "frequency"
    elif lemma in epistemic_adverbs:
        return "epistemic"
    elif lemma in evaluative_adverbs:
        return "evaluative"
    elif lemma in contrastive_adverbs:
        return "contrastive"
    else:
        return "other"


def get_adverb_class(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return classify_adverb(token)
    return "no_adverb"
    """

In [None]:
def grammatical_features(examples):
    doc = nlp(examples['sentence'])
    return {
        "adverb_position": get_adverb_position(doc),
        "verb_class": get_adverb_verb_class(doc),
        "sentence_mood": get_sentence_mood(doc),
        "comma_intonation": get_comma_intonation_adverbs(doc),
        "adverb_text": get_adverb_text(doc),
        "adverb_count": get_adverb_count(doc),
        "subject_animacy": get_subject_animacy(doc),
        "negation_scope": get_negation_scope(doc),
    }

In [None]:
from datasets import load_dataset

# Example: load your dataset
# data = load_dataset("csv", data_files="your_file.csv")["train"]

# Apply features
data_with_features = (
    data.shuffle(seed=42)  # shuffle reproducibly
        .map(
            grammatical_features,   # your custom feature extractor
            num_proc=4,             # parallelism (adjust to your CPU cores)
            remove_columns=["id"]   # drop 'id' column if not needed
        )
        .filter(lambda x: x["adverb_position"] != "unknown")  # keep only valid rows
)

Map:   0%|          | 0/56830279 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
data_with_features

Dataset({
    features: ['sentence', 'adverb_position', 'verb_class', 'sentence_mood', 'adverb_scope', 'comma_intonation', 'adverb_text', 'adverb_count', 'subject_animacy', 'negation_scope'],
    num_rows: 94494
})

In [None]:
pandas_data = data_with_features.to_pandas()

In [None]:
pandas_data.head()

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation


In [None]:
pandas_data.to_csv("all_adverbs_in_context.csv", index=False)