In [96]:
import nltk
from datasets import load_dataset
import pandas as pd
import spacy
import ahocorasick

In [97]:
nlp = spacy.load("en_core_web_sm")

In [98]:
def sentence_split(batch):
    examples = {
        "sentence": [],
        "id": [],
    }
    for i, text in enumerate(batch["text"]):
        date = int(batch["dump"][i][8:12])
        for idx, sent in enumerate(nltk.sent_tokenize(text)):
            examples["sentence"].append(sent)
            examples["id"].append(batch["id"][i] + f"-{idx}")
    return examples

In [99]:
corpus = "HuggingFaceFW/fineweb-edu"
subset = "sample-10BT"

In [100]:
automaton = ahocorasick.Automaton()
with open("adverbs.txt") as adverbs:
    for idx, line in enumerate(adverbs.readlines()):
        line = line.strip()
        if not line.endswith("ly"):
            continue
        automaton.add_word(line, idx)

automaton.make_automaton()

In [101]:
data = load_dataset(corpus, name=subset, split="train")
columns = data.column_names
data = data.filter(lambda x: x["language"] == "en", num_proc=4)
data = data.map(sentence_split, remove_columns=columns, batched=True, num_proc=4)
data = data.filter(lambda x: len(x["sentence"]) > 10 and\
                   sum([1 for end_index, val in automaton.iter(x["sentence"].lower())]) == 1, num_proc=4)

Using the latest cached version of the dataset since HuggingFaceFW/fineweb-edu couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'sample-10BT' at /home/martins_32048/.cache/huggingface/datasets/HuggingFaceFW___fineweb-edu/sample-10BT/0.0.0/4863ab07d7520451e6f73e2912ad8bfee7d97c11 (last modified on Thu May  8 04:44:09 2025).


Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

Filter (num_proc=4):   0%|          | 0/372422720 [00:00<?, ? examples/s]

In [102]:
print(data)

Dataset({
    features: ['id', 'sentence'],
    num_rows: 56830279
})


In [242]:
data.to_parquet("all_adverbs.parquet")

Creating parquet from Arrow format:   0%|          | 0/56831 [00:00<?, ?ba/s]

12112775938

In [None]:
from datasets import Dataset

# Load the Parquet file back into a Hugging Face Dataset
data = Dataset.from_parquet("all_adverbs.parquet")

In [243]:
def get_adverb_position(doc):
    for token in doc:
        if token.pos_ == "ADV":
            verb = next((t for t in doc if t.dep_ == "ROOT" and t.pos_ == "VERB"), None)
            if token.i == 0:
                return "sentence-initial"
            elif verb and token.i + 1 == verb.i:
                return "pre-verbal"
            elif verb and token.i > verb.i:
                return "post-verbal"
            else:
                return 
    return "unknown"

In [244]:
def classify_verb(token):
    lemma = token.lemma_.lower()

    communication_verbs = {
        "say", "tell", "ask", "speak", "talk", "mention", "report", "state", "argue", "explain",
        "describe", "suggest", "claim", "shout", "reply", "answer", "admit", "warn", "announce", "discuss"
        }
    transfer_verbs = {
        "give", "get", "take", "bring", "send", "offer", "receive", "buy", "sell", "pay",
        "lend", "borrow", "deliver", "return", "hand", "pass", "grant", "present", "allocate", "assign"
        }
    stative_verbs = {
        "be", "have", "know", "believe", "understand", "want", "need",
        "own", "belong", "seem", "appear", "mean", "contain", "include", "consist", "matter"
        }
    motion_verbs = {
        "go", "come", "walk", "run", "move", "travel", "drive", "fly", "ride", "swim",
        "jump", "climb", "crawl", "slide", "roll", "march", "leap", "hurry", "stroll", "wander"
        }
    psychological_verbs = {
        "like", "love", "hate", "enjoy", "prefer", "fear", "miss", "appreciate", "desire", "regret",
        "resent", "admire", "envy", "worry", "hope", "wish", "dread", "cherish", "value", "loathe"
        }
    perception_verbs = {
        "see", "hear", "feel", "smell", "taste", "notice", "recognize", "perceive", "detect", "observe",
        "look", "listen", "watch", "glance", "gaze", "stare", "peek", "peep", "scan", "spot"
        }

    if lemma in motion_verbs:
        return "motion"
    elif lemma in psychological_verbs:
        return "psychological"
    elif lemma in communication_verbs:
        return "communication"
    elif lemma in transfer_verbs:
        return "transfer"
    elif lemma in stative_verbs:
        return "stative"
    else:
        return "other"

def get_adverb_verb_class(doc):
    for token in doc:
        if token.pos_ == "ADV":
            head = token.head
            if head.pos_ == "VERB":
                return classify_verb(head)
    return "no_head_verb"

In [245]:
def get_sentence_mood(doc):
    if doc[-1].text == "?":
        return "interrogative"

    # Check for imperative (command)
    root = [token for token in doc if token.dep_ == "ROOT"]
    if root:
        root = root[0]
        # Imperatives often start with a verb in base form and have no subject
        if root.tag_ == "VB":  # base form verb
            has_subject = any(token.dep_ in ("nsubj", "nsubjpass") for token in doc)
            if not has_subject:
                return "imperative"

    # Otherwise, assume declarative
    return "declarative"

In [246]:
def get_comma_intonation_adverbs(doc):
    results = []
    for i, token in enumerate(doc[:-1]):  # Avoid out-of-range
        if token.pos_ == "ADV" and doc[i+1].text == ",":
            return True
    return False

In [247]:
def get_adverb_scope(doc):
    for token in doc:
        if token.pos_ == "ADV":
            head = token.head
            if head.dep_ == "ROOT":
                return "S"
            elif head.pos_ == "VERB":
                return "VP"
    return "unknown"

In [248]:
def get_adverb_text(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return token.text

In [249]:
def get_adverb_count(doc):
    count = 0
    for token in doc:
        if token.pos_ == "ADV":
            count += 1
    return count

In [250]:
def get_subject_animacy(doc):
    for token in doc:
        if token.dep_ == "nsubj":
            if token.ent_type_ in ("PERSON", "ORG"): return "animate"
            if token.text.lower() in {"he", "she", "they", "i", "we"}: return "animate"
    return "inanimate"

In [251]:
def get_negation_scope(doc):
    for adv in doc:
        if adv.pos_ == "ADV":
            for neg in doc:
                if neg.dep_ == "neg":
                    return "neg_before_adv" if neg.i < adv.i else "adv_before_neg"
    return "no_negation"

In [281]:
import spacy

nlp = spacy.load("en_core_web_sm")

def classify_adverb(token):
    lemma = token.lemma_.lower()

    temporal_adverbs = {
        "briefly", "soon", "later", "now", "yesterday", "today", "tomorrow", "already",
        "still", "recently", "eventually", "formerly", "instantly"
    }
    locative_adverbs = {
        "here", "there", "everywhere", "somewhere", "anywhere", "abroad", "indoors", "outside", "home"
    }
    manner_adverbs = {
        "quickly", "carefully", "badly", "well", "happily", "sadly", "easily", "stupidly", "loudly"
    }
    degree_adverbs = {
        "very", "extremely", "somewhat", "too", "so", "quite", "barely", "highly", "totally", "completely"
    }
    frequency_adverbs = {
        "often", "sometimes", "rarely", "seldom", "always", "never", "occasionally", "frequently"
    }
    modal_adverbs = {
        "possibly", "probably", "certainly", "surely", "undoubtedly", "maybe", "definitely", "clearly"
    }
    evaluative_adverbs = {
        "fortunately", "unfortunately", "surprisingly", "stupidly", "interestingly", "sadly", "hopefully", "honestly"
    }
    focus_adverbs = {
        "only", "just", "even", "also", "mainly", "mostly", "especially", "particularly"
    }

    if lemma in temporal_adverbs:
        return "temporal"
    elif lemma in locative_adverbs:
        return "locative"
    elif lemma in manner_adverbs:
        return "manner"
    elif lemma in degree_adverbs:
        return "degree"
    elif lemma in frequency_adverbs:
        return "frequency"
    elif lemma in modal_adverbs:
        return "modal"
    elif lemma in evaluative_adverbs:
        return "evaluative"
    elif lemma in focus_adverbs:
        return "focus"
    else:
        return "other"

def get_adverb_class(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return classify_adverb(token)
    return "no_adverb"

In [282]:
def grammatical_features(examples):
    doc = nlp(examples['sentence'])
    return {
        "adverb_position": get_adverb_position(doc),
        "verb_class": get_adverb_verb_class(doc),
        "sentence_mood": get_sentence_mood(doc),
        "adverb_scope": get_adverb_scope(doc),
        "comma_intonation": get_comma_intonation_adverbs(doc),
        "adverb_text": get_adverb_text(doc),
        "adverb_count": get_adverb_count(doc),
        "subject_animacy": get_subject_animacy(doc),
        "negation_scope": get_negation_scope(doc),
        "lexical_adverb_category": get_adverb_class(doc),
    }

In [262]:
data_with_features = data.shuffle(seed=42).select(range(100_000)).map(
    grammatical_features,
    remove_columns=["id"]).filter(lambda x: x["adverb_position"] != "unknown")

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [263]:
data_with_features

Dataset({
    features: ['sentence', 'adverb_position', 'verb_class', 'sentence_mood', 'adverb_scope', 'comma_intonation', 'adverb_text', 'adverb_count', 'subject_animacy', 'negation_scope'],
    num_rows: 94494
})

In [264]:
pandas_data = data_with_features.to_pandas()

In [265]:
pandas_data.head()

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation


In [266]:
# pandas_data = pd.get_dummies(
#     pandas_data,
#     columns=["adverb_position", "adverb_class", "has_negation", "sentence_mood", "adverb_scope"],
#     drop_first=True)

In [267]:
pandas_data.head()

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation


In [268]:
pandas_data.to_csv("all_adverbs_in_context.csv", sep="\t", index=False)

In [269]:
feature_columns = pandas_data.columns.tolist()
feature_columns.remove("sentence")

# 2. Drop duplicate combinations
unique_feature_rows = pandas_data.drop_duplicates(subset=feature_columns)

In [270]:
unique_feature_rows.to_csv("adverbs_in_context.csv", sep="\t", index=False)

In [271]:
unique_feature_rows

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation
...,...,...,...,...,...,...,...,...,...,...
94472,"In the second column, write down what you do (...",post-verbal,other,declarative,VP,True,conversely,1,inanimate,adv_before_neg
94477,"Fortunately, Michael Lane of the CDC, who work...",sentence-initial,other,declarative,VP,True,Fortunately,1,animate,adv_before_neg
94484,The dust size and shape are obviously independ...,,other,declarative,S,False,obviously,2,inanimate,no_negation
94486,"😩\nDue to the abundance of word problems, I've...",,no_head_verb,declarative,unknown,False,very,2,animate,neg_before_adv


In [272]:
interest = ahocorasick.Automaton() 
with open("words_of_interest.txt") as adverbs:
    for idx, line in enumerate(adverbs.readlines()):
        line = line.strip()
        interest.add_word(line, idx)

interest.make_automaton()

In [273]:
words_of_interest = pandas_data[pandas_data.apply(
    lambda x: x['adverb_count'] == 1 and sum([1 for end_index, val in interest.iter(x["sentence"].lower())]) == 1 and not x['sentence'].lower().startswith("if")
, axis=1)].drop_duplicates(subset=feature_columns)

In [274]:
words_of_interest.to_csv("woi_adverbs.csv", sep="\t", index=False)

In [275]:
words_of_interest

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
61,It has honestly become quite a chore to do any...,pre-verbal,other,declarative,S,False,honestly,1,inanimate,no_negation
204,For extra information on essential oils kindly...,pre-verbal,other,declarative,S,False,kindly,1,inanimate,no_negation
452,Adding humus to the soil and using mulch gener...,,other,imperative,VP,False,generously,1,inanimate,no_negation
1328,"I am called upon to sing of the Parilia, and n...",post-verbal,other,declarative,VP,False,kindly,1,inanimate,neg_before_adv
1781,"I honestly doubt, yojana has changed its defin...",,other,declarative,VP,False,honestly,1,animate,no_negation
...,...,...,...,...,...,...,...,...,...,...
92364,(“The Path of the Righteous” – “Mesillas Yesha...,post-verbal,other,declarative,VP,False,humbly,1,inanimate,no_negation
92986,Asimov was one of the most prolific writers of...,post-verbal,communication,declarative,S,False,reluctantly,1,animate,no_negation
93723,The problem of mending or ending industrialism...,,other,declarative,VP,False,foolishly,1,inanimate,no_negation
93933,Wisely placed lawn ornaments in the landscape ...,sentence-initial,other,declarative,VP,False,Wisely,1,inanimate,no_negation


In [276]:
manner = pd.read_csv("Adverbs.csv")

In [277]:
manner

Unnamed: 0,sentence,adverb_text,adverb_type,adverb_position,verb_class,has_negation,sentence_mood,adverb_scope,comma_intonation,adverb_count
0,Say words silently to your child.,silently,manner,post-verbal,communication,False,imperative,S,False,1
1,"Charles said harshly, “Your crazy mother drown...",harshly,manner,post-verbal,communication,False,declarative,S,True,1
2,Check to see if you answered correctly.,correctly,manner,post-verbal,communication,False,declarative,VP,False,1
3,Doesn't that describe CPAs exactly?,exactly,manner,post-verbal,communication,True,interrogative,S,False,1
4,Do we talk harshly?,harshly,manner,post-verbal,communication,False,interrogative,S,False,1
...,...,...,...,...,...,...,...,...,...,...
172,Asimov was one of the most prolific writers of...,reluctantly,subject-oriented,post-verbal,communication,False,declarative,S,False,1
173,It seems that in our attempt to create our own...,selfishly,subject-oriented,post-verbal,other,False,declarative,VP,False,1
174,"Funakoshi died in 1957 at the age of 88, after...",humbly,subject-oriented,post-verbal,other,False,declarative,VP,False,1
175,Jehoshaphat wisely requests that they consult ...,wisely,subject-oriented,pre-verbal,other,False,declarative,S,False,1


In [278]:
man = manner[["sentence", "adverb_type"]].join(manner.apply(grammatical_features, axis=1).apply(pd.Series))

In [279]:
man

Unnamed: 0,sentence,adverb_type,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,Say words silently to your child.,manner,post-verbal,communication,imperative,S,False,silently,1,inanimate,no_negation
1,"Charles said harshly, “Your crazy mother drown...",manner,post-verbal,communication,declarative,S,True,harshly,1,animate,no_negation
2,Check to see if you answered correctly.,manner,post-verbal,communication,declarative,VP,False,correctly,1,inanimate,no_negation
3,Doesn't that describe CPAs exactly?,manner,post-verbal,communication,interrogative,S,False,exactly,1,inanimate,neg_before_adv
4,Do we talk harshly?,manner,post-verbal,communication,interrogative,S,False,harshly,1,animate,no_negation
...,...,...,...,...,...,...,...,...,...,...,...
172,Asimov was one of the most prolific writers of...,subject-oriented,post-verbal,communication,declarative,S,False,reluctantly,1,animate,no_negation
173,It seems that in our attempt to create our own...,subject-oriented,post-verbal,other,declarative,VP,False,selfishly,1,inanimate,no_negation
174,"Funakoshi died in 1957 at the age of 88, after...",subject-oriented,post-verbal,other,declarative,VP,False,humbly,1,animate,no_negation
175,Jehoshaphat wisely requests that they consult ...,subject-oriented,pre-verbal,other,declarative,S,False,wisely,1,animate,no_negation


In [280]:
man.to_csv("new_features.csv", sep="\t", index=False)