In [1]:
import nltk
from datasets import load_dataset
import pandas as pd
import spacy
import ahocorasick
import json

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def sentence_split(batch):
    examples = {
        "sentence": [],
        "id": [],
    }
    for i, text in enumerate(batch["text"]):
        date = int(batch["dump"][i][8:12])
        for idx, sent in enumerate(nltk.sent_tokenize(text)):
            examples["sentence"].append(sent)
            examples["id"].append(batch["id"][i] + f"-{idx}")
    return examples

In [4]:
corpus = "open_subtitles"
lang1 = "en"
lang2 = "fr"

In [5]:
automaton = ahocorasick.Automaton()
with open("adv_ends_ly.txt") as adverbs:
    for idx, line in enumerate(adverbs.readlines()):
        line = line.strip()
        if not line.endswith("ly"):
            continue
        automaton.add_word(line, idx)

automaton.make_automaton()

In [21]:
data = load_dataset(corpus, lang1=lang1, lang2=lang2, split="train")
data = data.map(lambda x: {"english": x["translation"]["en"]})

Map:   0%|          | 0/41763488 [00:00<?, ? examples/s]

In [22]:
data[0]

{'id': '0',
 'meta': {'year': 0,
  'imdbId': 1089124,
  'subtitleId': {'en': 4995691, 'fr': 4588599},
  'sentenceIds': {'en': [1], 'fr': [1]}},
 'translation': {'en': 'I never dreamed before',
  'fr': "I've never dreamed before I'm gonna knock the door"},
 'english': 'I never dreamed before'}

In [24]:
data = data.filter(lambda x:
                   sum([1 for end_index, val in automaton.iter(x["english"].lower())]) == 1, num_proc=4)

Filter (num_proc=4):   0%|          | 0/41763488 [00:00<?, ? examples/s]

In [25]:
print(data)

Dataset({
    features: ['id', 'meta', 'translation', 'english'],
    num_rows: 1691511
})


In [242]:
data.to_parquet("all_adverbs.parquet")

Creating parquet from Arrow format:   0%|          | 0/56831 [00:00<?, ?ba/s]

12112775938

In [61]:
from datasets import Dataset

# Load the Parquet file back into a Hugging Face Dataset
data = Dataset.from_parquet("all_adverbs.parquet")

Loading dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

In [62]:
def get_adverb_position(doc):
    for token in doc:
        if token.pos_ == "ADV":
            head = token.head
            if head.pos_ == "VERB":
                if token.i == 0:
                    return "sentence-initial"
                elif token.i + 1 == head.i:
                    return "imediate-pre-verbal"
                elif token.i < head.i:
                    return "pre-verbal"
                elif token.i == head.i + 1:
                    return "imediate-pos-verbal"
                elif token.i > head.i:
                    return "post-verbal"
                else:
                    return "same-position"
            else:
                return "not-modifying-verb"
    return "no-adverb-found"

In [63]:
def classify_verb(token):
    lemma = token.lemma_.lower()

    communication_verbs = {
        "say", "tell", "ask", "speak", "talk", "mention", "report", "state", "argue", "explain",
        "describe", "suggest", "claim", "shout", "reply", "answer", "admit", "warn", "announce", "discuss"
        }
    transfer_verbs = {
        "give", "get", "take", "bring", "send", "offer", "receive", "buy", "sell", "pay",
        "lend", "borrow", "deliver", "return", "hand", "pass", "grant", "present", "allocate", "assign"
        }
    stative_verbs = {
        "be", "have", "know", "believe", "understand", "want", "need",
        "own", "belong", "seem", "appear", "mean", "contain", "include", "consist", "matter"
        }
    motion_verbs = {
        "go", "come", "walk", "run", "move", "travel", "drive", "fly", "ride", "swim",
        "jump", "climb", "crawl", "slide", "roll", "march", "leap", "hurry", "stroll", "wander"
        }
    psychological_verbs = {
        "like", "love", "hate", "enjoy", "prefer", "fear", "miss", "appreciate", "desire", "regret",
        "resent", "admire", "envy", "worry", "hope", "wish", "dread", "cherish", "value", "loathe"
        }
    perception_verbs = {
        "see", "hear", "feel", "smell", "taste", "notice", "recognize", "perceive", "detect", "observe",
        "look", "listen", "watch", "glance", "gaze", "stare", "peek", "peep", "scan", "spot"
        }

    if lemma in motion_verbs:
        return "motion"
    elif lemma in psychological_verbs:
        return "psychological"
    elif lemma in communication_verbs:
        return "communication"
    elif lemma in transfer_verbs:
        return "transfer"
    elif lemma in stative_verbs:
        return "stative"
    else:
        return "other"

def get_adverb_verb_class(doc):
    for token in doc:
        if token.pos_ == "ADV":
            head = token.head
            if head.pos_ == "VERB":
                return classify_verb(head)
    return "no_head_verb"

In [64]:
def get_sentence_mood(doc):
    if doc[-1].text == "?":
        return "interrogative"

    # Check for imperative (command)
    root = [token for token in doc if token.dep_ == "ROOT"]
    if root:
        root = root[0]
        # Imperatives often start with a verb in base form and have no subject
        if root.tag_ == "VB":  # base form verb
            has_subject = any(token.dep_ in ("nsubj", "nsubjpass") for token in doc)
            if not has_subject:
                return "imperative"

    # Otherwise, assume declarative
    return "declarative"

In [65]:
def get_comma_intonation_adverbs(doc):
    results = []
    for i, token in enumerate(doc[:-1]):  # Avoid out-of-range
        if token.pos_ == "ADV" and doc[i+1].text == ",":
            return True
    return False

In [66]:
def get_adverb_scope(doc):
    for token in doc:
        if token.pos_ == "ADV":
            head = token.head
            if head.dep_ == "ROOT":
                return "S"
            elif head.pos_ == "VERB":
                return "VP"
    return "unknown"

In [67]:
def get_adverb_text(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return token.text

In [68]:
def get_adverb_count(doc):
    count = 0
    for token in doc:
        if token.pos_ == "ADV":
            count += 1
    return count

In [69]:
def get_subject_animacy(doc):
    for token in doc:
        if token.dep_ == "nsubj":
            if token.ent_type_ in ("PERSON", "ORG"): return "animate"
            if token.text.lower() in {"he", "she", "they", "i", "we"}: return "animate"
    return "inanimate"

In [70]:
def get_negation_scope(doc):
    for adv in doc:
        if adv.pos_ == "ADV":
            for neg in doc:
                if neg.dep_ == "neg":
                    return "neg_before_adv" if neg.i < adv.i else "adv_before_neg"
    return "no_negation"

In [109]:
def classify_adverb(token):
    lemma = token.lemma_.lower()

    temporal_adverbs = {
        "briefly", "soon", "later", "now", "yesterday", "today", "tomorrow", "already",
        "still", "recently", "eventually", "formerly", "instantly", "meanwhile", "afterward",
        "immediately", "lately", "presently", "shortly", "once", "then", "beforehand", "simultaneously"
    }
    locative_adverbs = {
        "here", "there", "everywhere", "somewhere", "anywhere", "abroad", "indoors", "outside", "home",
        "nearby", "above", "below", "upstairs", "downstairs", "overseas", "within", "underneath"
    }
    manner_adverbs = {
        "quickly", "carefully", "badly", "well", "happily", "sadly", "easily", "stupidly", "loudly", "safely",
        "calmly", "ardently", "correctly", "gracefully", "silently", "poorly", "angrily", "bravely", "clumsily",
        "gently", "neatly", "randomly"
    }
    degree_adverbs = {
        "very", "extremely", "somewhat", "too", "so", "barely", "highly", "totally", "completely", "exactly",
        "perfectly", "absolutely", "entirely", "deeply", "greatly", "moderately", "thoroughly", "almost", "hardly",
        "intensely"
    }
    frequency_adverbs = {
        "often", "sometimes", "rarely", "seldom", "always", "never", "occasionally", "frequently",
        "regularly", "usually", "hardly ever", "annually", "daily", "monthly", "hourly", "weekly"
    }
    epistemic_adverbs = {
        "possibly", "probably", "certainly", "surely", "undoubtedly", "maybe", "definitely", "clearly",
        "evidently", "presumably", "apparently", "conceivably", "seemingly", "likely", "arguably"
    }
    evaluative_adverbs = {
        "fortunately", "unfortunately", "surprisingly", "stupidly", "interestingly", "sadly", "hopefully", "honestly",
        "kindly", "politely", "quite", "frankly", "regrettably", "mercifully", "remarkably", "disappointingly",
        "amazingly", "tragically"
    }
    contrastive_adverbs = {
        "only", "just", "even", "also", "mainly", "mostly", "especially", "particularly",
        "however", "nevertheless", "nonetheless", "instead", "though", "still", "alternatively", "conversely"
    }

    if lemma in temporal_adverbs:
        return "temporal"
    elif lemma in locative_adverbs:
        return "locative"
    elif lemma in manner_adverbs:
        return "manner"
    elif lemma in degree_adverbs:
        return "degree"
    elif lemma in frequency_adverbs:
        return "frequency"
    elif lemma in epistemic_adverbs:
        return "epistemic"
    elif lemma in evaluative_adverbs:
        return "evaluative"
    elif lemma in contrastive_adverbs:
        return "contrastive"
    else:
        return "other"


def get_adverb_class(doc):
    for token in doc:
        if token.pos_ == "ADV":
            return classify_adverb(token)
    return "no_adverb"

In [110]:
def grammatical_features(examples):
    doc = nlp(examples['sentence'])
    return {
        "adverb_position": get_adverb_position(doc),
        "verb_class": get_adverb_verb_class(doc),
        "sentence_mood": get_sentence_mood(doc),
        "adverb_scope": get_adverb_scope(doc),
        "comma_intonation": get_comma_intonation_adverbs(doc),
        "adverb_text": get_adverb_text(doc),
        "adverb_count": get_adverb_count(doc),
        "subject_animacy": get_subject_animacy(doc),
        "negation_scope": get_negation_scope(doc),
        "lexical_adverb_category": get_adverb_class(doc),
    }

In [73]:
data_with_features = data.shuffle(seed=42).map(
    grammatical_features, num
    remove_columns=["id"]).filter(lambda x: x["adverb_position"] != "unknown")

Map:   0%|          | 0/56830279 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [263]:
data_with_features

Dataset({
    features: ['sentence', 'adverb_position', 'verb_class', 'sentence_mood', 'adverb_scope', 'comma_intonation', 'adverb_text', 'adverb_count', 'subject_animacy', 'negation_scope'],
    num_rows: 94494
})

In [264]:
pandas_data = data_with_features.to_pandas()

In [265]:
pandas_data.head()

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation


In [266]:
# pandas_data = pd.get_dummies(
#     pandas_data,
#     columns=["adverb_position", "adverb_class", "has_negation", "sentence_mood", "adverb_scope"],
#     drop_first=True)

In [267]:
pandas_data.head()

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation


In [268]:
pandas_data.to_csv("all_adverbs_in_context.csv", index=False)

In [269]:
feature_columns = pandas_data.columns.tolist()
feature_columns.remove("sentence")

# 2. Drop duplicate combinations
unique_feature_rows = pandas_data.drop_duplicates(subset=feature_columns)

In [270]:
unique_feature_rows.to_csv("adverbs_in_context.csv", sep="\t", index=False)

In [271]:
unique_feature_rows

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
0,In the 1901 annual report of the Commissioner ...,post-verbal,other,declarative,VP,False,only,1,inanimate,no_negation
1,"By the early 20th century, however, it had bec...",,other,declarative,S,True,however,2,inanimate,no_negation
2,For Esther to invite the king for a private me...,,other,declarative,VP,False,also,2,animate,no_negation
3,Vitamin A has directly involvement in the prod...,post-verbal,no_head_verb,declarative,unknown,False,directly,1,inanimate,no_negation
4,Comparing today’s current extinction crisis wi...,post-verbal,no_head_verb,declarative,unknown,False,ultimately,1,inanimate,no_negation
...,...,...,...,...,...,...,...,...,...,...
94472,"In the second column, write down what you do (...",post-verbal,other,declarative,VP,True,conversely,1,inanimate,adv_before_neg
94477,"Fortunately, Michael Lane of the CDC, who work...",sentence-initial,other,declarative,VP,True,Fortunately,1,animate,adv_before_neg
94484,The dust size and shape are obviously independ...,,other,declarative,S,False,obviously,2,inanimate,no_negation
94486,"😩\nDue to the abundance of word problems, I've...",,no_head_verb,declarative,unknown,False,very,2,animate,neg_before_adv


In [272]:
interest = ahocorasick.Automaton() 
with open("words_of_interest.txt") as adverbs:
    for idx, line in enumerate(adverbs.readlines()):
        line = line.strip()
        interest.add_word(line, idx)

interest.make_automaton()

In [273]:
words_of_interest = pandas_data[pandas_data.apply(
    lambda x: x['adverb_count'] == 1 and sum([1 for end_index, val in interest.iter(x["sentence"].lower())]) == 1 and not x['sentence'].lower().startswith("if")
, axis=1)].drop_duplicates(subset=feature_columns)

In [274]:
words_of_interest.to_csv("woi_adverbs.csv", sep="\t", index=False)

In [275]:
words_of_interest

Unnamed: 0,sentence,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope
61,It has honestly become quite a chore to do any...,pre-verbal,other,declarative,S,False,honestly,1,inanimate,no_negation
204,For extra information on essential oils kindly...,pre-verbal,other,declarative,S,False,kindly,1,inanimate,no_negation
452,Adding humus to the soil and using mulch gener...,,other,imperative,VP,False,generously,1,inanimate,no_negation
1328,"I am called upon to sing of the Parilia, and n...",post-verbal,other,declarative,VP,False,kindly,1,inanimate,neg_before_adv
1781,"I honestly doubt, yojana has changed its defin...",,other,declarative,VP,False,honestly,1,animate,no_negation
...,...,...,...,...,...,...,...,...,...,...
92364,(“The Path of the Righteous” – “Mesillas Yesha...,post-verbal,other,declarative,VP,False,humbly,1,inanimate,no_negation
92986,Asimov was one of the most prolific writers of...,post-verbal,communication,declarative,S,False,reluctantly,1,animate,no_negation
93723,The problem of mending or ending industrialism...,,other,declarative,VP,False,foolishly,1,inanimate,no_negation
93933,Wisely placed lawn ornaments in the landscape ...,sentence-initial,other,declarative,VP,False,Wisely,1,inanimate,no_negation


In [113]:
manner = pd.read_csv("subori.csv")

In [114]:
manner

Unnamed: 0,sentence,adverb_text,lexical_adverb_category,adverb_type,adverb_position,verb_class,negation_scope,sentence_mood,adverb_scope,comma_intonation,subject_animacy
0,Prepare for an honest conversation with your k...,honestly,evaluative,manner,not-modifying-verb,no_head_verb,no_negation,imperative,unknown,False,inanimate
1,But what exactly sparked the rise of the holis...,exactly,other,context-free,not-modifying-verb,no_head_verb,no_negation,interrogative,unknown,False,inanimate
2,"Clearly, Larry is a power-hungry jerk.",Clearly,modal,context-free,not-modifying-verb,no_head_verb,no_negation,declarative,S,True,animate
3,"Unfortunately, it isnt a simple answer.",Unfortunately,evaluative,context-free,not-modifying-verb,no_head_verb,adv_before_neg,declarative,S,True,inanimate
4,"Unfortunately, it didn’t work.",Unfortunately,evaluative,context-free,not-modifying-verb,no_head_verb,no_negation,declarative,unknown,True,inanimate
...,...,...,...,...,...,...,...,...,...,...,...
251,"“(Gilmore, Smith 31) God needs his followers t...",obediently,other,manner,imediate-pre-verbal,other,no_negation,declarative,VP,False,inanimate
252,The woman in blue listens to the discussion ta...,passively,other,manner,imediate-pos-verbal,other,no_negation,declarative,VP,False,inanimate
253,They did not expect to have an opening in the ...,so,degree,context-free,pre-verbal,motion,neg_before_adv,declarative,VP,False,animate
254,"At a book reading he gave in Washington, sever...",angrily,other,manner,imediate-pre-verbal,other,no_negation,declarative,S,False,animate


In [115]:
manner.sentence = manner.sentence.str.replace("<mark>", "")
manner.sentence = manner.sentence.str.replace("</mark>", "")
manner.sentence = manner.sentence.str.replace("\n", "").replace("\r", "")
manner

Unnamed: 0,sentence,adverb_text,lexical_adverb_category,adverb_type,adverb_position,verb_class,negation_scope,sentence_mood,adverb_scope,comma_intonation,subject_animacy
0,Prepare for an honest conversation with your k...,honestly,evaluative,manner,not-modifying-verb,no_head_verb,no_negation,imperative,unknown,False,inanimate
1,But what exactly sparked the rise of the holis...,exactly,other,context-free,not-modifying-verb,no_head_verb,no_negation,interrogative,unknown,False,inanimate
2,"Clearly, Larry is a power-hungry jerk.",Clearly,modal,context-free,not-modifying-verb,no_head_verb,no_negation,declarative,S,True,animate
3,"Unfortunately, it isnt a simple answer.",Unfortunately,evaluative,context-free,not-modifying-verb,no_head_verb,adv_before_neg,declarative,S,True,inanimate
4,"Unfortunately, it didn’t work.",Unfortunately,evaluative,context-free,not-modifying-verb,no_head_verb,no_negation,declarative,unknown,True,inanimate
...,...,...,...,...,...,...,...,...,...,...,...
251,"“(Gilmore, Smith 31) God needs his followers t...",obediently,other,manner,imediate-pre-verbal,other,no_negation,declarative,VP,False,inanimate
252,The woman in blue listens to the discussion ta...,passively,other,manner,imediate-pos-verbal,other,no_negation,declarative,VP,False,inanimate
253,They did not expect to have an opening in the ...,so,degree,context-free,pre-verbal,motion,neg_before_adv,declarative,VP,False,animate
254,"At a book reading he gave in Washington, sever...",angrily,other,manner,imediate-pre-verbal,other,no_negation,declarative,S,False,animate


In [106]:
man = manner[["sentence", "adverb_type"]].join(manner.apply(grammatical_features, axis=1).apply(pd.Series))

In [107]:
man = man.apply(lambda x: x.str.replace('\n', ' ').replace('\r', ' ') if isinstance(x, str) else x)

In [108]:
man.to_csv("subori.csv", sep="\t", index=False)

In [101]:
manner = pd.read_csv("subori.csv")
manner

Unnamed: 0,sentence,adverb_type,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope,lexical_adverb_category
0,Prepare for an honest conversation with your k...,manner,not-modifying-verb,no_head_verb,imperative,unknown,False,honestly,1,inanimate,no_negation,evaluative
1,But what exactly sparked the rise of the holis...,context-free,not-modifying-verb,no_head_verb,interrogative,unknown,False,exactly,1,inanimate,no_negation,other
2,"Clearly, Larry is a power-hungry jerk.",context-free,not-modifying-verb,no_head_verb,declarative,S,True,Clearly,1,animate,no_negation,modal
3,"Unfortunately, it isnt a simple answer.",context-free,not-modifying-verb,no_head_verb,declarative,S,True,Unfortunately,1,inanimate,adv_before_neg,evaluative
4,"Unfortunately, it didn’t work.",context-free,not-modifying-verb,no_head_verb,declarative,unknown,True,Unfortunately,1,inanimate,no_negation,evaluative
...,...,...,...,...,...,...,...,...,...,...,...,...
1860,But meantime a French brigade had driven von d...,,pre-verbal,motion,declarative,S,False,meantime,3,animate,no_negation,other
1861,"I see no greatness, nor any kind of superiorit...",,not-modifying-verb,no_head_verb,declarative,unknown,False,peculiarly,2,animate,no_negation,other
1862,A possible and fairly common example is this: ...,,not-modifying-verb,other,declarative,VP,False,fairly,2,animate,no_negation,other
1863,This strategy has been used in recent years fo...,,imediate-pos-verbal,motion,declarative,VP,False,slower,2,inanimate,no_negation,other
