In [75]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import pandas as pd
from tqdm.notebook import tqdm
import ahocorasick

In [51]:
adv = pd.read_csv("subori.csv", sep="\t")

adv.sentence = adv.apply(lambda x: x["sentence"].replace(x['adverb_text'], f"<mark>{x['adverb_text']}</mark>"), axis=1)

In [52]:
adv

Unnamed: 0,sentence,adverb_type,adverb_position,verb_class,sentence_mood,adverb_scope,comma_intonation,adverb_text,adverb_count,subject_animacy,negation_scope,lexical_adverb_category
0,Prepare for an honest conversation with your k...,manner,not-modifying-verb,no_head_verb,imperative,unknown,False,honestly,1,inanimate,no_negation,evaluative
1,But what <mark>exactly</mark> sparked the rise...,speaker-oriented,not-modifying-verb,no_head_verb,interrogative,unknown,False,exactly,1,inanimate,no_negation,other
2,"<mark>Clearly</mark>, Larry is a power-hungry ...",speaker-oriented,not-modifying-verb,no_head_verb,declarative,S,True,Clearly,1,animate,no_negation,modal
3,"<mark>Unfortunately</mark>, it isnt a simple a...",speaker-oriented,not-modifying-verb,no_head_verb,declarative,S,True,Unfortunately,1,inanimate,adv_before_neg,evaluative
4,"<mark>Unfortunately</mark>, it didn’t work.",speaker-oriented,not-modifying-verb,no_head_verb,declarative,unknown,True,Unfortunately,1,inanimate,no_negation,evaluative
...,...,...,...,...,...,...,...,...,...,...,...,...
244,Kingsley <mark>briefly</mark> served on the Fl...,context-free,imediate-pre-verbal,other,declarative,S,False,briefly,1,animate,no_negation,temporal
245,The <mark>digitally</mark>-programmable potent...,manner,not-modifying-verb,no_head_verb,declarative,unknown,False,digitally,1,inanimate,no_negation,other
246,These <mark>highly</mark> trained people will ...,context-free,imediate-pre-verbal,other,declarative,VP,False,highly,1,inanimate,no_negation,degree
247,"He <mark>eventually</mark> settled in Chicago,...",context-free,imediate-pre-verbal,other,declarative,S,False,eventually,1,animate,no_negation,temporal


In [53]:
# Create encoder and fit
label_encoder = LabelEncoder()

texts = adv.sentence.values.tolist()
labels = [int(label == 'subject-oriented') for label in adv.adverb_type.values]

In [54]:
# Train/test split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

In [56]:
num_labels = 2

In [65]:
# Tokenizer and model
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# Tokenize
def tokenize_fn(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [67]:
train_encodings = tokenize_fn(train_texts)
val_encodings = tokenize_fn(val_texts)

In [68]:
# Create Hugging Face Datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': torch.tensor(train_labels)
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': torch.tensor(val_labels)
})

In [69]:
# Metric
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    return {"accuracy": accuracy_score(labels, preds), "f1": f1_score(labels, preds, average="weighted")}

In [70]:
# Training config
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

In [71]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [72]:
# Train and evaluate
trainer.train()
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.146406,0.76,0.656364
2,No log,0.447893,0.76,0.72878
3,No log,0.709731,0.78,0.725647
4,No log,1.275411,0.74,0.675765
5,No log,1.41176,0.76,0.688239


{'eval_loss': 0.44789260625839233,
 'eval_accuracy': 0.76,
 'eval_f1': 0.728780487804878,
 'eval_runtime': 5.5222,
 'eval_samples_per_second': 9.054,
 'eval_steps_per_second': 1.268,
 'epoch': 5.0}

In [73]:
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True, padding=True)

In [76]:
words = ahocorasick.Automaton()

with open("subject-oriented-lexicons.txt") as fin:
    for idx, line in enumerate(fin.readlines()):
        line = line.strip()
        words.add_word(line, idx)
        
words.make_automaton()

In [79]:
# Load the Parquet file back into a Hugging Face Dataset
data = Dataset.from_parquet("all_adverbs.parquet").filter(lambda x: sum([1 for end_index, val in words.iter(x["sentence"].lower())]) == 1).filter(lambda x: x["sentence"] not in adv.sentence)
data = data.map(lambda x: {"sentence": x["sentence"].replace(x["adverb_text"], f"<mark>{x['adverb_text']}</mark>")})

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

Filter:   0%|          | 0/56830279 [00:00<?, ? examples/s]

Filter:   0%|          | 0/397814 [00:00<?, ? examples/s]

Map:   0%|          | 0/397814 [00:00<?, ? examples/s]

KeyError: 'adverb_text'

In [78]:
predict_dataset = data.map(tokenize_function, batched=True)
predict_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

count = 0
# Predict
with open("adverb_discover.csv", "w+") as fout:
    fout.write("sentence\tsubjectoriented_prob\n")
    for example in tqdm(predict_dataset):
        decode = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
        if len(decode) > 160:
            continue
        predictions = trainer.predict([example])
        pred = torch.tensor(predictions.predictions[0]).softmax(dim=-1)
        if pred[-1] < .5:
            continue
        fout.write(f"{decode}\t{pred[-1].item()}\n")
        count += 1
        print(count, f"{decode}\t{pred[-1].item()}\n")

Map:   0%|          | 0/718 [00:00<?, ? examples/s]

  0%|          | 0/718 [00:00<?, ?it/s]

1 For extra information on essential oils <mark>kindly</mark> check out: http://ikhoedep.com/	0.5755279064178467



2 I am called upon to sing of the Parilia, and not in vain shall be the call, if <mark>kindly</mark> Pales favours me.	0.5923221111297607



3 It seems that in our attempt to create our own little wilderness, peat compost users are <mark>selfishly</mark> robbing another.	0.710014820098877



4 He would like to inform his parents of this good news, but is fearful that they may <mark>angrily</mark> reject him.	0.6857823133468628



5 <mark>Kindly</mark> share the investment principles that you follow.	0.6729834675788879



6 Mr Downes flew at him in a rage, <mark>angrily</mark> blaming him for the murder and even throwing Mr Cuffini’s glasses to the ground!	0.5400448441505432



7 It promised to net an enormous sum, and Brodie, <mark>foolishly</mark> feeling invulnerable because of his successful double-life, set the play in motion.	0.9144092798233032



8 “(Gilmore, Smith 31) God needs his followers to speak the reality in the face of falsehood and bids us to <mark>obediently</mark> follow his commands.	0.5442089438438416



9 The woman in blue listens to the discussion taking place to her right with her hands crossed <mark>passively</mark> on the table.	0.537598192691803



10 <mark>Kindly</mark> visit our order/inquiry page for further assistance.	0.6048296689987183



11 As an added bonus John McMaster has <mark>kindly</mark> offered to donate original material and photographs relating to his family to the John Oxley Library.	0.9230715036392212



12 And Aaron said unto Moses, Alas, my lord, I beseech thee, lay not the sin upon us, wherein we have done <mark>foolishly</mark>, and wherein we have sinned.	0.5200846791267395



13 They did not expect to have an opening in the ERK trial but one happened on Thursday <mark>so</mark> they eagerly jumped on it.	0.5044088363647461



14 <mark>Eagerly</mark>, King Izates invited him to become his teacher, to which the rabbi agreed.	0.6545388698577881



15 At a book reading he gave in Washington, several protesters <mark>angrily</mark> heckled him before being forced by the bookstore's employees to leave.	0.640628457069397



16 The <mark>eagerly</mark>-expected explanations of Lord Palmerston's ejection from the Cabinet were made in the debate on the Address.	0.527032732963562

