In [None]:
import unicodedata

#-----------------------------------------------------------
# Preprocess 
#-----------------------------------------------------------

# Converts FOL operator symbols into plain-text keywords so the expression can be tokenized.
def fol_preprocess(expression):
    """Rewrite first-order-logic operator symbols as plain-text keywords.

    Every symbol in the table below is substituted exactly as listed,
    including the surrounding spaces (or lack of them) in each keyword.
    Characters that are not in the table are left untouched.

    Args:
        expression: FOL formula as a string.

    Returns:
        The formula with each listed symbol replaced by its keyword.
    """
    # Each key is a single character and no keyword contains a key, so one
    # translate pass gives the same result as replacing symbol by symbol.
    symbol_table = str.maketrans({
        '∀': ' FORALL ',
        '∃': ' EXISTS ',
        '¬': 'NOT ',
        '∧': 'AND',
        '⊕': 'XOR',
        '∨': 'OR',
        '→': 'THEN',
        '↔': 'IFF',
    })
    return expression.translate(symbol_table)

# Normalizes a natural-language sentence: lowercasing plus Unicode NFKC normalization.
def nl_preprocess(sentence):
    """Lowercase a sentence and bring it into Unicode NFKC form.

    Args:
        sentence: Natural-language input as a string.

    Returns:
        The lowercased, NFKC-normalized sentence.
    """
    # Fold compatibility characters first. Some of them (for example U+210B
    # SCRIPT CAPITAL H) have no lowercase mapping of their own but normalize
    # to an ASCII capital, so lowercasing before normalizing would let
    # uppercase letters slip through.
    folded = unicodedata.normalize('NFKC', sentence)
    # Normalize once more after lowercasing so the returned text is in NFKC
    # form even if a case mapping produced a non-normalized sequence.
    return unicodedata.normalize('NFKC', folded.lower())


In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

#-----------------------------------------------------------
# Tokenization and loading the dataset with preprocessing
#-----------------------------------------------------------

dataset_path = "PATH"

def tokenize_data(dataset, tokenizer, max_length=64):
    """Tokenize the 'NL' and 'FOL' columns of a dataset for seq2seq training.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches carry
            the text columns 'NL' (model input) and 'FOL' (target).
        tokenizer: Callable tokenizer that accepts ``text_target``.
        max_length: Length that both inputs and targets are truncated and
            padded to. Defaults to 64.

    Returns:
        The result of ``dataset.map``: the tokenizer's encodings for 'NL'
        together with a 'labels' entry holding the token ids of 'FOL'.

    Note:
        Because of ``padding='max_length'`` the label rows are padded too;
        those positions hold ordinary pad ids and are not replaced here.
    """
    def tokenize_function(examples):
        # `text_target` replaces the deprecated `as_target_tokenizer()`
        # context manager: the targets are encoded in the same call and
        # their input ids are returned under the 'labels' key.
        return tokenizer(
            examples['NL'],
            text_target=examples['FOL'],
            max_length=max_length,
            truncation=True,
            padding='max_length',
        )

    return dataset.map(tokenize_function, batched=True)

def load_train_val_test_data(tokenizer, path=None):
    """Load the Excel dataset and return tokenized train/validation/test splits.

    The sheet is split 97/3 into train+validation and test, and the first part
    is split again 90/10 into train and validation (both with
    ``random_state=42``). Each split gets an 'NL' column (task prefix plus the
    preprocessed 'NL_sentence') and a 'FOL' column (preprocessed
    'FOL_expression') before being passed to ``tokenize_data``.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize_data``.
        path: Optional location of the Excel file. When omitted, the
            module-level ``dataset_path`` is used.

    Returns:
        A ``(train, validation, test)`` tuple of tokenized datasets.
    """
    source = dataset_path if path is None else path
    frame = pd.read_excel(source)
    # The preprocessing helpers call string methods, so a row with an empty
    # sentence or formula cell would make them fail; drop such rows up front.
    frame = frame.dropna(subset=['NL_sentence', 'FOL_expression'])

    train_frame, test_frame = train_test_split(frame, test_size=0.03, random_state=42)
    train_frame, validation_frame = train_test_split(train_frame, test_size=0.1, random_state=42)

    print(f"Train Dataset size: {len(train_frame)}")
    print(f"Validation Dataset size: {len(validation_frame)}")
    print(f"Test Dataset size: {len(test_frame)}")

    def add_model_text(row):
        # One shared definition of the model-facing columns for all splits.
        return {
            'NL': 'translate English to First-order Logic: ' + nl_preprocess(row['NL_sentence']),
            'FOL': fol_preprocess(row['FOL_expression']),
        }

    def prepare(split_frame):
        # DataFrame -> Dataset -> add 'NL'/'FOL' -> tokenize.
        split = Dataset.from_pandas(split_frame).map(add_model_text)
        return tokenize_data(split, tokenizer)

    return prepare(train_frame), prepare(validation_frame), prepare(test_frame)


In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Trainer, TrainingArguments
import torch

#-----------------------------------------------------------
# Training Setup
#-----------------------------------------------------------

# Checkpoint name shared by the tokenizer and the model loader below.
model_name = "google-t5/t5-base"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The three splits come back already preprocessed and tokenized.
train_dataset, validation_dataset, test_dataset = load_train_val_test_data(tokenizer)
def model_init():
    """Load the pretrained T5 model for the checkpoint named by `model_name`.

    Returns:
        The object returned by ``T5ForConditionalGeneration.from_pretrained``,
        called with ``device_map="auto"``.
    """
    pretrained = T5ForConditionalGeneration.from_pretrained(
        model_name,
        device_map="auto",
    )
    return pretrained

def check_tokenization(encodings, tokenizer, num_examples=5):
    """Print decoded inputs and labels for the first few examples.

    Args:
        encodings: Mapping-like object whose 'input_ids' and 'labels' entries
            are indexable sequences with one row of token ids per example.
        tokenizer: Object providing ``decode(ids, skip_special_tokens=True)``.
        num_examples: Maximum number of examples to print. Defaults to 5.
    """
    # Fetch each column once instead of on every iteration, and count the
    # examples from the column itself: len() of a plain dict-like batch would
    # give the number of keys rather than the number of examples.
    input_rows = encodings['input_ids']
    label_rows = encodings['labels']
    for i in range(min(num_examples, len(input_rows))):
        # -100 marks label positions ignored by the loss; it is not a token id
        # and cannot be decoded, so drop it before decoding.
        label_ids = [token_id for token_id in label_rows[i] if token_id != -100]
        print("NL:", tokenizer.decode(input_rows[i], skip_special_tokens=True))
        print("FOL:", tokenizer.decode(label_ids, skip_special_tokens=True))

# Print a few decoded training pairs as a sanity check of the tokenization.
check_tokenization(encodings=train_dataset, tokenizer=tokenizer)

In [None]:
#-----------------------------------------------------------
# Training
#-----------------------------------------------------------

training_args = TrainingArguments(
    output_dir='./model_t5_base',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    logging_dir='./logs',
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="epoch",
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=4,
    load_best_model_at_end=True,
    warmup_steps=100,
    learning_rate=1e-4,
    weight_decay=1e-6,
    adam_beta2=0.999,
    optim="adamw_torch",
    adam_epsilon=1e-8,
    gradient_accumulation_steps=4,
    disable_tqdm=False,
    report_to="none"
)

model = model_init()
# The tokenizer's pad token is deliberately left as loaded: the datasets were
# tokenized with it, and overriding it with the EOS token here would only make
# the tokenizer saved below disagree with the data the model was trained on.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


def _mask_label_padding(batch):
    """Replace pad ids in 'labels' with -100 so the loss skips padding."""
    pad_id = tokenizer.pad_token_id
    batch['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in batch['labels']
    ]
    return batch


# Labels were padded to a fixed length with the pad id during tokenization;
# without masking, every padded position would count towards the loss.
# New names are used so the tokenized datasets from the setup cell stay intact.
train_dataset_masked = train_dataset.map(_mask_label_padding, batched=True)
validation_dataset_masked = validation_dataset.map(_mask_label_padding, batched=True)
test_dataset_masked = test_dataset.map(_mask_label_padding, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_masked,
    eval_dataset=validation_dataset_masked,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Final evaluation on the held-out test split.
evaluation_results = trainer.evaluate(test_dataset_masked)
print(evaluation_results)

# Save the model and its tokenizer side by side so they can be reloaded together.
model_path = './model_t5_base/trained_model'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
