# BusinessRule-to-Cypher Fine-Tuning


Install dependencies

In [None]:
!pip3 install -q -U peft==0.8
!pip3 install -q -U trl==0.12
!pip3 install -q -U accelerate==0.34
!pip3 install -q -U datasets
!pip3 install -q -U transformers==4.46
!pip3 install -U bitsandbytes

Import libraries

In [None]:
import os
import transformers
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

Hugging Face Authentication

In [None]:
# Provide the Hugging Face API token for authentication without storing it in the notebook.
# An HF_TOKEN that is already set in the environment is reused; otherwise the token is
# requested interactively (hidden input) so the secret never ends up in the saved file.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face API token: ")

Load the pretrained model

In [4]:
# Quantization settings used when loading the base model:
# 4-bit weights of type NF4, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Identifier of the pre-trained checkpoint that is fine-tuned in this notebook.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Tokenizer that belongs to the checkpoint; the token stored in HF_TOKEN is passed along.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'],
)

# Pre-trained model, loaded with the quantization settings from `bnb_config`.
# `device_map={"":0}` maps the whole model to the device with ID 0 (typically GPU 0);
# adjust as necessary for your hardware setup.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'],
    device_map={"":0},
    quantization_config=bnb_config,
)

Use the pretrained model for inference

In [None]:
# Prompt for the pre-trained (not yet fine-tuned) model: graph schema, business rule,
# key values and the task description. The literal has no placeholders, so it is a plain string.
text = ''' <s>[SYSTEM_PROMPT]Consider the following schema information of a Neo4j graph database storing event logs:
                Node Types: 
                Event (with properties Activity, Actor, Timestamp)
                Entity (with properties EntityType, ID)
                Relationship Types: 
                Event -[:DF]-> Event
                Event -[:CORR]-> Entity
                Entity -[:REL]- Entity[/SYSTEM_PROMPT]
                [INST]I want to check the following business rule: At least 3 events of type "Register Customer Order" or "Create Transport Document" must be performed for each customer order.
                The relevant key values for this query are:  ["Register Customer Order", "Create Transport Document"] ["Customer Order"]  []
                Create a corresponding Cypher query that returns true if the rule is satisfied, false otherwise. Ensure that the query is syntactically correct, adheres to the database schema and leverages the key values effectively.[/INST]'''
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

# `torch.autocast(device_type="cuda", ...)` replaces the deprecated `torch.cuda.amp.autocast(...)`.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        return_dict_in_generate=True,  # important option: return additional information
        output_scores=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decoding is plain text processing and does not need the autocast context.
output_decoded = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print(output_decoded)

LoRA configurations

In [None]:
# Configuration object for Low-Rank Adaptation (LoRA) of a causal language model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    # Names of the modules within the model to which LoRA layers are applied.
    target_modules=[
        "q_proj", "o_proj", "k_proj", "v_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Build a PeftModel from the base model and the LoRA configuration.
lora_model = get_peft_model(model, lora_config)

Load the training data

In [None]:
# Read the fine-tuning examples from the CSV file.
train_dataset = pd.read_csv("path/to/data_collection.csv", delimiter=',')

# Fail early with a clear message if a column used by the prompt template is missing,
# instead of hitting a KeyError later while the trainer formats the examples.
required_columns = {"NL input", "Key Values", "Cypher Query"}
missing_columns = required_columns - set(train_dataset.columns)
if missing_columns:
    raise ValueError(
        f"Training CSV is missing required column(s): {sorted(missing_columns)}"
    )

# Convert to a `datasets.Dataset` and shuffle with a fixed seed for reproducibility.
training_dataset = Dataset.from_pandas(train_dataset).shuffle(seed=42)

Input formatting

In [16]:
def formatting_prompts_func(example):
    """Render every row of a batch into one complete training prompt.

    Args:
        example: Column-wise batch (mapping from column name to a sequence of
            values) providing the 'NL input', 'Key Values' and 'Cypher Query'
            columns.

    Returns:
        A list with one prompt string per entry of 'NL input'. Each prompt
        contains the schema description, the business rule, the key values,
        the task instruction and finally the reference Cypher query followed
        by the end marker ``###EOA</s>``.
    """
    def render(row):
        # The indentation of the continuation lines is part of the prompt text; keep it as is.
        return f'''<s>[SYSTEM_PROMPT]Consider the following schema information of a Neo4j graph database storing event logs:
                Node Types: 
                Event (with properties Activity, Actor, Timestamp)
                Entity (with properties EntityType, ID)
                Relationship Types: 
                Event -[:DF]-> Event
                Event -[:CORR]-> Entity
                Entity -[:REL]- Entity[/SYSTEM_PROMPT]
                [INST]I want to check the following business rule: {example['NL input'][row]}
                The relevant key values for this query are: {example['Key Values'][row]}
                Create a corresponding Cypher query that returns true if the rule is satisfied, false otherwise. Ensure that the query is syntactically correct, adheres to the database schema, and leverages the key values effectively.[/INST]
                {example['Cypher Query'][row]} ###EOA</s>'''

    return [render(row) for row in range(len(example['NL input']))]


Training configurations

In [None]:

# Training hyperparameters for the supervised fine-tuning run.
# NOTE(review): "adamw_hf" is, as far as I know, flagged as deprecated upstream in favour of
# "adamw_torch" — confirm before changing, since it selects a different optimizer implementation.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 samples per device x 4 accumulation steps
    learning_rate=1e-4,
    warmup_steps=50,
    optim="adamw_hf",
    bf16=True,
    logging_steps=1,
    save_strategy="no",
)

# Set up the SFTTrainer: base model, shuffled training data, LoRA configuration and
# the function that turns dataset rows into prompt strings.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    formatting_func=formatting_prompts_func,
    peft_config=lora_config,
)

Train the model

In [None]:
# Run the fine-tuning with the trainer configured above in this notebook.
trainer.train()

Save the fine-tuned LoRA weights

In [None]:
# Only the extra PEFT (LoRA) weights that were trained are saved, which makes the
# result efficient to store and load.
# The adapter is taken from `trainer.model`, i.e. the model object the trainer actually
# optimized, instead of relying on a separately created wrapper sharing its state.
trainer.model.save_pretrained("lora_adapter_7B")

Load the fine-tuned model

In [None]:
# Load the fine-tuned model from the saved adapter directory, mapped to device 0.
# NOTE(review): no quantization config or dtype is passed here, unlike the initial model
# load — confirm the resulting memory footprint is acceptable on the target GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    "lora_adapter_7B",
    device_map={"":0},
)

# Tokenizer of the base checkpoint the adapter was trained on.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    token=os.environ['HF_TOKEN'],
)

Use the fine-tuned model for inference

In [None]:
# Prompt for the fine-tuned model. The literal has no placeholders, so it is a plain string.
# NOTE(review): this prompt differs slightly from the training template in
# `formatting_prompts_func` (leading space before `<s>`, no comma before "and leverages");
# consider aligning both so inference matches the training format exactly.
text = ''' <s>[SYSTEM_PROMPT]Consider the following schema information of a Neo4j graph database storing event logs:
                Node Types: 
                Event (with properties Activity, Actor, Timestamp)
                Entity (with properties EntityType, ID)
                Relationship Types: 
                Event -[:DF]-> Event
                Event -[:CORR]-> Entity
                Entity -[:REL]- Entity[/SYSTEM_PROMPT]
                [INST]I want to check the following business rule: Before a "Book Vehicles" event occurs, at least 2 "Create Transport Document" events should have happened for the same transport document.
                The relevant key values for this query are: Activities: ["Create Transport Document", "Book Vehicles"], EntityType: ["Transport Document"], Actor: []
                Create a corresponding Cypher query that returns true if the rule is satisfied, false otherwise. Ensure that the query is syntactically correct, adheres to the database schema and leverages the key values effectively.[/INST]'''
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

# `torch.autocast(device_type="cuda", ...)` replaces the deprecated `torch.cuda.amp.autocast(...)`.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        return_dict_in_generate=True,
        output_scores=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decoding is plain text processing and does not need the autocast context.
output_decoded = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print(output_decoded)