In [1]:
import transformers
import peft
import pandas as pd
import datasets
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [2]:
def load_model(model_name: str):
    # load model
    m = transformers.AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map='cuda',
        trust_remote_code=False,
        revision='main',
    )
    
    return m.to(device)

model_name = 'tiiuae/falcon-7b'
m = load_model(model_name=model_name)

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def load_and_prepare_dataset(model_name: str):
    df = pd.read_csv('/content/data_openai_api_with_mask.csv')
    df = df[['text', 'is_major_principle_here', 'ethical_us']]
    transparency = "Transparency: Transparency can typically be understood in two ways: the transparency of the AI technology itself and the transparency of the AI organisations developing and using it. Throughout our analysis, transparency was regularly discussed directly, or in relation to processes required to ensure it, such as explainability, understandability and communication."
    n_malef = "Non-maleficence: The principle of non-maleficence gained attention, and in its most basic form, it means to do no harm or avoid doing harm to others"
    resp = "Responsibility: The principle of responsibility is base on responsibility of the developers and stakeholders over the AI, accountability, liability and acting with integrity"
    privacy = "Privacy: Related to privacy of personal data, because of the large abundance of data that is required for AI to work, it is important that individuals privacy is not jeopardised as a result"
    benefiecnce = "Beneficence: Beneficence essentially means to do good, to carry out an activity with the intention of benefitting someone or society as a whole"
    f_and_a = "Freedom and autonomy: Democratic societies place value in freedom and autonomy, and it is important that AI use does not encumber or harm these for us"
    sus = "Sustainability: All fields and disciplines are affected and need to incorporate sustainability agendas"
    dig = "Dignity: Human dignity is the recognition that individuals have inherent worth and that their rights should be respected"
    j_and_f = "Justice and fairness: The issue of discrimination and unfair results resulting from algorithms has become a significant concern. It is imperative that systems are designed to ensure that they are free from any form of unfairness and inequality."
    trust = "Trust: Trust is built by keeping promises, making sure systems work properly and protecting data responsibly. Organisations must prove their trustworthiness by ensuring that their technologies are secure and effective."
    instruction = f'You are an ethical requirements engineer translating requirements to ethical user stories based on one of the 10 ethical principles:  {transparency}\n {n_malef}\n {resp}\n {privacy}\n {benefiecnce}\n {f_and_a}\n {sus}\n {dig}\n {j_and_f}\n {trust}\n  and the TEMPLATE:\n As a <persona> i want to <do something> <so that> \n\n Now following this template and the ethical principles definition choose one of the ethical principles and transform the requirement below into a brief description of an ethical user story based on the choosen ethical principle, substituting <persona> for the one that requires this functionality, <do something> for what the <persona> wants to do and <so that> to the end goal of the functionality'

    df['data'] = df.apply((lambda row: f'''### Human: {instruction} \n{row['text']} \n  ### Assistant: {row['ethical_us']}'''), axis=1)
    
    t = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)
    t.add_special_tokens({'pad_token': '<PAD>'})
    dt = datasets.Dataset.from_pandas(df)
    dt = dt.train_test_split(test_size=0.3)

    t.pad_token = t.eos_token
    data_collator = transformers.DataCollatorForLanguageModeling(t, mlm=False)

    return dt, data_collator, t

data, data_collator, t = load_and_prepare_dataset(model_name=model_name)


In [4]:
def tokenize_function(examples):
    # extract text
    text = examples["data"]

    #tokenize and truncate text
    t.truncation_side = "left"
    tokenized_inputs = t(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

tokenized_data = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

In [5]:
def train_model(model, lr, batch_size, num_epochs, tokenized_data, collator):
    model.train() # training state
    model.gradient_checkpointing_enable()
    model = peft.prepare_model_for_kbit_training(model) # turn into qlora

    # lora config
    config = peft.LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj"],
        lora_dropout=.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
    config.inference_mode = False

    model = peft.get_peft_model(model, config) # model in lora style

    training_args = transformers.TrainingArguments(
        output_dir= "checkpoints_output",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        # weight_decay=0.01,
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        warmup_steps=2,
        fp16=False,
        optim="paged_adamw_8bit",
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        args=training_args,
        data_collator=collator
    )

    # train model
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    # renable warnings
    model.config.use_cache = True

    return model

final_model = train_model(m, 2e-4, 4, 10, tokenized_data, data_collator)



Epoch,Training Loss,Validation Loss
1,3.0388,3.020001
2,3.0381,3.020001
3,3.0386,3.020001




In [None]:
transparency = "Transparency: Transparency can typically be understood in two ways: the transparency of the AI technology itself and the transparency of the AI organisations developing and using it. Throughout our analysis, transparency was regularly discussed directly, or in relation to processes required to ensure it, such as explainability, understandability and communication."
n_malef = "Non-maleficence: The principle of non-maleficence gained attention, and in its most basic form, it means to do no harm or avoid doing harm to others"
resp = "Responsibility: The principle of responsibility is base on responsibility of the developers and stakeholders over the AI, accountability, liability and acting with integrity"
privacy = "Privacy: Related to privacy of personal data, because of the large abundance of data that is required for AI to work, it is important that individuals privacy is not jeopardised as a result"
benefiecnce = "Beneficence: Beneficence essentially means to do good, to carry out an activity with the intention of benefitting someone or society as a whole"
f_and_a = "Freedom and autonomy: Democratic societies place value in freedom and autonomy, and it is important that AI use does not encumber or harm these for us"
sus = "Sustainability: All fields and disciplines are affected and need to incorporate sustainability agendas"
dig = "Dignity: Human dignity is the recognition that individuals have inherent worth and that their rights should be respected"
j_and_f = "Justice and fairness: The issue of discrimination and unfair results resulting from algorithms has become a significant concern. It is imperative that systems are designed to ensure that they are free from any form of unfairness and inequality."
trust = "Trust: Trust is built by keeping promises, making sure systems work properly and protecting data responsibly. Organisations must prove their trustworthiness by ensuring that their technologies are secure and effective."
instruction = f'You are an ethical requirements engineer translating requirements to ethical user stories based on one of the 10 ethical principles:  {transparency}\n {n_malef}\n {resp}\n {privacy}\n {benefiecnce}\n {f_and_a}\n {sus}\n {dig}\n {j_and_f}\n {trust}\n  and the TEMPLATE:\n As a <persona> i want to <do something> <so that> \n\n Now following this template and the ethical principles definition choose one of the ethical principles and transform the requirement below into a brief description of an ethical user story based on the choosen ethical principle, substituting <persona> for the one that requires this functionality, <do something> for what the <persona> wants to do and <so that> to the end goal of the functionality'

prompt_template = lambda comment: f'''### Human: {instruction} \n{comment} \n'''
comment = "The system shall show the meetings to the user in ascending time order by default."
prompt = prompt_template(comment)

final_model.eval()
inputs = t(prompt, return_tensors="pt")
outputs = final_model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=1024)
print(t.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] You are a ethical requirements engineer translating requirements to ethical user stories based on 10 ethical principles of: Transparency, Non-maleficence Responsibility, Privacy, Beneficence, Freedom and autonomy, Sustainability, Dignity, Justice and Trustand the template listed below
 TEMPLATE:
 As a [persona] i want to [do something] [so that] 

 Now following this template and the ethical principles definition, transform the requirement below into a brief description of an ethical user story, substituing [persona] for the one that requires this functionality, [do something] for what the [persona] wants to do and [so that] to the end goal of the functionality 
The system shall show the meetings to the user in ascending time order by default. 
[/INST] As a user who values Transparency and Beneficence, I want the system to display the meetings in ascending time order by default, so that I can easily understand the scheduling of my events and make informed decisions about my 