In [None]:
!pip install -q -U transformers datasets evaluate
!pip install -q -U bitsandbytes
!pip install -q -U trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import login
from google.colab import userdata
HF_API_KEY = userdata.get('HF_API_KEY')

login(HF_API_KEY)

# Data Generation

In [None]:
import json
import string

# Dependency labels the rule-based extractor has a rewrite rule for.
allowed_dependencies = set(
    "acomp advmod agent amod attr aux auxpass "
    "case cc ccomp compound conj det dobj "
    "nmod nsubj nsubjpass "
    "pcomp pobj prep poss ROOT xcomp".split()
)

def has_required_dependencies(doc, allowed_dependencies):
    """Decide whether a parsed sentence is worth handing to the extractor.

    Args:
        doc: Parsed sentence; it is iterated for tokens exposing ``dep_`` and
            its ``text`` attribute is read.
        allowed_dependencies: Iterable of dependency labels that are supported.

    Returns:
        False as soon as any token carries a label outside
        ``allowed_dependencies``. Otherwise True when the lower-cased text
        contains "is a" / "is an", or when the parse has a ROOT, a subject
        (nsubj/nsubjpass) and an object (dobj/pobj); False in all other cases.
    """
    labels = {token.dep_ for token in doc}
    if labels.difference(allowed_dependencies):
        return False

    lowered = doc.text.lower()
    if "is a" in lowered or "is an" in lowered:
        return True

    has_root = 'ROOT' in labels
    has_subject = bool(labels & {'nsubj', 'nsubjpass'})
    has_object = bool(labels & {'dobj', 'pobj'})
    return has_root and has_subject and has_object

from spacy import load
# spaCy pipeline used for dependency parsing of every sentence.
nlp = load("en_core_web_sm")
# Read the whole knowledge file into one string. The encoding is spelled out so
# that non-ASCII characters in the sentences are decoded the same way
# regardless of the platform's default text encoding.
with open("DSA_knowledge.txt", "r", encoding="utf-8") as file:
    sentences = file.read()

In [None]:
# Accumulators filled while the sentences are processed.
all_graphs = list()                                  # one {"edges", "sentence"} dict per converted sentence
unhandled_sentences, error_sentences = set(), set()  # sentences set aside / sentences whose conversion raised

from logging import error
for sentence in [s.strip().rstrip(string.punctuation) for s in sentences.strip().split('\n') if s.strip()]:
  doc = nlp(sentence)
  # displacy.render(doc, style="dep", jupyter=True, options={'distance': 90})
  if not has_required_dependencies(doc, allowed_dependencies):
      unhandled_sentences.add(sentence)
      continue

  try:
    temp_graph = {
        "nodes": {},  # {'nodes': {0: {'pos': 0, 'label': 'X', 'dep': 'nsubj'}, 4: {'pos': 4, 'label': 'Y', 'dep': 'pobj'}},
        "edges": [],  # 'edges': [(0, 4, 'is subclass of')]}
        "sentence": sentence
    }

    edge_mapping = {
        'subject_nodes': {},  # {1: {0}} # multiple subject nodes possible
        'object_nodes': {},   # {1: 4}
        'edge_ids': set()     # {1}
    }

    temp_graph["nodes"] = {token['id']: {"pos": token['id'], "label": doc.text, "dep": token['dep']}
                          for token, doc in zip(doc.to_json()['tokens'], doc)}

    temp_graph["edges"] = [(token['head'], token['id'], token['dep'])
                            for token in doc.to_json()['tokens'] if token['head'] != token['id']]

    root_node = list(filter(lambda node: temp_graph["nodes"][node]['dep'] == 'ROOT', temp_graph["nodes"]))[0]
    stopping = False
    while not stopping:
      for edge in sorted(temp_graph["edges"], key=lambda x: abs(x[0] - x[1])):

        source_pos, target_pos, meta = edge

        if source_pos not in temp_graph["nodes"] or target_pos not in temp_graph["nodes"]:
            continue
        #print(edge)
        source_metadata = temp_graph["nodes"][source_pos]
        target_metadata = temp_graph["nodes"][target_pos]
        try:
            match (source_metadata, meta, target_metadata):
                case {'label': s, **source}, 'compound' | 'amod' | 'aux' |'auxpass' | 'advmod', {'label': t, **target}:
                    source_metadata['label'] = f"{t} {s}"
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'agent', {'label': t, **target}:
                    source_metadata['label'] = f"{s} {t}"
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'pobj'), None)
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = next_node
                    temp_graph['edges'].append((source_pos, next_node, 'pobj'))
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'case' | 'cc', {'label': t, **target}:
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'det', {'label': t, **target}:
                    temp_graph['nodes'][source_pos]['det'] = t
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'attr'|'acomp', {'label': 'subclass'|'attribute'|'dimension'|'kind'|'threat'|'result'|'type'|'equal'|'form', **target}: #is(head)--attr--subclass(tail)--prep--of(child)--pobj--Risk
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'prep'), None)
                    obj_node = next((n for src, n, label in temp_graph["edges"] if src == next_node and label == 'pobj'), None)
                    source_metadata['label'] = f"{s} {target_metadata['label']} {temp_graph['nodes'][next_node]['label']}" #is-->issubclassof
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = obj_node
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges'])) # remove edge: is--subclass
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges'])) # remove edge: subclass--of
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == next_node and edge[1] == obj_node), temp_graph['edges'])) # remove edge: of--Y
                    temp_graph['edges'].append((source_pos, obj_node, 'pobj')) #connect edge from 'is' node to obj node
                    del temp_graph['nodes'][target_pos] # remove node: 'subclass'
                    del temp_graph['nodes'][next_node]  # remove node: 'of'
                    continue

                case {'label': s, **source}, 'attr'|'acomp', {'label': t, **target}: #is-attr-Y
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = target_pos
                    continue

                case {'dep': 'ROOT', 'label': s, **source}, 'prep'|'xcomp', {'label': t, **target}: #attributes(ROOT)--prep--to #helps--xcomp--see--pobj--X
                    source_metadata['label'] = f"{s} {t}"
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj', 'dobj'}), None)
                    if next_node:
                      temp_graph['edges'].append((source_pos, next_node, 'pobj'))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                      del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'prep', {'label': t, **target}: #*-dobj-assessment--prep--of|*-attr-(a)dimension-prep-of
                    if next((n for src, n, label in temp_graph["edges"] if n == source_pos and label == 'attr'), None) is None:
                      next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj'}), None)
                      if next_node: #Date-prep-of-pobj-birth
                        source_metadata['label'] = f"{s} {t} {temp_graph['nodes'][next_node]['label']}"
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                        del temp_graph['nodes'][target_pos]
                        del temp_graph['nodes'][next_node]
                      else:
                        edge_mapping['edge_ids'].add(target_pos)
                        edge_mapping['subject_nodes'].setdefault(target_pos, set()).add(source_pos)
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    continue

                case {'label': s, **source}, 'poss', {'label': t, **target}:
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == source_pos and label == 'conj'), None)
                    if next_node:
                      temp_graph['edges'].append((temp_graph["nodes"][next_node]['label'],
                                                  temp_graph["nodes"][target_pos]['label'],
                                                  'of'))
                    continue

                case {'label': s, **source}, 'nmod', {'label': t, **target}:
                    source_metadata['label'] = f"{t} {s}"
                    incoming_node = next((src for src, n, label in temp_graph["edges"] if target == source_pos and label == 'nsubj'), None)
                    if 'conj' in target:
                        target['conj']['nodeId'] = target['conj']['text'] + f" {s}"
                        edge_mapping['subject_nodes'][incoming_node].add(target['conj']['nodeId'])
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'conj', {'label': t, **target}:
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    temp_graph['nodes'][source_pos]['conj'] = {'text': t, 'nodeId': target_pos}
                    continue

                case {'label': s, **source}, 'pcomp', {'label': t, **target}: #in--pcomp--explaining--dobj--x
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj', 'dobj'}), None)
                    if next_node:
                      temp_graph['nodes'][root_node]['label'] += f" {t}"  #if not work f" {temp_graph['nodes'][root_node]['label']} {t}
                      temp_graph['edges'].append((root_node, next_node, 'dobj'))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                      del temp_graph['nodes'][source_pos]
                      del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'ccomp', {'label': t, **target}: #Design interface can help users understand AI decisions
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'nsubj'), None)
                    if next_node:
                      edge_mapping['object_nodes'][source_pos] = next_node
                    continue

                case {'label': s, **source}, 'nsubj' | 'nsubjpass', {'label': t, **target}:
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['subject_nodes'].setdefault(source_pos, set()).add(target_pos)
                    if 'conj' in target:
                      edge_mapping['subject_nodes'][source_pos].add(target_metadata['nodeId'])
                    continue

                case {'label': s, **source}, 'dobj' | 'pobj', {'label': t, **target}:
                    if next((src for src, n, label in temp_graph["edges"] if src == source_pos and label == 'prep'), None):
                      source_metadata['label'] = f"{s} {t}"
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                      del temp_graph['nodes'][target_pos]
                    #assign object outside loop
                    continue

                case another:
                    print("another:", edge)
                    unhandled_sentences.add(sentence)
                    stopping = True
                    continue

        except Exception as e:
                print(f"Error occurred in sentence: {sentence} with edge: {edge}, error: {e}")
                unhandled_sentences.add(sentence) # throw error
                error_sentences.add(sentence)
                stopping = True
                continue
      else:
          break

    # Update object nodes
    edge_mapping['object_nodes'].update({
        edge_id: next((tail for head, tail, meta in temp_graph['edges']
                      if meta in {'dobj', 'pobj'}), None)
        for edge_id in edge_mapping['edge_ids']
        if edge_id not in edge_mapping['object_nodes']
    })

    for edge_id, obj_node in edge_mapping['object_nodes'].items():
        if obj_node is None:
            print(f"Missing object node for edge ID: {edge_id}")

    # create final mapping
    for edge_id in edge_mapping['edge_ids']:
        subject_nodes = edge_mapping['subject_nodes'][edge_id]
        object_node = edge_mapping['object_nodes'][edge_id]
        edge_node = temp_graph['nodes'][edge_id]
        for subject_node in subject_nodes:
            #temp_graph['edges'].append((subject_node, object_node, edge_node['label']))
            temp_graph['edges'].append((temp_graph["nodes"][subject_node]['label'],
                                        temp_graph["nodes"][object_node]['label'],
                                        edge_node['label']))
            temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == edge_id and edge[1] == subject_node), temp_graph['edges']))

        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == edge_id and edge[1] == object_node), temp_graph['edges']))
        del temp_graph['nodes'][edge_id]

    temp_graph['edges'] = list(set(temp_graph['edges'])-set([edge for edge in temp_graph['edges'] if edge[2] in allowed_dependencies]))

    # all_graphs.append(temp_graph) # we don't need nodes to be pat of json, as edges have them
    all_graphs.append({
      "edges": temp_graph["edges"],
      "sentence": temp_graph["sentence"]
    })


  except Exception as e:
    print(f"Failed to process sentence: {sentence}, error: {e}")
    unhandled_sentences.add(sentence)


# Write the sentences that could not be converted, one per line.
with open("unhandled_sentences.txt", "w") as file:
  for unhandled in unhandled_sentences:
    file.write(unhandled + "\n")

# remove error sentences
# The list is rebuilt in place: calling remove() on a list while iterating over
# it skips the element that follows each removal, so some graphs survived.
all_graphs[:] = [g for g in all_graphs if g["sentence"] not in error_sentences]

# Training data - for fine tuning the HF model:
with open('graph_data.json', 'w') as json_file:
    json.dump(all_graphs, json_file, indent=4, ensure_ascii=False)

Error occurred in sentence: AI system’s capabilities and limitations should be communicated to users with edge: (8, 3, 'nsubjpass'), error: 'nodeId'
Error occurred in sentence: Minimisation and reporting of negative impacts is a dimension of Accountability with edge: (6, 0, 'nsubj'), error: 'nodeId'
Error occurred in sentence: Privacy and data governance ensures prevention of harm with edge: (3, 0, 'nmod'), error: None
Error occurred in sentence: Deception and unfair manipulation is a threat to freedom of individual with edge: (4, 0, 'nsubj'), error: 'nodeId'
Error occurred in sentence: Human agency and oversight is a requirement with edge: (4, 1, 'nsubj'), error: 'nodeId'
Error occurred in sentence: Technical robustness and safety is a requirement with edge: (4, 1, 'nsubj'), error: 'nodeId'
Error occurred in sentence: Privacy and data governance is a requirement with edge: (3, 0, 'nmod'), error: None


In [None]:
all_graphs[:5]

[{'edges': [('Data Preprocessing', 'Data Science Task', 'is subclass of')],
  'sentence': 'Data Preprocessing is subclass of Data Science Task'},
 {'edges': [('Supervised Learning', 'Data Science Task', 'is subclass of')],
  'sentence': 'Supervised Learning is subclass of Data Science Task'},
 {'edges': [('Unsupervised Learning', 'Data Science Task', 'is subclass of')],
  'sentence': 'Unsupervised Learning is subclass of Data Science Task'},
 {'edges': [('Regression', 'Supervised Learning', 'is subclass of')],
  'sentence': 'Regression is subclass of Supervised Learning'},
 {'edges': [('Classification', 'Supervised Learning', 'is subclass of')],
  'sentence': 'Classification is subclass of Supervised Learning'}]

# Data Structuring

In [None]:
from datasets import Dataset
import json
# Read the extracted graphs back from graph_data.json.
with open('graph_data.json', 'r') as graph_file:
    graph_data = json.load(graph_file)

In [None]:
# Define a function to build a prompt from a data example
def format_instruction(sentence, edges):
    """Render the relationship-extraction prompt for one sentence.

    Args:
        sentence: Input sentence; surrounding whitespace is stripped before it
            is placed in the prompt.
        edges: Value rendered after "Output: " via normal f-string formatting.
            Pass an empty string to leave the answer open.

    Returns:
        The prompt text: instructions, one worked example, then the task.
    """
    cleaned_sentence = sentence.strip()
    prompt = f"""
Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {{'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}}

Task:
Input Sentence: "{cleaned_sentence}"
Output: {edges}
"""
    return prompt


In [None]:
# One training prompt per graph; the whole graph dict is rendered as the expected output.
train_data_dict = {
    "text": [
        format_instruction(graph.get("sentence", ""), graph)
        for graph in graph_data
    ]
}


In [None]:
# Convert the dictionary into a Hugging Face Dataset
dataset = Dataset.from_dict(train_data_dict)

dataset

Dataset({
    features: ['text'],
    num_rows: 252
})

In [None]:
# show an example
print(dataset[1]["text"])


Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}

Task:
Input Sentence: "Supervised Learning is subclass of Data Science Task"
Output: {'edges': [['Supervised Learning', 'Data Science Task', 'is subclass of']], 'sentence':

In [None]:
# Hold out 10% of the prompts for validation. The seed makes the split
# reproducible across runs, and the two parts are selected by key instead of
# relying on the order in which .values() yields them.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = dataset_splits["train"], dataset_splits["test"]

# Quantized Model Loading

In [None]:
from transformers import AutoTokenizer
# Hub id of the base checkpoint that is fine-tuned below.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the tokenizer for that checkpoint, authenticating with the Hugging Face
# token read from the Colab secrets.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
    token=HF_API_KEY
    )

# Reuse the end-of-sequence token as padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
import torch

# Set up the quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Q = 4 bits
    bnb_4bit_use_double_quant=True,        # double quantization, quantizing the quantization constants for saving an additional 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat Quantization (optimal for normal weights; enforces w ∈ [-1,1])
    bnb_4bit_compute_dtype=torch.bfloat16  # Dequantize to 16-bits before computations (as in the paper)
    # NOTE(review): the compute dtype is bfloat16 while the SFTConfig further down
    # trains with fp16=True — confirm the two are meant to differ.
)
# Load the base model with the 4-bit configuration; device_map="auto" leaves the
# device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    token=HF_API_KEY)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Sanity check: let the loaded model complete one prompt whose answer is left empty.
ex_inp = format_instruction(graph_data[5]["sentence"], "")
inputs = tokenizer(ex_inp, return_tensors='pt').to("cuda")
generated_batch = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=50,
)
output_tokens = generated_batch[0]     # the batch holds a single sequence
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(output)


Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}

Task:
Input Sentence: "Binary Classification is subclass of Classification"
Output: 
{'sentence': 'Binary Classification is subclass of Classification',
 'edges': [['Binary

# Add LoRA Layer

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
# Prepare the model for LoRA fine-tuning
lora_config = LoraConfig(
    r=16,  # rank of the low-rank matrices
    lora_alpha=64,  # LoRA scaling numerator (4x the rank here)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # adapters are attached to the attention projections only
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM"  # LoRA fine-tuning for causal language modeling task
)

# Make the quantized model trainable, then wrap it with the LoRA adapters.
# NOTE(review): the SFTTrainer cell later also receives peft_config=lora_config
# together with this already-wrapped model — confirm the installed TRL version
# accepts both, or pass the adapter configuration in only one place.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
def save_model(model, tokenizer, peft_model_path="./fine-tuned-mistral"):
  """Save a model and its tokenizer into the same directory.

  Args:
      model: Object exposing ``save_pretrained(path)``.
      tokenizer: Object exposing ``save_pretrained(path)``.
      peft_model_path: Target directory. Defaults to the location used so far,
          so existing two-argument calls behave exactly as before.
  """
  # Save the trained model
  model.save_pretrained(peft_model_path)

  # Save the tokenizer
  tokenizer.save_pretrained(peft_model_path)

save_model(model, tokenizer)

# Fine-Tuning

In [None]:
from trl import SFTConfig

training_arguments = SFTConfig(
    fp16=True,                           # Mixed-precision training in float16
    dataset_text_field="text",           # Specify the text field in the dataset for training
    max_seq_length=512,                  # Set the maximum sequence length for the training data
                                         # NOTE(review): trl is installed unpinned; newer releases may
                                         # expect a different name for this argument — confirm.


    # batch-related
    per_device_train_batch_size=2,       # Batch size per device
    gradient_accumulation_steps=4,       # Accumulate 4 batches per optimizer step (effective batch 2 x 4 = 8 per device)

    # optimizer-related
    optim="paged_adamw_32bit",           # Paged AdamW variant; "32bit" presumably refers to the optimizer state precision, not to the GPU — confirm
    learning_rate=1e-4,                  # Peak learning rate
    warmup_ratio=0.05,                   # Learning rate ramps up linearly from 0 to 1e-4 over the first 5% of training steps
    lr_scheduler_type="cosine",          # Cosine learning-rate schedule
    max_grad_norm=0.3,                   # Clip gradients when their norm exceeds 0.3 (prevent gradient explosion)

    # epochs and saving
    num_train_epochs=5,                  # Number of Epochs
    save_strategy="epoch",               # Save after each epoch
    output_dir="./epoch-finetuned",        # Where to save the model

    # validation
    eval_strategy="steps",         # For the next argument
    eval_steps=0.1,                      # Ratio of total training steps: evaluate every 10% (every 14 of the 140 steps in the logged run)

    # logging-related
    report_to=[],                        # No external experiment tracker
    logging_steps=1,                     # Number of update steps between two logs
    group_by_length=True,                # Minimize padding by grouping sentences of similar length
    seed=42,                             # For consistent results
)
model.gradient_checkpointing_enable()    # Store less activations and recompute later
model.config.use_cache = False           # Disable using attention output cache. Should be enabled in inference.

In [None]:
# Import the SFTTrainer from HuggingFace TRL library
from trl import SFTTrainer

# Initialize the trainer
trainer = SFTTrainer(
    # Assign the model and tokenizer
    model=model,
    processing_class=tokenizer,

    # Provide the training and validation datasets
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

    # Pass the LoRA configuration
    # NOTE(review): `model` was already wrapped with get_peft_model(model, lora_config)
    # in the LoRA cell — confirm the installed TRL version accepts a PEFT-wrapped
    # model together with peft_config, otherwise drop one of the two.
    peft_config=lora_config,

    # Set the training hyperparameters
    args=training_arguments,
)

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [None]:
trainer.train()

Step,Training Loss,Validation Loss
14,0.041,0.069731
28,0.0476,0.067196
42,0.039,0.06701
56,0.0342,0.069256
70,0.0328,0.066113
84,0.0283,0.068049
98,0.0225,0.070813
112,0.0246,0.071423
126,0.0226,0.071374
140,0.0218,0.071644


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=140, training_loss=0.033720861721251695, metrics={'train_runtime': 1677.8607, 'train_samples_per_second': 0.673, 'train_steps_per_second': 0.083, 'total_flos': 1.2849229947076608e+16, 'train_loss': 0.033720861721251695, 'epoch': 4.95575221238938})

In [None]:
save_model(trainer.model, tokenizer)

In [None]:
!zip -r /content/fine-tuned-mistral.zip /content/fine-tuned-mistral
from google.colab import files
files.download("/content/fine-tuned-mistral.zip")

updating: content/fine-tuned-mistral/ (stored 0%)
updating: content/fine-tuned-mistral/tokenizer_config.json (deflated 95%)
updating: content/fine-tuned-mistral/special_tokens_map.json (deflated 73%)
updating: content/fine-tuned-mistral/tokenizer.json (deflated 85%)
updating: content/fine-tuned-mistral/adapter_config.json (deflated 52%)
updating: content/fine-tuned-mistral/tokenizer.model (deflated 61%)
updating: content/fine-tuned-mistral/README.md (deflated 66%)
updating: content/fine-tuned-mistral/adapter_model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Evaluation of the model

In [None]:
!unzip /content/fine-tuned-mistral.zip

Archive:  /content/fine-tuned-mistral.zip
   creating: content/fine-tuned-mistral/
  inflating: content/fine-tuned-mistral/tokenizer_config.json  
  inflating: content/fine-tuned-mistral/special_tokens_map.json  
  inflating: content/fine-tuned-mistral/tokenizer.json  
  inflating: content/fine-tuned-mistral/adapter_config.json  
  inflating: content/fine-tuned-mistral/tokenizer.model  
  inflating: content/fine-tuned-mistral/README.md  
  inflating: content/fine-tuned-mistral/adapter_model.safetensors  


In [None]:
!pip install -q -U transformers
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import login
from google.colab import userdata
HF_API_KEY = userdata.get('HF_API_KEY')

login(HF_API_KEY)

In [None]:
# For loading a PEFT model, we need to use a special object for CausalLM from PEFT
# instead of the regular HuggingFace object.
from peft import AutoPeftModelForCausalLM
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer

# Same 4-bit quantization settings that were used when the base model was
# loaded for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Q = 4 bits
    bnb_4bit_use_double_quant=True,        # double quantization, quantizing the quantization constants for saving an additional 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat Quantization (optimal for normal weights; enforces w ∈ [-1,1])
    bnb_4bit_compute_dtype=torch.bfloat16  # Dequantize to 16-bits before computations (as in the paper)
)

# Load the fine-tuned model
# The path is relative: the archive stores entries under "content/...", so
# unzipping it creates a "content/fine-tuned-mistral" folder in the working directory.
peft_model_path = "./content/fine-tuned-mistral"
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    quantization_config=bnb_config  # Load with 4-bit quantization
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be added to the right side of the sequences
# NOTE(review): only single prompts are generated below, so no padding is applied;
# confirm the padding side before generating for batches of prompts.
tokenizer.padding_side = "right"

# Enable attention cache during inference
tuned_model.config.use_cache = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Define a function to build a prompt from a data example
def format_instruction(sentence, edges=""):
    """Build the edge-extraction prompt for a single sentence.

    Args:
        sentence: Input sentence. Leading and trailing whitespace is stripped
            before the sentence is inserted into the prompt.
        edges: Value interpolated after the final ``Output:`` marker. Defaults
            to an empty string, which leaves the answer slot open so the prompt
            can be used directly for generation.

    Returns:
        The prompt string: the fixed instructions, one worked example, and the
        task section for ``sentence``.
    """
    return f"""
Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {{'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}}

Task:
Input Sentence: "{sentence.strip()}"
Output: {edges}
"""


In [None]:
# Test the model on a hand-written sentence and print the decoded sequence
# (prompt followed by the generated continuation).
sent = "Binary Classification is subclass of Classification"
ex_inp = format_instruction(sent,"")
inputs = tokenizer(ex_inp, return_tensors='pt')
inputs = inputs.to("cuda")
output_tokens = tuned_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    # A 50-token budget cut the answer off in the middle of the output
    # dictionary; 128 leaves room for the full record.
    max_new_tokens=128,)[0]     # batch of tokens with one sequence
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(output)


Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}

Task:
Input Sentence: "Binary Classification is subclass of Classification"
Output: 
{'edges': [['Binary Classification', 'Classification', 'is subclass of']], 'sentence': 

In [None]:
# Test the model on a sentence taken from the generated graph data and print
# the decoded sequence (prompt followed by the generated continuation).
ex_inp = format_instruction(graph_data[5]["sentence"],"")
inputs = tokenizer(ex_inp, return_tensors='pt')
inputs = inputs.to("cuda")
output_tokens = tuned_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    # A 50-token budget cut the answer off in the middle of the output
    # dictionary; 128 leaves room for the full record.
    max_new_tokens=128,)[0]     # batch of tokens with one sequence
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(output)


Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}

Task:
Input Sentence: "Binary Classification is subclass of Classification"
Output: 
{'edges': [['Binary Classification', 'Classification', 'is subclass of']], 'sentence': 

In [None]:
# error sentence
# Sort before indexing: iterating a set has no stable order, so
# list(error_sentences)[1] could select a different sentence after a restart.
ex_inp = format_instruction(sorted(error_sentences)[1],"")
inputs = tokenizer(ex_inp, return_tensors='pt')
inputs = inputs.to("cuda")
output_tokens = tuned_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    # A 50-token budget cut the answer off in the middle of the output
    # dictionary; 128 leaves room for the full record.
    max_new_tokens=128,)[0]     # batch of tokens with one sequence
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(output)


Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}

Task:
Input Sentence: "AI system’s capabilities and limitations should be communicated to users"
Output: 
{'edges': [['AI system’s capabilities', 'users', 'should be commun

# Host API for inferencing

In [None]:
!unzip /content/fine-tuned-mistral.zip

Archive:  /content/fine-tuned-mistral.zip
   creating: content/fine-tuned-mistral/
  inflating: content/fine-tuned-mistral/tokenizer_config.json  
  inflating: content/fine-tuned-mistral/special_tokens_map.json  
  inflating: content/fine-tuned-mistral/tokenizer.json  
  inflating: content/fine-tuned-mistral/adapter_config.json  
  inflating: content/fine-tuned-mistral/tokenizer.model  
  inflating: content/fine-tuned-mistral/README.md  
  inflating: content/fine-tuned-mistral/adapter_model.safetensors  


In [None]:
# Dependencies for the inference server: fastapi/uvicorn serve app.py, while
# transformers and bitsandbytes are upgraded (-U) for loading the quantized model.
!pip install -q fastapi uvicorn
!pip install -q -U transformers
!pip install -q -U bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install localtunnel, which a later cell runs through npx to expose port 8000.
!npm install --silent localtunnel

In [None]:
# Read the Hugging Face token from the Colab secret named 'HF_API_KEY'.
from google.colab import userdata
HF_API_KEY = userdata.get('HF_API_KEY')

In [None]:
import os

# Hand the Hugging Face token to the server through the environment instead of
# writing it into app.py. Processes started from this kernel afterwards
# (e.g. `!uvicorn app:app ...`) inherit the variable.
os.environ["HF_API_KEY"] = HF_API_KEY

# Source code of the FastAPI inference server; written verbatim to app.py below.
app_file_str = """import os

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Post body model
class Message(BaseModel):
    content: str

from huggingface_hub import login

# The token is taken from the environment of the launching process so the
# secret is never stored in this file.
HF_API_KEY = os.environ.get("HF_API_KEY")
if not HF_API_KEY:
    raise RuntimeError("HF_API_KEY environment variable is not set")

login(HF_API_KEY)

# For loading a PEFT model, we need to use a special object for CausalLM from PEFT
# instead of the regular HuggingFace object.
from peft import AutoPeftModelForCausalLM
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Q = 4 bits
    bnb_4bit_use_double_quant=True,        # double quantization, quantizing the quantization constants for saving an additional 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat Quantization (optimal for normal weights; enforces w ∈ [-1,1])
    bnb_4bit_compute_dtype=torch.bfloat16  # Dequantize to 16-bits before computations (as in the paper)
)

# Load the fine-tuned model
peft_model_path = "./content/fine-tuned-mistral"
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    quantization_config=bnb_config  # Load with 4-bit quantization
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Specify that padding should be added to the right side of the sequences
tokenizer.padding_side = "right"

# Enable attention cache during inference
tuned_model.config.use_cache = True

# Upper bound on generated tokens per request. A budget of 50 cut answers off
# in the middle of the output dictionary.
MAX_NEW_TOKENS = 128

# Define a function to build a prompt from a data example
def format_instruction(sentence, edges):
    return f\"\"\"
Extract relationships (edges) from the given sentences. Each relationship should be a triplet in the format `(Subject, Object, Relation)`, where:

1. **Subject**: The main entity initiating the action or relationship.
2. **Object**: The entity affected by or related to the Subject.
3. **Relation**: The action or relationship connecting the Subject and Object.

Return the results as a list of dictionaries. Each dictionary should have two keys:
- `"sentence"`: The original sentence.
- `"edges"`: A list of triplets representing the extracted edges.


Example:
Input Sentence: "Privacy and data governance ensures prevention of harm"
Output: {{'edges': [['Privacy', 'harm', 'ensures prevention of'], ['data governance', 'harm', 'ensures prevention of']], 'sentence': 'Privacy and data governance ensures prevention of harm'}}

Task:
Input Sentence: "{sentence.strip()}"
Output: {edges}
\"\"\"

# Routes
# Post route
@app.post("/")
def create_item(message: Message):
    try:
        sent = message.content
        ex_inp = format_instruction(sent,"")
        inputs = tokenizer(ex_inp, return_tensors='pt')
        inputs = inputs.to("cuda")
        output_tokens = tuned_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=MAX_NEW_TOKENS,)[0]     # batch of tokens with one sequence
        res = tokenizer.decode(output_tokens, skip_special_tokens=True)

        # NOTE(review): res is the decoded prompt plus the continuation; only
        # occurrences of the input sentence are removed before returning.
        return {"response":res.replace(sent,"")}


    except Exception as e:
        return {"error": str(e), 'message': "An error occurred. Please try again."}

# Get route
# Registered with GET so it no longer duplicates the POST handler on the same path.
@app.get('/')
async def root():
    return {'msg': 'use post'}
"""

# Write the server source as UTF-8: it contains non-ASCII characters and Python
# reads source files as UTF-8 by default.
with open("app.py", 'w', encoding="utf-8") as file:
    # Write the code content to the file
    file.write(app_file_str)

In [None]:
# test working
# Starts the server in the foreground on port 8000, so this cell keeps running
# until it is interrupted; use it only to check that app.py starts cleanly.
!uvicorn app:app --host 0.0.0.0 --port 8000

2024-12-02 12:05:20.787547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-02 12:05:20.820688: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-02 12:05:20.830659: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-02 12:05:20.853180: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`low_cpu_mem_usage` was None, now default to 

In [None]:
# Start the server in the background on port 8000; its output is redirected to
# /content/logs.txt so the cell returns immediately.
!uvicorn app:app --host 0.0.0.0 --port 8000 &>/content/logs.txt &

In [None]:
# Start a localtunnel for port 8000 in the background and print this machine's
# public IPv4 address.
# NOTE(review): the IP is presumably what the tunnel page asks for as its
# password — confirm.
!npx localtunnel --port 8000 & curl ipv4.icanhazip.com

34.16.176.92
[1G[0K⠙[1G[0Kyour url is: https://funny-sloths-suffer.loca.lt
