#### Set-up

In [None]:
# Install Libraries
%%capture
import warnings
warnings.filterwarnings("ignore")
# All LangChain libraries for implementing logic chaining
%pip install -U langchain
%pip install -U langchain_community
%pip install -U langchain-huggingface
%pip install -U langchain_experimental
%pip install -U langchain_openai

%pip install -U unstructured
%pip install -U sentence-transformers

%pip install -U Neo4jGraph
%pip install -U py2neo
%pip install -U spacy
%pip install -U rdflib-neo4j
# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

from spacy import load, displacy
import pandas as pd
from google.colab import userdata
import string

In [None]:
# Connection settings: the user name is fixed, everything else is read from
# the Colab secrets store (same lookup order as before: URI, password, HF key).
NEO4J_USERNAME = "neo4j"
NEO4J_URI, NEO4J_PASSWORD, HF_API_KEY = (
    userdata.get(secret_name)
    for secret_name in ('NEO4J_URI', 'NEO4J_PASSWORD', 'HF_API_KEY')
)

# LangChain wrapper around the Neo4j instance; reused by every query below.
kg = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)



In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Settings for the hosted Hugging Face text-generation endpoint.
endpoint_options = {
    "repo_id": "mistralai/Mistral-7B-Instruct-v0.3",  # model name
    "task": "text-generation",                        # generate a text response
    "max_new_tokens": 150,                            # upper bound on generated tokens
    "do_sample": False,                               # sampling disabled
    "huggingfacehub_api_token": HF_API_KEY,           # Hugging Face API token
}

# LLM handle used by the Cypher QA chains further down.
hf_llm = HuggingFaceEndpoint(**endpoint_options)

**Caution:** run the next cell only if you want to delete the entire contents of the database.

In [None]:
# Wipe the Neo4j database (destructive!).
WIPE_STATEMENTS = (
    "MATCH (n) DETACH DELETE n",                                      # all nodes and relationships
    "CALL apoc.schema.assert({},{},true) YIELD label, key RETURN *",  # all indexes and constraints
)
for wipe_statement in WIPE_STATEMENTS:
    kg.query(wipe_statement)

# Last expression of the cell: the remaining node count, shown as cell output.
kg.query("MATCH (n) RETURN count(n)")

[{'count(n)': 0}]

#### Rule-Based Model

In [None]:
import json

# Dependency labels the rule-based extractor knows how to handle; a sentence
# containing any other label is rejected by has_required_dependencies().
allowed_dependencies = {
    # sentence core
    'ROOT', 'nsubj', 'nsubjpass', 'dobj', 'pobj', 'attr', 'acomp', 'agent',
    # clausal / prepositional complements
    'ccomp', 'xcomp', 'pcomp', 'prep',
    # modifiers merged into a neighbouring label
    'advmod', 'amod', 'aux', 'auxpass', 'compound', 'nmod', 'poss',
    # function words and coordination
    'case', 'cc', 'conj', 'det',
}

def has_required_dependencies(doc, allowed_dependencies):
    """Decide whether a parsed sentence is simple enough for the rule-based extractor.

    Args:
        doc: iterable of tokens exposing ``dep_``, with the raw sentence in
            ``doc.text`` (a spaCy ``Doc`` in this notebook).
        allowed_dependencies: collection of dependency labels the extractor supports.

    Returns:
        ``False`` if any token carries a label outside ``allowed_dependencies``.
        Otherwise ``True`` when the sentence contains the phrase "is a"/"is an",
        or when it has a ROOT, a subject (nsubj/nsubjpass) and an object
        (dobj/pobj); ``False`` in every other case.
    """
    import re  # local import keeps the cell self-contained

    dependencies = {token.dep_ for token in doc}
    if not dependencies.issubset(allowed_dependencies):
        return False

    # Match "is a"/"is an" as whole words. A plain substring test also fires on
    # "This algorithm", "is about", "is also", ... which are not "X is a Y".
    if re.search(r"\bis an?\b", doc.text.lower()):
        return True

    has_subject = not dependencies.isdisjoint({'nsubj', 'nsubjpass'})
    has_object = not dependencies.isdisjoint({'dobj', 'pobj'})
    return 'ROOT' in dependencies and has_subject and has_object

# spaCy pipeline used for the dependency parse of every sentence.
nlp = load("en_core_web_sm")

# Accumulators filled by the extraction loop below.
all_graphs, unhandled_sentences = [], set()

# Raw knowledge text; it is split into one sentence per line further down.
with open("DSA_knowledge.txt", "r") as file:
    sentences = file.read()

for sentence in [s.strip().rstrip(string.punctuation) for s in sentences.strip().split('\n') if s.strip()]:
  doc = nlp(sentence)
  # displacy.render(doc, style="dep", jupyter=True, options={'distance': 90})
  if not has_required_dependencies(doc, allowed_dependencies):
      unhandled_sentences.add(sentence)
      continue

  try:
    temp_graph = {
        "nodes": {},  # {'nodes': {0: {'pos': 0, 'label': 'X', 'dep': 'nsubj'}, 4: {'pos': 4, 'label': 'Y', 'dep': 'pobj'}},
        "edges": [],  # 'edges': [(0, 4, 'is subclass of')]}
        "sentence": sentence
    }

    edge_mapping = {
        'subject_nodes': {},  # {1: {0}} # multiple subject nodes possible
        'object_nodes': {},   # {1: 4}
        'edge_ids': set()     # {1}
    }

    temp_graph["nodes"] = {token['id']: {"pos": token['id'], "label": doc.text, "dep": token['dep']}
                          for token, doc in zip(doc.to_json()['tokens'], doc)}

    temp_graph["edges"] = [(token['head'], token['id'], token['dep'])
                            for token in doc.to_json()['tokens'] if token['head'] != token['id']]

    root_node = list(filter(lambda node: temp_graph["nodes"][node]['dep'] == 'ROOT', temp_graph["nodes"]))[0]
    stopping = False
    while not stopping:
      for edge in sorted(temp_graph["edges"], key=lambda x: abs(x[0] - x[1])):

        source_pos, target_pos, meta = edge

        if source_pos not in temp_graph["nodes"] or target_pos not in temp_graph["nodes"]:
            continue
        #print(edge)
        source_metadata = temp_graph["nodes"][source_pos]
        target_metadata = temp_graph["nodes"][target_pos]
        try:
            match (source_metadata, meta, target_metadata):
                case {'label': s, **source}, 'compound' | 'amod' | 'aux' |'auxpass' | 'advmod', {'label': t, **target}:
                    source_metadata['label'] = f"{t} {s}"
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'agent', {'label': t, **target}:
                    source_metadata['label'] = f"{s} {t}"
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'pobj'), None)
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = next_node
                    temp_graph['edges'].append((source_pos, next_node, 'pobj'))
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'case' | 'cc', {'label': t, **target}:
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'det', {'label': t, **target}:
                    temp_graph['nodes'][source_pos]['det'] = t
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'attr'|'acomp', {'label': 'subclass'|'attribute'|'dimension'|'kind'|'threat'|'result'|'type'|'equal'|'form', **target}: #is(head)--attr--subclass(tail)--prep--of(child)--pobj--Risk
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'prep'), None)
                    obj_node = next((n for src, n, label in temp_graph["edges"] if src == next_node and label == 'pobj'), None)
                    source_metadata['label'] = f"{s} {target_metadata['label']} {temp_graph['nodes'][next_node]['label']}" #is-->issubclassof
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = obj_node
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges'])) # remove edge: is--subclass
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges'])) # remove edge: subclass--of
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == next_node and edge[1] == obj_node), temp_graph['edges'])) # remove edge: of--Y
                    temp_graph['edges'].append((source_pos, obj_node, 'pobj')) #connect edge from 'is' node to obj node
                    del temp_graph['nodes'][target_pos] # remove node: 'subclass'
                    del temp_graph['nodes'][next_node]  # remove node: 'of'
                    continue

                case {'label': s, **source}, 'attr'|'acomp', {'label': t, **target}: #is-attr-Y
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['object_nodes'][source_pos] = target_pos
                    continue

                case {'dep': 'ROOT', 'label': s, **source}, 'prep'|'xcomp', {'label': t, **target}: #attributes(ROOT)--prep--to #helps--xcomp--see--pobj--X
                    source_metadata['label'] = f"{s} {t}"
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj', 'dobj'}), None)
                    if next_node:
                      temp_graph['edges'].append((source_pos, next_node, 'pobj'))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                      del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'prep', {'label': t, **target}: #*-dobj-assessment--prep--of|*-attr-(a)dimension-prep-of
                    if next((n for src, n, label in temp_graph["edges"] if n == source_pos and label == 'attr'), None) is None:
                      next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj'}), None)
                      if next_node: #Date-prep-of-pobj-birth
                        source_metadata['label'] = f"{s} {t} {temp_graph['nodes'][next_node]['label']}"
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                        del temp_graph['nodes'][target_pos]
                        del temp_graph['nodes'][next_node]
                      else:
                        edge_mapping['edge_ids'].add(target_pos)
                        edge_mapping['subject_nodes'].setdefault(target_pos, set()).add(source_pos)
                        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    continue

                case {'label': s, **source}, 'poss', {'label': t, **target}:
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == source_pos and label == 'conj'), None)
                    if next_node:
                      temp_graph['edges'].append((temp_graph["nodes"][next_node]['label'],
                                                  temp_graph["nodes"][target_pos]['label'],
                                                  'of'))
                    continue

                case {'label': s, **source}, 'nmod', {'label': t, **target}:
                    source_metadata['label'] = f"{t} {s}"
                    incoming_node = next((src for src, n, label in temp_graph["edges"] if target == source_pos and label == 'nsubj'), None)
                    if 'conj' in target:
                        target['conj']['nodeId'] = target['conj']['text'] + f" {s}"
                        edge_mapping['subject_nodes'][incoming_node].add(target['conj']['nodeId'])
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'conj', {'label': t, **target}:
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    temp_graph['nodes'][source_pos]['conj'] = {'text': t, 'nodeId': target_pos}
                    continue

                case {'label': s, **source}, 'pcomp', {'label': t, **target}: #in--pcomp--explaining--dobj--x
                    temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label in {'pobj', 'dobj'}), None)
                    if next_node:
                      temp_graph['nodes'][root_node]['label'] += f" {t}"  #if not work f" {temp_graph['nodes'][root_node]['label']} {t}
                      temp_graph['edges'].append((root_node, next_node, 'dobj'))
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == target_pos and edge[1] == next_node), temp_graph['edges']))
                      del temp_graph['nodes'][source_pos]
                      del temp_graph['nodes'][target_pos]
                    continue

                case {'label': s, **source}, 'ccomp', {'label': t, **target}: #Design interface can help users understand AI decisions
                    next_node = next((n for src, n, label in temp_graph["edges"] if src == target_pos and label == 'nsubj'), None)
                    if next_node:
                      edge_mapping['object_nodes'][source_pos] = next_node
                    continue

                case {'label': s, **source}, 'nsubj' | 'nsubjpass', {'label': t, **target}:
                    edge_mapping['edge_ids'].add(source_pos)
                    edge_mapping['subject_nodes'].setdefault(source_pos, set()).add(target_pos)
                    if 'conj' in target:
                      edge_mapping['subject_nodes'][source_pos].add(target_metadata['nodeId'])
                    continue

                case {'label': s, **source}, 'dobj' | 'pobj', {'label': t, **target}:
                    if next((src for src, n, label in temp_graph["edges"] if src == source_pos and label == 'prep'), None):
                      source_metadata['label'] = f"{s} {t}"
                      temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == source_pos and edge[1] == target_pos), temp_graph['edges']))
                      del temp_graph['nodes'][target_pos]
                    #assign object outside loop
                    continue

                case another:
                    print("another:", edge)
                    unhandled_sentences.add(sentence)
                    stopping = True
                    continue

        except Exception as e:
                print(f"Error occurred in sentence: {sentence} with edge: {edge}, error: {e}")
                unhandled_sentences.add(sentence) # throw error
                stopping = True
                continue
      else:
          break

    # Update object nodes
    edge_mapping['object_nodes'].update({
        edge_id: next((tail for head, tail, meta in temp_graph['edges']
                      if meta in {'dobj', 'pobj'}), None)
        for edge_id in edge_mapping['edge_ids']
        if edge_id not in edge_mapping['object_nodes']
    })

    for edge_id, obj_node in edge_mapping['object_nodes'].items():
        if obj_node is None:
            print(f"Missing object node for edge ID: {edge_id}")

    # create final mapping
    for edge_id in edge_mapping['edge_ids']:
        subject_nodes = edge_mapping['subject_nodes'][edge_id]
        object_node = edge_mapping['object_nodes'][edge_id]
        edge_node = temp_graph['nodes'][edge_id]
        for subject_node in subject_nodes:
            #temp_graph['edges'].append((subject_node, object_node, edge_node['label']))
            temp_graph['edges'].append((temp_graph["nodes"][subject_node]['label'],
                                        temp_graph["nodes"][object_node]['label'],
                                        edge_node['label']))
            temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == edge_id and edge[1] == subject_node), temp_graph['edges']))

        temp_graph['edges'] = list(filter(lambda edge: not (edge[0] == edge_id and edge[1] == object_node), temp_graph['edges']))
        del temp_graph['nodes'][edge_id]

    temp_graph['edges'] = list(set(temp_graph['edges'])-set([edge for edge in temp_graph['edges'] if edge[2] in allowed_dependencies]))

    # all_graphs.append(temp_graph) # we don't need nodes to be pat of json, as edges have them
    all_graphs.append({
      "edges": temp_graph["edges"],
      "sentence": temp_graph["sentence"]
    })

    for source, target, edge in temp_graph['edges']:
        kg.query(
          """
          CALL apoc.merge.node([$source_label], {label:$source_label}) YIELD node AS s
          CALL apoc.merge.node([$target_label], {label:$target_label}) YIELD node AS t
          CALL apoc.merge.relationship(s, $edge_label, {sentence: $sentence}, {}, t, {}) YIELD rel
          RETURN s,t,rel
          """
          , params={
          'source_label': temp_graph['nodes'][source]['label'].replace(" ", "_"),
          'target_label': temp_graph['nodes'][target]['label'].replace(" ", "_"),
          'edge_label': edge.replace(" ", "_"),
          'sentence': sentence
        })

  except Exception as e:
    print(f"Failed to process sentence: {sentence}, error: {e}")
    unhandled_sentences.add(sentence)


# Sentences the rule-based extractor could not handle, one per line.
with open("unhandled_sentences.txt", "w") as file:
  file.writelines(unhandled + "\n" for unhandled in unhandled_sentences)

# Extracted triples per sentence; read back later as fine-tuning data.
with open('graph_data.json', 'w') as json_file:
    json.dump(all_graphs, json_file, ensure_ascii=False, indent=4)

In [None]:
import json
# load JSON file
def load_json(path):
  """Read the file at ``path`` and return its parsed JSON content."""
  with open(path, 'r') as handle:
    parsed = json.load(handle)
  return parsed

import textwrap
def clean_print(text):
    """Print ``text`` wrapped to at most 80 characters per line; returns ``None``."""
    wrapped_lines = textwrap.wrap(text, width=80)
    return print("\n".join(wrapped_lines))

##### Query graph database

In [None]:
# Few-shot prompt for the Cypher-generation step. It has two placeholders,
# {schema} and {question}, and ends with a cue line so that the model's
# completion is the Cypher statement itself.
CYPHER_GENERATION_TEMPLATE = """Task:
Generate Cypher statement to query a Neo4j graph database.

Instructions:
* Only use the provided relationship types, node labels, and properties in the schema.
* Do not use any other relationship types, properties, or node labels that are not provided.
* Always follow the correct relationship direction.
* Ensure that the query follows the correct Cypher syntax.

Schema:
{schema}

Examples:
Here are a few examples of generated Cypher statements for particular questions:

# What is Bias?
    MATCH (s:Bias)-[r]-(t)
    RETURN s,r,t

# What might introduce Bias?
    MATCH (s)-[r:might_introduce]->(t:Bias)
    RETURN s,r,t

The question is:
{question}

The generated Cypher statement:"""

# Define schema
# NOTE(review): `schema` is not referenced by any other visible cell; the chain
# presumably fills {schema} from the connected graph — confirm before relying
# on this hand-written description.
schema = """Node: Risk
Properties: label
Relationships: (Bias)-[:is]->(Risk)
"""

In [None]:
# Build the Cypher-generation prompt and the graph question-answering chain.
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain

CYPHER_GENERATION_PROMPT = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

# Initialize the chain: the same LLM writes the Cypher and phrases the answer.
chain_options = dict(
    graph=kg,
    llm=hf_llm,
    cypher_prompt=CYPHER_GENERATION_PROMPT,   # cypher generation prompt
    verbose=True,                             # print the intermediate steps
    allow_dangerous_requests=True,
)
cypherChain = GraphCypherQAChain.from_llm(**chain_options)


In [None]:
# Smoke-test the chain with a simple "What is ...?" question.
question = "What is Requirement?"
res = cypherChain.run(question)
for caption, content in (('Question: ', question), ('Response: ', res)):
    clean_print(caption + content)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
MATCH (s:Requirement)-[r]-(t)
RETURN s,r,t
[0m
Full Context:
[32;1m[1;3m[{'s': {'label': 'Requirement'}, 'r': ({'label': 'Fairness'}, 'is', {'label': 'Requirement'}), 't': {'label': 'Fairness'}}, {'s': {'label': 'Requirement'}, 'r': ({'label': 'Explainability'}, 'is', {'label': 'Requirement'}), 't': {'label': 'Explainability'}}][0m

[1m> Finished chain.[0m
Question: What is Requirement?
Response:  Fairness and Explainability are Requirements.


In [None]:
# Same check for a node that has several kinds of relationships.
question = "What is Bias?"
res = cypherChain.run(question)
for caption, content in (('Question: ', question), ('Response: ', res)):
    clean_print(caption + content)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
MATCH (s:Bias)-[r]-(t)
RETURN s,r,t
[0m
Full Context:
[32;1m[1;3m[{'s': {'label': 'Bias'}, 'r': ({'label': 'Bias'}, 'is', {'label': 'Risk'}), 't': {'label': 'Risk'}}, {'s': {'label': 'Bias'}, 'r': ({'label': 'Algorithmic_Bias'}, 'is_subclass_of', {'label': 'Bias'}), 't': {'label': 'Algorithmic_Bias'}}, {'s': {'label': 'Bias'}, 'r': ({'label': 'Historical_Bias'}, 'is_subclass_of', {'label': 'Bias'}), 't': {'label': 'Historical_Bias'}}, {'s': {'label': 'Bias'}, 'r': ({'label': 'TrainTestSplit'}, 'might_introduce', {'label': 'Bias'}), 't': {'label': 'TrainTestSplit'}}, {'s': {'label': 'Bias'}, 'r': ({'label': 'Bias'}, 'is_threat_to', {'label': 'Fairness'}), 't': {'label': 'Fairness'}}][0m

[1m> Finished chain.[0m
Question: What is Bias?
Response:  Bias is a type of risk. It can be algorithmic bias, historical bias,
or it can be introduced by the train-test split. Bias is a threat to fairness.


In [None]:
# Prompt that asks for one fixed query pattern with the node label filled in.
# The previous wording ended right after the query and never asked for a
# Cypher-only answer; in the recorded run the model replied with prose
# ("Note: This Cypher query ...") and Neo4j rejected it with CypherSyntaxError.
# The prompt now states the output format explicitly and ends with the same
# cue line as the first template in this notebook.
CYPHER_GENERATION_TEMPLATE = """Task:
Generate Cypher statement to query a Neo4j graph database.

Instructions:
* Use only the provided node label: {query}.
* Traverse from this node and check if it is connected to a node labeled 'Risk'.
* Find all nodes connected by the 'is_subclass_of' relationship and return them, excluding the original node.
* Respond with the Cypher statement only: no explanations, notes, examples or markdown fences.

Use exactly this statement:
MATCH (tts:{query})-[*]-(risk:Risk)
WHERE risk IS NOT NULL
WITH tts
MATCH (tts)-[:is_subclass_of]->(parent)
MATCH (source)-[:is_subclass_of]->(parent)
WHERE source <> tts
RETURN source

The generated Cypher statement:"""

from langchain.prompts.prompt import PromptTemplate

# {query} is the only placeholder; it receives the node label passed to run().
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["query"],
    template=CYPHER_GENERATION_TEMPLATE
)

node_name = "TrainTestSplit"

cypherChain = GraphCypherQAChain.from_llm(
    graph=kg,
    llm=hf_llm,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    verbose=True,
    allow_dangerous_requests=True
)

# Run the query and get the response
res = cypherChain.run({"query": node_name})

# Display the response
clean_print(f'Question: {node_name}')
clean_print(f'Response: {res}')



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m
Note: This Cypher query traverses the graph from the TrainTestSplit node, checks if it is connected to a Risk node, and then finds all nodes connected by the is_subclass_of relationship, excluding the original TrainTestSplit node. It returns those found nodes.

For a more specific example, let's assume we have a graph with the following nodes and relationships:

```
(a:TrainTestSplit {name: 'TrainSet'})
(b:TrainTestSplit {name: 'TestSet'})
(c:Risk {name: 'HighRisk'})
(d:TrainTestSplit {name: 'Train[0m


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input 'Note': expected 'FOREACH', 'ALTER', 'ORDER BY', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'GRANT', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'SKIP', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 2, column 1 (offset: 1))
"Note: This Cypher query traverses the graph from the TrainTestSplit node, checks if it is connected to a Risk node, and then finds all nodes connected by the is_subclass_of relationship, excluding the original TrainTestSplit node. It returns those found nodes."
 ^}

#### Fine-tune Mistral-7B with LoRA

In [None]:
# Install Libraries
!pip install -q streamlit                                                                             # For Deploying apps
!pip install -q transformers>=4.32.0 datasets evaluate                                                # Comes from HuggingFace
!pip install -q auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/   # Use cu117 if on CUDA 11.7
!pip install -q optimum                                                                               # For GPTQ Optimization
!pip install -q -U bitsandbytes                                                                       # For quantization
!pip install -q -U peft                                                                               # Parameter-efficient Fine-tuning
!pip install -q -U accelerate                                                                         # Loading models across GPUs/CPU/disk
!pip install -q trl==0.7.1

##### Loading the Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Base checkpoint used for both the half-precision and the quantized load.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.3"

# Half-precision (fp16) copy of the model, placed automatically across the
# available devices.
# NOTE(review): `model` is re-assigned by the quantized load in a later cell;
# confirm this full-precision copy is actually needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    revision="main",
    token=HF_API_KEY)

# The tokenizer is fetched from the same repository, so it gets the same
# access token as the model load above (it was previously omitted here).
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, token=HF_API_KEY)

In [None]:
import torch                                # Deep Learning Framework
import time                                 # Measure inference time
import pandas as pd                         # For table dataset structure
import numpy as np                          # Numerical operations on the CPU
from datasets import load_dataset           # Loading the dataseet
import random

##### Loading the Quantized Model

In [None]:
from transformers import BitsAndBytesConfig

# 1. Setup the quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Q = 4 bits
    bnb_4bit_use_double_quant=True,        # double quantization: also quantize the quantization constants (saves an additional ~0.4 bits per parameter)
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat quantization (optimal for normally distributed weights; enforces w ∈ [-1,1])
    bnb_4bit_compute_dtype=torch.bfloat16  # dequantize to 16 bits before computations (as in the paper)
)
# 2. Pass it while loading the model; the access token is supplied here as
#    well, matching the fp16 load of the same repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    token=HF_API_KEY)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

##### Loading the Tokenizer

In [None]:
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
# NOTE(review): presumably needed because the tokenizer ships without a
# dedicated pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Quick sanity check that the quantized model generates text.
# NOTE(review): the prompt addresses "Llama" although the loaded checkpoint is Mistral.
prompt = """
How are you, Llama! Tell me about yourself.
"""

# Move the encoded prompt to the model's device and pass the attention mask
# along with the input ids (previously only CPU-resident input ids were given).
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
output_tokens = model.generate(**inputs, max_new_tokens=100)[0]     # batch of tokens with one sequence
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
print(output)

In [None]:
from datasets import Dataset
import json

# Prompt layout used for training; it is the same text the inference cells of
# this notebook build around a sentence, so training and inference agree.
SFT_PROMPT_TEMPLATE = """
Extract the nodes and relationships from the following sentence.

### Input:
"{sentence}"

### Nodes and Relationships:
"""
# Token budget for prompt + answer together (previously 256 for each part).
MAX_SEQ_TOKENS = 512

# Load your JSON file
with open('graph_data.json', 'r') as f:
    graph_data = json.load(f)

# Prepare the dataset (input/output pairs) from your JSON data
# Assuming the JSON file has the 'sentence' and 'edges' structure

train_data_dict = {"input": [], "output": []}  # Initialize an empty dictionary with the desired columns
for graph in graph_data:
    train_data_dict["input"].append(graph.get("sentence", ""))
    train_data_dict["output"].append(f"Edges: {graph['edges']}")

# Convert the dictionary into a Hugging Face Dataset
dataset = Dataset.from_dict(train_data_dict)

# Seeded 80/20 split so the same examples end up in each part on every run.
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = splits["train"], splits["test"]

# Preprocessing function for dataset
def preprocess_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Each example becomes ONE sequence: prompt (with the sentence) + expected
    answer + EOS. Tokenizing the sentence and the answer separately, as before,
    yields `labels` that do not line up token-for-token with `input_ids`, which
    a causal language-modelling loss requires.

    Args:
        examples: batch dict with the string columns "input" and "output".

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`) plus `labels`,
        a copy of `input_ids` with padded positions set to -100 so that they
        are ignored by the loss.
    """
    texts = [
        SFT_PROMPT_TEMPLATE.format(sentence=sentence) + answer + tokenizer.eos_token
        for sentence, answer in zip(examples["input"], examples["output"])
    ]

    model_inputs = tokenizer(texts, truncation=True, padding=True, max_length=MAX_SEQ_TOKENS)

    # Mask by attention mask rather than by token id: pad and EOS share an id.
    model_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Preprocess the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Training dataset size: 206
Validation dataset size: 52


In [None]:
# Inspect the first preprocessed training example (`train_data` was never
# defined; the split created above is called `train_dataset`).
print(train_dataset[0])

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# LoRA hyper-parameters: adapters on the four attention projections.
lora_settings = dict(
    task_type="CAUSAL_LM",  # LoRA fine-tuning for causal language modeling task
    r=16,                   # rank of the low-rank matrices
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
lora_config = LoraConfig(**lora_settings)

# Prepare the quantized model for k-bit training, then attach the adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
#print(model)
#print(model)

##### Train the Model

In [None]:
from transformers import TrainingArguments

# Hyper-parameters of the fine-tuning run.
training_arguments = TrainingArguments(
    fp16=True,                           # Mixed-precision training in fp16. NOTE(review): the quantization config uses bfloat16 as compute dtype — confirm the mix is intended
    # batch-related
    per_device_train_batch_size=2,       # Batch size per device
    gradient_accumulation_steps=4,       # Accumulate 4 batches per optimizer step (effective batch size 2 x 4 = 8 per device)
    # optimizer-related
    optim="paged_adamw_32bit",           # Paged AdamW with 32-bit optimizer states
    learning_rate=1e-4,                  # Peak learning rate
    warmup_ratio=0.05,                   # Learning rate rises linearly from 0 to 1e-4 over the first 5% of training steps
    lr_scheduler_type="cosine",          # Cosine decay of the learning rate after warm-up
    max_grad_norm=0.3,                   # Clip the gradient norm when it exceeds 0.3 (prevents gradient explosion)
    # epochs and saving
    num_train_epochs=2,                  # Number of epochs
    save_strategy="epoch",               # Save a checkpoint after each epoch
    output_dir="epoch-finetuned",        # Where to save the checkpoints
    # validation
    evaluation_strategy="steps",         # Evaluate every `eval_steps`. NOTE(review): newer transformers releases call this `eval_strategy` — confirm against the installed version
    eval_steps=0.2,                      # A float below 1 is a ratio: evaluate after every 20% of the training steps
    # logging-related
    logging_steps=1,                     # Number of update steps between two logs
    group_by_length=True,                # Minimize padding by grouping sentences of similar length
    seed=42,                             # For consistent results
)
model.gradient_checkpointing_enable()    # Store fewer activations and recompute them during the backward pass
model.config.use_cache = False           # Disable the attention output cache during training; should be re-enabled for inference

In [None]:
pip install --upgrade trl

Collecting trl
  Downloading trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.11.4-py3-none-any.whl (316 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m316.6/316.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.8.14-py3-none-any.whl (109 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m109.8/109.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
  Attempting uninstall: trl
    Found existing installation: trl 0.7.1
    Uninstalling trl-0.7.1:
      Succe

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning trainer wired to the datasets, LoRA settings and
# training arguments defined in the cells above.
# NOTE(review): `model` was already wrapped with get_peft_model() earlier and
# `peft_config` is passed again here — confirm the adapters are not applied twice.
trainer = SFTTrainer(
    # tokenizer and model
    tokenizer=tokenizer,
    model=model,
    train_dataset=train_dataset,         # training split, already tokenized by preprocess_function
    eval_dataset=val_dataset,            # validation split used by the periodic evaluation
    peft_config=lora_config,
    max_seq_length=1024,                 # upper bound on the sequence length in tokens
    args=training_arguments,
)

# Run the fine-tuning loop.
trainer.train()

##### Save Trained Model

In [None]:
import os

# Target folder below the user's home directory; created if it does not exist.
peft_model_path =os.path.expanduser("~/Downloads/fine-tuned-mistral")
os.makedirs(peft_model_path, exist_ok=True)
# Save the trained model and the tokenizer side by side so that both can be
# reloaded from the same path.
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig

# Reload the fine-tuned adapter on top of a 4-bit base model. The bare
# `load_in_4bit=True` keyword is deprecated (see the warning recorded below
# this cell); the same setting is now passed through `quantization_config`.
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

##### Inference

In [None]:
# Example single sentence for the model to process
sentence = "X is subclass of Y."

prompt = f"""
Extract the nodes and relationships from the following sentence.

### Input:
"{sentence}"

### Nodes and Relationships:
"""

# Encode the prompt on the model's device; the attention mask is passed along.
encoded = tokenizer(prompt, return_tensors='pt', truncation=True).to(tuned_model.device)
outputs = tuned_model.generate(**encoded, max_new_tokens=150)

# Keep only the newly generated tokens. Cutting the decoded text after
# len(prompt) characters is unreliable because decoding does not necessarily
# reproduce the prompt character for character.
prompt_length = encoded["input_ids"].shape[1]
output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Print the generated nodes and relationships
print(f'TRAINED MODEL GENERATED RESULT :\n{output}')

In [None]:
# Example single sentence for the model to process
sentence = "Re-weighting mitigates Algorithmic Bias"

prompt = f"""
Extract the nodes and relationships from the following sentence.

### Input:
"{sentence}"

### Nodes and Relationships:
"""

# Encode the prompt on the model's device; the attention mask is passed along.
encoded = tokenizer(prompt, return_tensors='pt', truncation=True).to(tuned_model.device)
outputs = tuned_model.generate(**encoded, max_new_tokens=150)

# Keep only the newly generated tokens. Cutting the decoded text after
# len(prompt) characters is unreliable because decoding does not necessarily
# reproduce the prompt character for character.
prompt_length = encoded["input_ids"].shape[1]
output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Print the generated nodes and relationships
print(f'TRAINED MODEL GENERATED RESULT :\n{output}')

In [None]:
# Example single sentence for the model to process
sentence = "Societal and environmental well-being Includes sustainability"

prompt = f"""
Extract the nodes and relationships from the following sentence.

### Input:
"{sentence}"

### Nodes and Relationships:
"""

# Encode the prompt on the model's device; the attention mask is passed along.
encoded = tokenizer(prompt, return_tensors='pt', truncation=True).to(tuned_model.device)
outputs = tuned_model.generate(**encoded, max_new_tokens=150)

# Keep only the newly generated tokens. Cutting the decoded text after
# len(prompt) characters is unreliable because decoding does not necessarily
# reproduce the prompt character for character.
prompt_length = encoded["input_ids"].shape[1]
output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Print the generated nodes and relationships
print(f'TRAINED MODEL GENERATED RESULT :\n{output}')

In [None]:
# Example single sentence for the model to process
# (the apostrophe was mis-encoded in the stored sentence and is restored here)
sentence = "AI systems’ resource usage and energy consumption need to be monitored"

prompt = f"""
Extract the nodes and relationships from the following sentence.

### Input:
"{sentence}"

### Nodes and Relationships:
"""

# Encode the prompt on the model's device; the attention mask is passed along.
encoded = tokenizer(prompt, return_tensors='pt', truncation=True).to(tuned_model.device)
outputs = tuned_model.generate(**encoded, max_new_tokens=150)

# Keep only the newly generated tokens. Cutting the decoded text after
# len(prompt) characters is unreliable because decoding does not necessarily
# reproduce the prompt character for character.
prompt_length = encoded["input_ids"].shape[1]
output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Print the generated nodes and relationships
print(f'TRAINED MODEL GENERATED RESULT :\n{output}')