# Graph extraction

#### Extract vertices

По описанию на естественном языке должны извлекаться вершины.

In [1]:
!pip install openai langchain langchain_openai -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.2/325.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m408.0/408.0 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import userdata
from langchain_openai import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate, HumanMessagePromptTemplate


# LangChain chat-model handle that talks to the proxyapi.ru endpoint
# (presumably OpenAI-compatible, judging by the URL path); the API key is
# read from the Colab secret store via `userdata`.
# NOTE(review): `chat` is not referenced by any later cell visible here —
# confirm it is still needed.
chat = ChatOpenAI(
    model='gpt-4o-mini',
    base_url="https://api.proxyapi.ru/openai/v1",
    api_key=userdata.get('proxyai_api_key'),
)

In [32]:
from openai import OpenAI, AsyncOpenAI
from typing import List
from pydantic import BaseModel
import json

def get_messages_for_node_extraction(descr: str) -> list[dict[str, str]]:
    """Build the chat messages that ask the model to list the nodes of a DAG.

    The prompt consists of a system message describing the role and a single
    user message that contains the task, the expected output format, one
    few-shot example and finally the caller's description.

    Args:
        descr: Natural-language description of the graph.

    Returns:
        A two-element list of ``{'role': ..., 'content': ...}`` dicts
        (system message first, user message second).
    """
    system_message = '''
            ###ROLE###
            You are Graphical model scientist.
            Your task is to extract information about DAG from text description.
            The structure, or topology, of the network should capture qualitative relationships between variables. In particular, two nodes should be connected directly if one affects or causes the other, with the arc indicating the direction of the effect.
            The presence of arrows or arcs seems to imply, at an intuitive level, that for each arc one variable should be interpreted as a cause and the other as an effect (e.g., A →E means that A causes E). This interpretation is called causal.

            For every right answer I give you 5$.
        '''

    # The few-shot dialogue is embedded in one user message; the trailing
    # "Assistant:" cue is where the model is expected to continue.
    user_message = f'''
        ###TASK###
        Extract all node names of DAG from the description, the user gave you.

        ###OUTPUT FORMAT###
        Your output should be list of strings

        ###

        User: Imagine a garden where the growth of plants depends on several factors. The amount of Watering affects how well the plants grow. Sunlight is another crucial factor, as it provides energy for photosynthesis. Fertilizer also plays a role by supplying essential nutrients. Together, these factors influence Plant Growth. Additionally, Watering can impact the effectiveness of Fertilizer, as nutrients are better absorbed when the soil is moist.
        Assistant: ['Watering', 'Sunlight', 'Fertilizer', 'Plant Growth']

        ###

        User: {descr}
        Assistant:
    '''

    return [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message},
    ]

# Structured-output schema passed as `response_format` by extract_nodes_gpt.
# (Plain comments are used instead of a docstring so that nothing extra ends
# up in the schema derived from this model.)
class Nodes(BaseModel):
    # Node names found in the description, one string per node.
    list_of_nodes: List[str]

def extract_nodes_gpt(descr, gpt_client, gpt_model='gpt-4o-mini', temperature=0):
    """Ask the chat model for the node names mentioned in ``descr``.

    The request is sent through ``gpt_client.beta.chat.completions.parse``
    with ``Nodes`` as the response format. The JSON text of the first choice
    is decoded and its ``list_of_nodes`` entry is returned.
    """
    request_kwargs = {
        'model': gpt_model,
        'messages': get_messages_for_node_extraction(descr),
        'response_format': Nodes,
        'temperature': temperature,
    }
    response = gpt_client.beta.chat.completions.parse(**request_kwargs)

    raw_reply = response.choices[0].message.content
    payload = json.loads(raw_reply)
    return payload['list_of_nodes']

# Synchronous OpenAI SDK client pointed at the proxyapi.ru endpoint; the API
# key is read from the Colab secret store via `userdata`.
client = OpenAI(
    base_url='https://api.proxyapi.ru/openai/v1',
    api_key=userdata.get('proxyai_api_key'),
)

In [33]:
# Toy description used for manual checks. The text is lower-cased before use,
# so the capitalised node names below are not passed to the model as written.
example = '''Think about a classroom where student learning is shaped by different factors. The amount of Time Spent Studying directly influences Knowledge Acquisition. Teacher Quality also affects how well students understand the material. Classroom Environment, such as noise levels and seating arrangements, can impact both Teacher Quality and Knowledge Acquisition. Altogether, these elements contribute to a student's overall Learning Outcome.'''.lower()

In [36]:
extract_nodes_gpt(example, client, gpt_model='gpt-4o-mini', temperature=0)

['Time Spent Studying',
 'Knowledge Acquisition',
 'Teacher Quality',
 'Classroom Environment',
 'Learning Outcome']

In [37]:
extract_nodes_gpt(example, client, gpt_model='gpt-4o')

['Studying',
 'Knowledge Acquisition',
 'Teacher Quality',
 'Classroom Environment',
 'Learning Outcome']

In [29]:
import pandas as pd
# Folder that holds the dataset (presumably on a mounted Google Drive).
path_to_save = '/content/drive/MyDrive/BMM_2024_GMG'
# Read the training graphs and discard the 'Unnamed: 0' column found in the CSV.
graphs_with_description = (
    pd.read_csv(f'{path_to_save}/Training_graphs.csv', index_col=None)
    .drop(columns=['Unnamed: 0'])
)

In [30]:
graphs_with_description

Unnamed: 0,nodes,edges,descr,node_distrs,paper_link,position_in_paper
0,"['asia', 'tub', 'smoke', 'lung', 'bronc', 'eit...","[('asia', 'tub'), ('tub', 'either'), ('smoke',...",Shortness-of-breath (dyspnoea) may be due to t...,"{'asia': 'Binary', 'smoke': 'Binary', 'bronc':...",https://www.eecis.udel.edu/~shatkay/Course/pap...,page 9
1,"['Pollution', 'Smoker', 'Cancer', 'Xray', 'Dys...","[('Pollution', 'Cancer'), ('Smoker', 'Cancer')...","So, in our medical diagnosis example, we might...","{'Pollution': 'Binary', 'Smoker': 'Binary', 'C...",http://repo.darmajaya.ac.id/5277/1/Bayesian%20...,"page 28, section 2.2.2"
2,"['Burglary', 'Earthquake', 'Alarm', 'JohnCalls...","[('Burglary', 'Alarm'), ('Earthquake', 'Alarm'...",You have a new burglar alarm installed. It rel...,"{'Burglary': 'Binary', 'Earthquake': 'Binary',...",http://repo.darmajaya.ac.id/5277/1/Bayesian%20...,"page 70, section 2.5.1"
3,"['A', 'S', 'E', 'O', 'R', 'T']","[('A', 'E'), ('S', 'E'), ('E', 'O'), ('E', 'R'...","\nIn our current example we will examine, for ...","{'A': 'Multinomial', 'S': 'Binary', 'E': 'Bina...",https://www.taylorfrancis.com/books/mono/10.12...,"page 17, section 1.1"


In [41]:
# Run node extraction on every description (one extract_nodes_gpt call per
# row, issued one after another by `apply`) and store the result in a new
# column alongside the reference `nodes` column.
graphs_with_description['extracted_nodes'] = graphs_with_description['descr'].apply(lambda x: extract_nodes_gpt(x, client, gpt_model='gpt-4o', temperature=0))

In [40]:
# gpt-4o-mini
graphs_with_description[['nodes', 'extracted_nodes']]

Unnamed: 0,nodes,extracted_nodes
0,"['asia', 'tub', 'smoke', 'lung', 'bronc', 'eit...","[Shortness-of-breath, tuberculosis, lung cance..."
1,"['Pollution', 'Smoker', 'Cancer', 'Xray', 'Dys...","[Pollution, Smoking, Cancer, Dyspnoea, XRay]"
2,"['Burglary', 'Earthquake', 'Alarm', 'JohnCalls...","[Burglar Alarm, Burglary, Earthquakes, John, M..."
3,"['A', 'S', 'E', 'O', 'R', 'T']","[Age, Sex, Education, Occupation, Residence, T..."


In [42]:
# gpt-4o
graphs_with_description[['nodes', 'extracted_nodes']]

Unnamed: 0,nodes,extracted_nodes
0,"['asia', 'tub', 'smoke', 'lung', 'bronc', 'eit...","[Shortness-of-breath, Tuberculosis, Lung Cance..."
1,"['Pollution', 'Smoker', 'Cancer', 'Xray', 'Dys...","[Pollution, Smoker, Cancer, Dyspnoea, XRay]"
2,"['Burglary', 'Earthquake', 'Alarm', 'JohnCalls...","[Burglar Alarm, Burglary, Earthquake, John Cal..."
3,"['A', 'S', 'E', 'O', 'R', 'T']","[Age, Sex, Education, Occupation, Residence, T..."


#### Extract vertex dependencies

На вход дается описание графа на естественном языке и две вершины. Нужно по описанию и именам двух вершин сказать, есть ли между ними ребро, и если есть — то в какую сторону. O(n^2) без памяти.


Работает не очень хорошо.

In [80]:
from openai import OpenAI, AsyncOpenAI

# Synchronous client for one-off requests; same endpoint and Colab-stored key
# as the asynchronous client below.
client = OpenAI(
    base_url='https://api.proxyapi.ru/openai/v1',
    api_key=userdata.get('proxyai_api_key'),
)

# Asynchronous counterpart, used by the `async_*` helpers and awaited directly
# in notebook cells.
async_client = AsyncOpenAI(
    base_url='https://api.proxyapi.ru/openai/v1',
    api_key=userdata.get('proxyai_api_key'),
)

In [69]:
from enum import Enum

# Closed set of answers the model may give for an ordered pair of nodes
# (left, right). Comments are used instead of a docstring so that nothing
# extra ends up in the schema derived for the API request.
class ArrowEnum(str, Enum):
    no = "no arrow"  # no direct causal link between the two nodes
    forward = "forward arrow"  # the left node may cause the right node
    backward = "backward arrow"  # the right node may cause the left node

# Structured-output schema passed as `response_format` by the edge-direction
# helpers; the reply is read back through its 'arrow_type' key.
class ArrowType(BaseModel):
    # Direction verdict for the queried pair of nodes.
    arrow_type: ArrowEnum

In [122]:
def _format_name_group(names, opening, closing):
    # Render names as "<opening>a, b<closing>"; a plain string is treated as
    # already formatted by the caller and is passed through untouched.
    if isinstance(names, str):
        return names
    return f"{opening}{', '.join(map(str, names))}{closing}"


def get_messages_for_edge_direction(descr: str, set_of_nodes: "str | list[str]", pair_of_nodes: "str | tuple[str, str]") -> list[dict[str, str]]:
    """Build the chat messages that ask for the edge direction of one node pair.

    The prompt consists of a system message describing the role and a single
    user message with the task, the output format, three few-shot examples
    (forward / backward / no) and the caller's query.

    Args:
        descr: Natural-language description of the graph.
        set_of_nodes: All node names. Either a string that is already
            formatted (e.g. ``"[A, B, C]"``) or a sequence of names, which is
            rendered as ``[A, B, C]`` to match the few-shot examples.
        pair_of_nodes: The ordered pair (left, right) to classify. Either a
            pre-formatted string (e.g. ``"(A, B)"``) or a sequence of two
            names, which is rendered as ``(A, B)``.

    Returns:
        A two-element list of ``{'role': ..., 'content': ...}`` dicts
        (system message first, user message second).
    """
    nodes_text = _format_name_group(set_of_nodes, '[', ']')
    pair_text = _format_name_group(pair_of_nodes, '(', ')')

    system_message = '''
            ###ROLE###
            You are Graphical model scientist.
            Your task is to extract information about DAG from text description.
            The structure, or topology, of the network should capture qualitative relationships between variables. In particular, two nodes should be connected directly if one affects or causes the other, with the arc indicating the direction of the effect.
            The presence of arrows or arcs seems to imply, at an intuitive level, that for each arc one variable should be interpreted as a cause and the other as an effect (e.g., A →E means that A causes E). This interpretation is called causal.

            For every right answer I give you 5$.
        '''

    # The few-shot dialogue is embedded in one user message; the trailing
    # "Assistant:" cue is where the model is expected to continue.
    user_message = f'''
        ###TASK###
        You are given a DAG description, a set of its nodes and pair of nodes.
        You should infer from the description and your own knowledge the type of causality between two given nodes:
            - forward: the left may be the cause of the right
            - backward: the right may be the cause of the left
            - no: no direct causality

        ###OUTPUT FORMAT###
        Your output should be "forward", "backward" or "no".

        ###

        User:
            #DESCRIPTION#: Imagine a garden where the growth of plants depends on several factors. The amount of Watering affects how well the plants grow. Sunlight is another crucial factor, as it provides energy for photosynthesis. Fertilizer also plays a role by supplying essential nutrients. Together, these factors influence Plant Growth. Additionally, Watering can impact the effectiveness of Fertilizer, as nutrients are better absorbed when the soil is moist.
            #SET OF NODES#: [Watering, Sunlight, Fertilizer, Plant Growth]
            #PAIR OF NODES#: (Sunlight, Plant Growth)

        Assistant: forward

        ###

        User:
            #DESCRIPTION#: Imagine a garden where the growth of plants depends on several factors. The amount of Watering affects how well the plants grow. Sunlight is another crucial factor, as it provides energy for photosynthesis. Fertilizer also plays a role by supplying essential nutrients. Together, these factors influence Plant Growth. Additionally, Watering can impact the effectiveness of Fertilizer, as nutrients are better absorbed when the soil is moist.
            #SET OF NODES#: [Watering, Sunlight, Fertilizer, Plant Growth]
            #PAIR OF NODES#: (Fertilizer, Watering)

        Assistant: backward

        ###

        User:
            #DESCRIPTION#: Imagine a garden where the growth of plants depends on several factors. The amount of Watering affects how well the plants grow. Sunlight is another crucial factor, as it provides energy for photosynthesis. Fertilizer also plays a role by supplying essential nutrients. Together, these factors influence Plant Growth. Additionally, Watering can impact the effectiveness of Fertilizer, as nutrients are better absorbed when the soil is moist.
            #SET OF NODES#: [Watering, Sunlight, Fertilizer, Plant Growth]
            #PAIR OF NODES#: (Sunlight, Fertilizer)

        Assistant: no

        ###

        User:
            #DESCRIPTION#: {descr}
            #SET OF NODES#: {nodes_text}
            #PAIR OF NODES#: {pair_text}

        Assistant:
    '''

    return [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_message},
    ]

In [None]:
# NOTE(review): this rebinds `example` to a blank string; presumably a scratch
# placeholder — the outputs recorded below still correspond to the classroom
# description, so confirm before running this cell.
example = ''' '''

In [47]:
example_nodes = extract_nodes_gpt(example, client, gpt_model='gpt-4o-mini', temperature=0)

In [48]:
example_nodes

['Time Spent Studying',
 'Knowledge Acquisition',
 'Teacher Quality',
 'Classroom Environment',
 'Learning Outcome']

In [52]:
s = f"[{', '.join(example_nodes)}]"

In [53]:
s

'Time Spent Studying, Knowledge Acquisition, Teacher Quality, Classroom Environment, Learning Outcome'

In [55]:
t = ("a", "b")
s = f"({', '.join(t)})"
print(s)

(a, b)


In [66]:
# Build (without sending) the edge-direction prompt for one pair, formatting
# the node set as "[a, b, ...]" and the pair as "(a, b)".
edge_mess = get_messages_for_edge_direction(
    example,
    f"[{', '.join(example_nodes)}]",
    f"({', '.join(('Teacher Quality', 'Learning Outcome'))})",
)

In [81]:
def _edge_direction_messages(descr, set_of_nodes, pair_of_nodes):
    # Shared prompt construction for the sync and async variants. The pair is
    # wrapped in parentheses (not brackets) to match the few-shot examples in
    # the edge-direction prompt.
    return get_messages_for_edge_direction(
        descr,
        f"[{', '.join(set_of_nodes)}]",
        f"({', '.join(pair_of_nodes)})",
    )


def _edge_from_completion(completion, pair_of_nodes):
    # Translate the structured reply of the first choice into a directed edge.
    arrow_type = json.loads(completion.choices[0].message.content)['arrow_type']
    verdict = arrow_type.lower()

    if 'forward' in verdict:
        return pair_of_nodes
    if 'backward' in verdict:
        return pair_of_nodes[::-1]
    return (None, None)


def extract_one_edge_gpt(descr, set_of_nodes, pair_of_nodes, gpt_client, gpt_model='gpt-4o-mini', temperature=0):
    """Ask the model whether a direct edge exists between two nodes.

    Args:
        descr: Natural-language description of the graph.
        set_of_nodes: Iterable with the names of all nodes.
        pair_of_nodes: Ordered pair (left, right) of node names to classify.
        gpt_client: Synchronous client exposing ``beta.chat.completions.parse``.
        gpt_model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        ``pair_of_nodes`` for a forward edge, the reversed pair for a backward
        edge, or ``(None, None)`` when the model reports no direct edge.
    """
    completion = gpt_client.beta.chat.completions.parse(
        model=gpt_model,
        messages=_edge_direction_messages(descr, set_of_nodes, pair_of_nodes),
        response_format=ArrowType,
        temperature=temperature,
    )
    return _edge_from_completion(completion, pair_of_nodes)


# AsyncOpenAI
async def async_extract_one_edge_gpt(descr, set_of_nodes, pair_of_nodes, async_gpt_client, gpt_model='gpt-4o-mini', temperature=0):
    """Asynchronous variant of ``extract_one_edge_gpt``.

    Takes the same arguments, except that ``async_gpt_client`` must expose an
    awaitable ``beta.chat.completions.parse``.

    Returns:
        ``pair_of_nodes`` for a forward edge, the reversed pair for a backward
        edge, or ``(None, None)`` when the model reports no direct edge.
    """
    completion = await async_gpt_client.beta.chat.completions.parse(
        model=gpt_model,
        messages=_edge_direction_messages(descr, set_of_nodes, pair_of_nodes),
        response_format=ArrowType,
        temperature=temperature,
    )
    return _edge_from_completion(completion, pair_of_nodes)

In [76]:
extract_one_edge_gpt(example, example_nodes, ('Learning Outcome', 'Teacher Quality'), client, gpt_model='gpt-4o-mini', temperature=0)

('Teacher Quality', 'Learning Outcome')

In [78]:
extract_one_edge_gpt(example, example_nodes, ('Classroom Environment', 'Teacher Quality'), client, gpt_model='gpt-4o-mini', temperature=0)

(None, None)

In [90]:
extract_one_edge_gpt(example, example_nodes, ('Classroom Environment', 'Knowledge Acquisition'), client, gpt_model='gpt-4o-mini', temperature=0)

(None, None)

In [82]:
await async_extract_one_edge_gpt(example, example_nodes, ('Classroom Environment', 'Teacher Quality'), async_client, gpt_model='gpt-4o-mini', temperature=0)

(None, None)

In [125]:
async def extract_all_edges(descr, set_of_nodes, async_gpt_client, gpt_model='gpt-4o-mini', temperature=0):
    """Classify every unordered pair of nodes and collect the directed edges.

    Each pair (a, b), with ``a`` appearing before ``b`` in ``set_of_nodes``,
    is sent to ``async_extract_one_edge_gpt``; the requests are awaited one
    after another, so n nodes produce n*(n-1)/2 sequential calls.

    Args:
        descr: Natural-language description of the graph.
        set_of_nodes: Any iterable of node names (list, tuple, set,
            generator, ...); it is materialised once into a list.
        async_gpt_client: Client forwarded to ``async_extract_one_edge_gpt``.
        gpt_model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        List of edges as returned by ``async_extract_one_edge_gpt``, skipping
        pairs for which it reported no edge.
    """
    from itertools import combinations

    # Materialise first: the input may be a set or a one-shot iterator, and
    # the same node list is needed both for pairing and for every prompt.
    nodes = list(set_of_nodes)
    edge_list = []

    for node_a, node_b in combinations(nodes, 2):
        # Progress trace: one line per queried pair.
        print(f"{node_a} # {node_b}")
        edge = await async_extract_one_edge_gpt(descr, nodes, (node_a, node_b), async_gpt_client, gpt_model=gpt_model, temperature=temperature)
        if edge[0] is not None:
            edge_list.append(edge)

    return edge_list

In [89]:
example_edge_list = await extract_all_edges(example, example_nodes, async_client, gpt_model='gpt-4o-mini', temperature=0)
print(example_edge_list)

Time Spent Studying Knowledge Acquisition
Time Spent Studying Teacher Quality
Time Spent Studying Classroom Environment
Time Spent Studying Learning Outcome
Knowledge Acquisition Teacher Quality
Knowledge Acquisition Classroom Environment
Knowledge Acquisition Learning Outcome
Teacher Quality Classroom Environment
Teacher Quality Learning Outcome
Classroom Environment Learning Outcome
[('Time Spent Studying', 'Knowledge Acquisition'), ('Time Spent Studying', 'Learning Outcome'), ('Knowledge Acquisition', 'Learning Outcome')]


In [91]:
example_edge_list = await extract_all_edges(example, example_nodes, async_client, gpt_model='gpt-4o', temperature=0)
print(example_edge_list)

Time Spent Studying Knowledge Acquisition
Time Spent Studying Teacher Quality
Time Spent Studying Classroom Environment
Time Spent Studying Learning Outcome
Knowledge Acquisition Teacher Quality
Knowledge Acquisition Classroom Environment
Knowledge Acquisition Learning Outcome
Teacher Quality Classroom Environment
Teacher Quality Learning Outcome
Classroom Environment Learning Outcome
[('Time Spent Studying', 'Knowledge Acquisition'), ('Time Spent Studying', 'Learning Outcome'), ('Knowledge Acquisition', 'Learning Outcome')]


In [98]:
graphs_with_description['descr'][0]

'Shortness-of-breath (dyspnoea) may be due to tuberculosis, lung cancer or bronchitis, or none of them, or more than one of them. A recent visit to Asia increases the chances of tuberculosis, while smoking is known to be a risk factor for both lung cancer and bronchitis. The results of a single chest X-ray do not discriminate between lung cancer and tuberculosis, as neither does the presence or absence of dyspnoea.'

In [106]:
import re
re.sub("\'", "", graphs_with_description['nodes'][0][1:-1]).split(', ')

['asia', 'tub', 'smoke', 'lung', 'bronc', 'either', 'xray', 'dysp']

In [121]:
graphs_with_description['edges'][0]

"[('asia', 'tub'), ('tub', 'either'), ('smoke', 'lung'), ('smoke', 'bronc'), ('lung', 'either'), ('bronc', 'dysp'), ('either', 'xray'), ('either', 'dysp')]"

In [118]:
example_edge_list = await extract_all_edges(graphs_with_description['descr'][0],
                                            re.sub("\'", "", graphs_with_description['nodes'][0][1:-1]).split(', '),
                                            async_client,
                                            gpt_model='gpt-4o-mini',
                                            temperature=0)
print(example_edge_list)

asia # tub
asia # smoke
asia # lung
asia # bronc
asia # either
asia # xray
asia # dysp
tub # smoke
tub # lung
tub # bronc
tub # either
tub # xray
tub # dysp
smoke # lung
smoke # bronc
smoke # either
smoke # xray
smoke # dysp
lung # bronc
lung # either
lung # xray
lung # dysp
bronc # either
bronc # xray
bronc # dysp
either # xray
either # dysp
xray # dysp
[('asia', 'tub'), ('asia', 'lung'), ('asia', 'either'), ('asia', 'dysp'), ('tub', 'either'), ('smoke', 'lung'), ('smoke', 'bronc'), ('smoke', 'dysp'), ('lung', 'dysp')]


In [123]:
example_edge_list = await extract_all_edges(graphs_with_description['descr'][0],
                                            graphs_with_description['extracted_nodes'][0],
                                            async_client,
                                            gpt_model='gpt-4o-mini',
                                            temperature=0)
print(example_edge_list)

Shortness-of-breath # Tuberculosis
Shortness-of-breath # Lung Cancer
Shortness-of-breath # Bronchitis
Shortness-of-breath # Visit to Asia
Shortness-of-breath # Smoking
Shortness-of-breath # Chest X-ray
Tuberculosis # Lung Cancer
Tuberculosis # Bronchitis
Tuberculosis # Visit to Asia
Tuberculosis # Smoking
Tuberculosis # Chest X-ray
Lung Cancer # Bronchitis
Lung Cancer # Visit to Asia
Lung Cancer # Smoking
Lung Cancer # Chest X-ray
Bronchitis # Visit to Asia
Bronchitis # Smoking
Bronchitis # Chest X-ray
Visit to Asia # Smoking
Visit to Asia # Chest X-ray
Smoking # Chest X-ray
[('Shortness-of-breath', 'Tuberculosis'), ('Shortness-of-breath', 'Lung Cancer'), ('Shortness-of-breath', 'Bronchitis'), ('Tuberculosis', 'Visit to Asia'), ('Bronchitis', 'Smoking')]


In [126]:
example_edge_list = await extract_all_edges(graphs_with_description['descr'][0],
                                            graphs_with_description['extracted_nodes'][0],
                                            async_client,
                                            gpt_model='gpt-4o',
                                            temperature=0)
print(example_edge_list)

Shortness-of-breath # Tuberculosis
Shortness-of-breath # Lung Cancer
Shortness-of-breath # Bronchitis
Shortness-of-breath # Visit to Asia
Shortness-of-breath # Smoking
Shortness-of-breath # Chest X-ray
Tuberculosis # Lung Cancer
Tuberculosis # Bronchitis
Tuberculosis # Visit to Asia
Tuberculosis # Smoking
Tuberculosis # Chest X-ray
Lung Cancer # Bronchitis
Lung Cancer # Visit to Asia
Lung Cancer # Smoking
Lung Cancer # Chest X-ray
Bronchitis # Visit to Asia
Bronchitis # Smoking
Bronchitis # Chest X-ray
Visit to Asia # Smoking
Visit to Asia # Chest X-ray
Smoking # Chest X-ray
[('Shortness-of-breath', 'Tuberculosis'), ('Shortness-of-breath', 'Lung Cancer'), ('Shortness-of-breath', 'Bronchitis'), ('Smoking', 'Shortness-of-breath'), ('Visit to Asia', 'Tuberculosis'), ('Smoking', 'Lung Cancer'), ('Smoking', 'Bronchitis')]


Попробовать: сразу для всех вершин сказать, какое направление ребер

#### Suggest vertex distributions (потом)

По описанию и извлеченным вершинам должны выдаваться типы распределений:

In [None]:
# NOTE(review): this cell reuses the name `get_messages_for_edge_direction`
# with a one-argument signature. Running it would replace the three-argument
# version that extract_one_edge_gpt / async_extract_one_edge_gpt call with
# three arguments. The body is the node-extraction prompt, so this looks like
# a not-yet-adapted placeholder for the distribution-suggestion step —
# confirm and rename before executing.
def get_messages_for_edge_direction(descr: str) -> list[dict[str, str]]:
    """Build a system + user message pair around ``descr``.

    The user message currently asks for node names (one few-shot example
    followed by the given description); it returns a list of two
    ``{'role': ..., 'content': ...}`` dicts.
    """

    node_extraction_sys_message = '''
            ###ROLE###
            You are Graphical model scientist.
            You task is to extract information about DAG from text description.
            The structure, or topology, of the network should capture qualitative relationships between variables. In particular, two nodes should be connected directly if one affects or causes the other, with the arc indicating the direction of the effect.
            The presence of arrows or arcs seems to imply, at an intuitive level, that for each arc one variable should be interpreted as a cause and the other as an effect (e.g., A →E means that A causes E). This interpretation is called causal.

            For every right answer I give you 5$.
        '''

    # Task, output format, one few-shot example and the caller's description,
    # all packed into a single user message.
    node_extraction_str_template = f'''
        ###TASK###
        Extract all node names of GAD from the description, the user gaves you.

        ###OUTPUT FORMAT###
        You output should be list of strings

        ###

        User: Imagine a garden where the growth of plants depends on several factors. The amount of Watering affects how well the plants grow. Sunlight is another crucial factor, as it provides energy for photosynthesis. Fertilizer also plays a role by supplying essential nutrients. Together, these factors influence Plant Growth. Additionally, Watering can impact the effectiveness of Fertilizer, as nutrients are better absorbed when the soil is moist.
        Assistant: ['Watering', 'Sunlight', 'Fertilizer', 'Plant Growth']

        ###

        User: {descr}
        Assistant:
    '''


    messages = [
        {
            'role': 'system',
            'content': node_extraction_sys_message
        },
        {
            'role': 'user',
            'content': node_extraction_str_template
        }
    ]

    return messages
