In [13]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from openai import OpenAI
import json
import os
import numpy as np

In [12]:
def load_data(file_path):
    return pd.read_csv(file_path)


def filter_data(df, start_time, frame):
    def generate_range(start_time, frame):
        dtStart = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
        dtDiff = timedelta(hours=frame[0], minutes=frame[1])
        dest = dtStart + dtDiff
        return [start_time, dest.strftime('%Y-%m-%d %H:%M:%S')]
    
    time_range = generate_range(start_time, frame)
    return df[(df["time"] >= time_range[0]) & (df["time"] <= time_range[1])]

def initialize_api_client(api_key):
    return OpenAI(
        base_url="https://hackathon.radiantai.com/insight-ops/openai",
        api_key=api_key
    )


In [105]:
def extract_image_data(df_1, context, post_contents, image_index, api_client):
    post_orgininal = df_1['translation']
    imgs = df_1['attachment_urls'].split(",")
    print(imgs)
    if len(df_1['attachment_urls']) == 0:
        file_key = imgs[0]
    else:
        file_key = imgs[image_index]

    #Return false if the file ends with any extension that is not an image
    if not file_key.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp')):
        return False
    
    image_url = "https://vl-nat-sec-hackathon-may-2024.s3.amazonaws.com/" + file_key

    system_prompt = """You are a helpful assistant designed to output JSON and analyze images. This image in the user message was attached to a post. We had a program analyze the post and come up with subjects and relationships/actions between those subjects. You will be provided that data of the subjects and relationships, the context of the post to mkae sure you do not mention anything irrelevant. You need to do the following:
    Analyze the image and provide the subjects and relationships/actions between those subjects to enter into the graph database.
    MAKE SURE to output the data in a JSON format with the top level keys 'subjects' and 'relationships', 
    Every object has a title and it has a type, the title indicates the name of the subject: this can be an person or "actor", event, or even concept, just make up a classification that is very standard.
    Two objects may be related in some way and this may and in this case, you will add them to the relationships list, try to find fitting relationships between related objects. For example, if you create a relationship between a person and an event you can use the relationship "participated in" or "attended". Or if you have a person and a concept you can use "knows" or "is familiar with". Or if you have two events you can use "happened before" or "happened after". Make up a relationship.
    How you will enter it in to the graph database it as follows, only include finding from the image in this response (meaning you will output in JSON format, DO NOT OUTPUT ANYTHING ELSE, IT WILL BREAK THE WHOLE PROGRAM):
    {
        "subjects": [
        [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
        [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
        ...
        ],
        "relationships": [
        [ENTITY_ID_1: INT, RELATIONSHIP: STR, ENTITY_ID_2: INT, PROPERTIES: DICT],
        [ENTITY_ID_1: INT, RELATIONSHIP: STR, ENTITY_ID_2: INT, PROPERTIES: DICT],
        ...
        ]
    }
    When you find a node or relationship you want to add try to create a generic PROPERTY for it that describes the entity you can also think of it as a label.
    Include things relevant to the""", context, """. 
    Also make sure that the RELATIONSHIP is consise, if the relationship is not simple to explain, it may not be relevant to the context of the""", context, """and should be left out. Analyze the image in keeping in mind the similarity and relevance of it to the QUALITIES OF THE PREVIOUS MOST HERE: """, str(json.dumps(post_contents)) , """and provide the JSON response, remember to look at the subjects of the post it is attached to, if there is something potentially related to a subject in the post, include it in the response. Here is the original post:""", post_orgininal, """"""
    print(image_url)
    response = api_client.chat.completions.create(
        model='gpt-4-vision-preview',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': [{'type': 'image_url', 'image_url': {'url': image_url}}]}
        ]
    )
  
    return json.loads("".join((response.choices[0].message.content).split("\n")[1:-1]))

def process_post(post, api_client):
    
    system_prompt = """You are a helpful assistant designed to output JSON.
Your task is to extract information from social media posts or announcements relevant to the Ukraine Russian war and convert it into a graph database.

Here is how the graph database works:
Every object has a title and it has a type, the title indicates the name of the subject: this can be an person or "actor", event, or even concept, just make up a classification that is very standard.
Two objects may be related in some way and this may and in this case, you will add them to the relationships list, try to find fitting relationships between related objects. For example, if you create a relationship between a person and an event you can use the relationship "participated in" or "attended". Or if you have a person and a concept you can use "knows" or "is familiar with". Or if you have two events you can use "happened before" or "happened after". Make up a relationship.
How you will enter it in to the graph database it as follows:
{
  "subjects": [
    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
    ...
  ],
  "relationships": [
    [ENTITY_ID_1: (ANY), RELATIONSHIP: STR, ENTITY_ID_2: (ANY), PROPERTIES: DICT],
    [ENTITY_ID_1: (ANY), RELATIONSHIP: STR, ENTITY_ID_2: (ANY), PROPERTIES: DICT],
    ...
  ]
}
If you can't pair a relationship with a pair of nodes don't add it.
When you find a node or relationship you want to add try to create a generic PROPERTY for it that describes the entity you can also think of it as a label.
Only include things relevant to the Ukraine Russia conflict
One more thing: this image IS ATTACHED TO A POST, so try to find a relationship between the image subjects and the post subjects. This image needs to add to the actual post subjects and relationships, as it is attached to the post."""

    
    response = api_client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Here's the post: {post} \\n Make sure to output in a JSON with the top level keys 'subjects' and 'relationships'"}
        ]
    )
    return json.loads(response.choices[0].message.content)


def extract_graph_data(df, api_client):
    """
    Iterate over a DataFrame of posts, process each one, and accumulate the graph data.
    
    Args:
    df (pandas.DataFrame): DataFrame containing the posts to process.
    api_client (OpenAI.Client): The API client instance.
    
    Returns:
    dict: A dictionary containing accumulated nodes and edges from all posts.
    """
    system_prompt = "Your task is to extract information from social media posts and convert it into a graph database."
    all_data = {'subjects': [], 'relationships': []}  # Initialize structure to hold combined data from all posts
    
    img_data = {}

    image_data = []

    for index, row in df.iterrows():
        
        post_data = process_post(row['translation'], api_client)

        image_data = []
        for i in range(len(row["attachment_urls"].split(","))):
            try:
                image_1 = extract_image_data(row, "Russia vs. Ukraine", post_data, i, api_client)
            except:
                image_1 = False
            if image_1:
                image_data.append(image_1)
                post_data['subjects'].extend(image_1['subjects'])
                post_data['relationships'].extend(image_1['relationships'])
        
        try:
            for x in post_data['subjects']:
                x[0] = row['id'] + '_' + str(x[0])

            for x in post_data['relationships']:
                x[0] = row['id'] + '_' + str(x[0])
                x[2] = row['id'] + '_' + str(x[2])
        except:
            continue
        else:
            all_data['subjects'].extend(post_data['subjects'])
            all_data['relationships'].extend(post_data['relationships'])
    
    return all_data

In [70]:
'''

def deduping_nodes(nodes, api_client):
    
    system_prompt = """You are a helpful assistant designed to do data disambiguation nodes from a knowledge graph about the Russia Ukraine war.

Here is how the graph database works:
Every object has a title and it has a type, the title indicates the name of the subject
The input is a list of lists and this is what the input looks like:
[
    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],
    ...
  ]

You have to output a list of tuples for the titles that are different entries in the list but actually represent the same entity.

Example of entities that are the same could be [(Russian Army, Russian Armed Forces)]"""

system_prompt = """Your task is to identify if there are duplicated nodes in a noisy knowledge graph 
    
You will be given a datasets of nodes and some of these nodes may be duplicated or refer to the same entity. 
The datasets contains nodes in the form [ENTITY_ID, ENTITY_NAME, TYPE]. 

There are many duplicates in this dataset with slightly different phraseology for the ENTITY_NAME

When you have completed your task please give me the pairwise entities that are reasonably the duplicate entities. 

The output should be in the format [(a,b)] 
"""
    response = api_client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Here's the input: {nodes}. Make sure the output is in JSON \\n"}
        ]
    )
    return json.loads(response.choices[0].message.content)

deduped = deduping_nodes(structured_data["subjects"][:20], api_client)
deduped
'''

'\n\ndef deduping_nodes(nodes, api_client):\n    \n    system_prompt = """You are a helpful assistant designed to do data disambiguation nodes from a knowledge graph about the Russia Ukraine war.\n\nHere is how the graph database works:\nEvery object has a title and it has a type, the title indicates the name of the subject\nThe input is a list of lists and this is what the input looks like:\n[\n    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],\n    [ENTITY_ID: INT, TITLE: STR, TYPE: STR],\n    ...\n  ]\n\nYou have to output a list of tuples for the titles that are different entries in the list but actually represent the same entity.\n\nExample of entities that are the same could be [(Russian Army, Russian Armed Forces)]"""\n\nsystem_prompt = """Your task is to identify if there are duplicated nodes in a noisy knowledge graph \n    \nYou will be given a datasets of nodes and some of these nodes may be duplicated or refer to the same entity. \nThe datasets contains nodes in the form [ENTITY_

In [106]:
api_key = os.environ["HACKATHON_API_KEY"]
df = load_data('data/russia_social_media.csv')


In [107]:
filtered_df = filter_data(df, "2024-03-31 18:00:00", [0, 1]).head(3)
api_client = initialize_api_client(api_key)
filtered_df.shape

(3, 9)

In [108]:
%%time
abc = extract_graph_data(filtered_df, api_client)
print(abc)

['media/telegram-web/ac680058-eeda-41f0-b65d-6971cc3e831e.jpeg']
https://vl-nat-sec-hackathon-may-2024.s3.amazonaws.com/media/telegram-web/ac680058-eeda-41f0-b65d-6971cc3e831e.jpeg
['media/telegram-web/7447129d-96cc-4d63-b0f5-991f65238c13.jpeg']
https://vl-nat-sec-hackathon-may-2024.s3.amazonaws.com/media/telegram-web/7447129d-96cc-4d63-b0f5-991f65238c13.jpeg
['media/telegram-web/867b1e01-2e49-400e-b809-4cc0e91aeb4d.jpeg']
https://vl-nat-sec-hackathon-may-2024.s3.amazonaws.com/media/telegram-web/867b1e01-2e49-400e-b809-4cc0e91aeb4d.jpeg
{'subjects': [['0400c828-c2a7-5021-8bae-1aadf3206e21_1', 'No elections', 'concept'], ['0400c828-c2a7-5021-8bae-1aadf3206e21_2', 'No Crimea', 'concept'], ['0400c828-c2a7-5021-8bae-1aadf3206e21_3', 'No Donbass', 'concept'], ['0400c828-c2a7-5021-8bae-1aadf3206e21_4', 'No future', 'concept'], ['0400c828-c2a7-5021-8bae-1aadf3206e21_5', 'crests', 'concept'], ['0400c828-c2a7-5021-8bae-1aadf3206e21_6', 'Advertisement', 'object'], ['0400c828-c2a7-5021-8bae-1aadf

In [438]:
structured_data

{'subjects': [['5093e40c-1770-54dc-b4a3-c298b7d1ea72_1',
   'Front-line laboratory',
   'actor'],
  ['5093e40c-1770-54dc-b4a3-c298b7d1ea72_2', 'UAV', 'concept'],
  ['5093e40c-1770-54dc-b4a3-c298b7d1ea72_3', 'two majors', 'person'],
  ['5093e40c-1770-54dc-b4a3-c298b7d1ea72_4', 'channel ZOV~ZOV26', 'event'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_1', 'Roman Donetsky', 'person'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_2', 'Donbass front', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_3', 'Pervomaisky', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_4', 'Orlovka', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_5', 'Berdychi', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_6', 'Novomikhailovka', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_7', 'Georgievka', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_8', 'Avdeevka', 'location'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_9', 'UAVs', 'technology'],
  ['9dcd79d7-e512-5f3e-8d0b-07f522ffa771_10'

In [426]:
with open('graph_50_deduped.json', 'w+') as file:
    json.dump(structured_data, file, indent = 4)

In [203]:
def create_graph(data):
    G = nx.DiGraph()
    for node in data['subjects']:
        node_id, name, node_type = node
        G.add_node(node_id, name = name, node_type = node_type)
        
    for edge in data['relationships']:
        source, relation, target, props = edge
        G.add_edge(source, target, relation=relation)
    return G

def draw_graph(G):
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, node_size=100, node_color='lightblue', alpha=0.6)
    nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=5, edge_color='gray', width=2)
    node_labels = {node: G.nodes[node]['name'] for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=12)
    edge_labels = {(u, v): data['relation'] for u, v, data in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')
    plt.title('Graph Visualization')
    plt.axis('off')
    plt.show()

In [402]:
G = create_graph(structured_data)

In [432]:
import difflib
from collections import defaultdict

# Function to group nodes by name similarity
def group_similar_nodes(graph, cutoff=0.8):
    names = nx.get_node_attributes(graph, 'name')
    nodes = sorted(names, key=lambda x: names[x])
    clusters = []
    used = set()

    for node in nodes:
        if node not in used:
            cluster = {node}
            for other in nodes:
                if other not in used and difflib.SequenceMatcher(None, names[node], names[other]).ratio() >= cutoff:
                    cluster.add(other)
            clusters.append(cluster)
            used.update(cluster)

    return clusters

# Function to merge nodes
def merge_nodes(graph, clusters):
    new_graph = nx.DiGraph()
    for cluster in clusters:
        # Concatenate names of all nodes in the cluster
        merged_name = ' ; '.join(set(graph.nodes[n]['name'] for n in cluster))
        node_type = ' ; '.join(set(graph.nodes[n]['node_type'] for n in cluster))
        new_node = min(cluster)  # Use the smallest node ID in the cluster as the new node ID

        new_graph.add_node(new_node, name=merged_name, node_type = node_type)

        # Merging edges and their attributes
        edge_attributes = defaultdict(set)
        for old_node in cluster:
            for neighbor, attrs in graph[old_node].items():
                if neighbor not in cluster:
                    edge_attributes[(new_node, neighbor)].add(attrs['relation'])

        # Add merged edges with combined attributes to the new graph
        for (u, v), relations in edge_attributes.items():
            new_graph.add_edge(u, v, relation=list(relations))

    return new_graph

# Find clusters of similar nodes and merge them
clusters = group_similar_nodes(G)
new_G = merge_nodes(G, clusters)


In [395]:
nx.write_gml(new_G, 'test1.gml')

In [433]:
def graph_to_custom_json(graph):
    # Only include nodes that have a 'name' attribute
    subjects = [[node, graph.nodes[node]['name'], 'single_type'] for node in graph.nodes() if 'name' in graph.nodes[node]]
    
    # Prepare relationships, making sure both nodes in the edge have 'name' attributes
    relationships = [[u, graph.edges[u, v]['relation'], v, graph.edges[u, v].get('properties', {})]
                     for u, v in graph.edges() if 'name' in graph.nodes[u] and 'name' in graph.nodes[v]]

    return {
        "subjects": subjects,
        "relationships": relationships
    }

# Assuming 'new_G' is your graph object post-processing
custom_data = graph_to_custom_json(new_G)

# Save the JSON object to a file
with open('graph_50_deduped.json', 'w') as f:
    json.dump(custom_data, f, indent=4)

print("Graph saved in custom JSON format.")

Graph saved in custom JSON format.


In [407]:
new_G

<networkx.classes.graph.Graph at 0x7f93fb0b1f10>

In [415]:
def compute_graph_metrics(graph):
    # Ensure the graph is a directed graph; adapt as necessary for undirected graphs
    if not graph.is_directed():
        raise ValueError("Input graph must be directed. For undirected graphs, modify the function accordingly.")

    # Calculate metrics
    in_degrees = dict(graph.in_degree())  # In-degree
    out_degrees = dict(graph.out_degree())  # Out-degree
    try:
        eigenvector_centrality = nx.eigenvector_centrality_numpy(graph)  # Eigenvector centrality
    except nx.NetworkXError as e:
        print("Error calculating eigenvector centrality:", e)
        eigenvector_centrality = {n: 0 for n in graph.nodes()}  # Default to 0 if calculation fails

    # Component ID assignment using weakly connected components
    components = list(nx.weakly_connected_components(graph))
    component_dict = {node: idx for idx, component in enumerate(components) for node in component}

    # Create DataFrame
    data = {
        'Node': list(graph.nodes()),
        'In-Degree': [in_degrees.get(node) for node in graph.nodes()],
        'Out-Degree': [out_degrees.get(node) for node in graph.nodes()],
        'Eigenvector Centrality': [eigenvector_centrality.get(node, 0.0) for node in graph.nodes()],
        'Component ID': [component_dict.get(node, -1) for node in graph.nodes()]
    }
    df = pd.DataFrame(data)
    return df

In [428]:
compute_graph_metrics(new_G).to_csv('graph_50_deduped_nodes.csv', index = False)
