In [1]:
import json
import networkx as nx
from langchain_community.embeddings import OllamaEmbeddings
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data preprocessing

In [2]:
# Load the node table; every missing value in every column is replaced by
# the sentinel -1.
node_df = pd.read_csv('nodes.csv').fillna(-1)

# Cell output: how many nodes there are of each node type.
node_type_column = node_df['nodetype']
node_type_column.value_counts()

Actors                475
Media Content         217
Organization          198
Event                 187
Short-term Project     25
Long-term Project      18
Space                  11
Name: nodetype, dtype: int64

### Convert dates to periods

In [3]:
# Number of period bins; bin indices run from 0 to N-1.
N = 10

# NOTE(review): node_df went through a blanket fillna(-1) when it was loaded,
# so a missing date would reach to_datetime as -1 rather than NaN -- confirm
# that the date column has no gaps.
node_df['date'] = pd.to_datetime(node_df['date'])

# Observed date range of the nodes.
date_min, date_max = node_df['date'].min(), node_df['date'].max()

# Bin width. The span is divided by N-1 (not N), so rows dated exactly
# date_max fall into the last bin, N-1.
gap = (date_max - date_min) / (N - 1)

# Period index = number of whole bin widths elapsed since the earliest date.
elapsed_since_start = node_df['date'] - date_min
period = elapsed_since_start // gap
node_df['period'] = period

In [4]:
# Show the width of a single period bin (the Timedelta computed above).
print(gap)

146 days 08:00:00


In [5]:
# Number of nodes assigned to each period, listed in period order.
node_df['period'].value_counts().sort_index()

0    129
1    120
2    179
3    216
4     97
5     85
6     89
7    111
8    103
9      2
Name: period, dtype: int64

In [4]:
# Load the link table and bin each link into the same periods as the nodes.
#
# The dates are parsed from the raw column *before* the blanket fillna(-1).
# Filling first would turn a missing date into the integer -1, which
# to_datetime does not treat as missing (it is either read as an offset from
# the Unix epoch or rejected, depending on the pandas version), so the link
# would end up with a meaningless period.
edge_df = pd.read_csv('links.csv')
edge_dates = pd.to_datetime(edge_df['date'])

# Every other missing value still becomes the sentinel -1, as before.
edge_df = edge_df.fillna(-1)
edge_df['date'] = edge_dates  # undated links keep NaT here instead of -1

# Whole bin widths elapsed since the earliest node date. Links without a date
# get the sentinel period -1, matching the -1 convention used for the other
# columns. When no date is missing this is identical to the plain
# floor-division result.
edge_df['period'] = ((edge_dates - date_min) // gap).fillna(-1).astype('int64')

# Create Community Graph

In [5]:
from collections import defaultdict

# Directed community graph: one node per row of node_df, one edge per distinct
# (source, target) pair found in edge_df.
G = nx.DiGraph()

# Running number of link rows seen per ordered pair, keyed by the string form
# of [source, target].
count = defaultdict(int)

# Columns of node_df bundled into each node's `properties` attribute,
# selected by node type.
property_columns = {
    'Actors': ['age', 'gender', 'marrige', 'withkids', 'student', 'workstatue',
               'residentinneighbor', 'educationlevel'],
    'Space': ['spacefunction'],
}
# Organization, Event, Short-term Project, Long-term Project, Media Content
default_property_columns = ['eventtopic']

for node_idx, row in node_df.iterrows():
    node_kind = row['nodetype']
    columns = property_columns.get(node_kind, default_property_columns)
    G.add_node(
        int(row['nodeid']),
        type=node_kind,
        period=row['period'],
        properties=row[columns].to_dict(),
        label=node_kind,
    )

for link_idx, link in edge_df.iterrows():
    src = int(link['source'])
    dst = int(link['target'])

    # Ignore links whose endpoints are not both present as nodes.
    if src not in G or dst not in G:
        continue

    # Weight links by the number of connections. Re-adding an existing edge
    # updates its attributes, so the stored weight ends up as the total number
    # of rows for the pair and `period` is the one from the last such row.
    pair_key = str([src, dst])
    count[pair_key] += 1
    G.add_edge(
        src,
        dst,
        period=link['period'],
        weight=count[pair_key],
        type="choose_to",
        label="connected_to",
    )

# Embedding

In [6]:
# Embedding client for the 'nomic-embed-text' model served through Ollama.
embed_model = OllamaEmbeddings(model='nomic-embed-text')

# Attach an embedding of each node's properties to the node itself.
for node_id, attrs in G.nodes(data=True):
    # embed_query is documented to take a string, but `properties` is a dict.
    # Convert it explicitly instead of relying on the client to stringify
    # whatever it is handed; str() gives the same text that implicit string
    # formatting of the dict would produce.
    properties_text = str(attrs['properties'])
    attrs['embedding'] = embed_model.embed_query(properties_text)


# Save the graph

In [7]:
# Serialise the graph, including the per-node 'embedding' attributes, to disk
# with pickle.
with open("community_graph_embeded.pkl", "wb") as graph_file:
    pickle.dump(G, graph_file)