In [2]:
import faiss
import networkx as nx
import pickle
import numpy as np
import pandas as pd

# Load the Graph

In [3]:
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only open graph files that come from a trusted source.
with open("community_graph_embeded.pkl", "rb") as f:
    G = pickle.load(f)

# Fail fast with a clear message if the file does not hold a NetworkX graph,
# instead of failing later inside the index-building cells.
if not isinstance(G, nx.Graph):
    raise TypeError(f"Expected a networkx graph in the pickle, got {type(G).__name__}")

# Create Index

In [4]:
# Read the vector size from whichever node comes first instead of assuming
# that a node with id 0 exists in the graph.
first_node = next(iter(G.nodes))
embedding_dimension = np.asarray(G.nodes[first_node]["embedding"]).shape[0]

# Flat L2 index over vectors of that size; the cell displays `is_trained`
# as a sanity check before any vectors are added.
index = faiss.IndexFlatL2(embedding_dimension)
index.is_trained

True

In [5]:
# Embedding matrix and the matching array of node ids.
# The search code maps FAISS row positions back to node ids through `ids`,
# so row r of `node_embeddings` must correspond to ids[r].
ids = np.array(list(G.nodes()))
node_embeddings = np.array([attrs['embedding'] for _, attrs in G.nodes(data=True)])
print(ids.shape, node_embeddings.shape)

(868,) (868, 768)


In [6]:
# Start from an empty index so that re-running this cell does not append the
# same vectors a second time (which would break the row -> ids mapping that
# the search functions rely on).
index.reset()

# FAISS operates on C-contiguous float32 matrices; convert explicitly rather
# than depending on the embeddings already having that dtype/layout.
index.add(np.ascontiguousarray(node_embeddings, dtype=np.float32))
index.ntotal

868

# Similarity Search

In [7]:
from langchain_community.embeddings import OllamaEmbeddings
import json
# Embedding model (served through Ollama) used to embed search queries.
# NOTE(review): presumably this has to be the same model that produced the
# node embeddings stored in the graph, otherwise the distances returned by
# the index are not comparable — confirm.
embed_model = OllamaEmbeddings(model='nomic-embed-text')

In [8]:

# Degree centrality of every node in G, computed once up front so that
# social_similarity_search can look values up instead of recomputing them.
degree_centralities = nx.degree_centrality(G)

# step 1: find nodes with similar profile
def semantic_similarity_search(query, k1):
    """Return the graph nodes whose embeddings are closest to the query.

    Parameters
    ----------
    query : object
        Profile to search for; it is converted with ``str()`` before being
        embedded.
    k1 : int
        Number of nearest neighbours requested from the FAISS index.

    Returns
    -------
    similar_nodes : numpy.ndarray
        Node ids of the matches, in the order returned by the index.
    similar_scores : numpy.ndarray
        Distances reported by the index for those matches (smaller means
        more similar). Both arrays hold fewer than ``k1`` entries when the
        index contains fewer than ``k1`` vectors.
    """
    query_text = str(query)
    # FAISS expects a 2-D float32 matrix with one query vector per row.
    query_vector = np.asarray(
        embed_model.embed_query(query_text), dtype=np.float32
    ).reshape(1, -1)

    distances, positions = index.search(query_vector, k1)
    distances, positions = distances[0], positions[0]

    # FAISS pads missing results with position -1; indexing `ids` with -1
    # would silently return the last node, so drop those slots.
    found = positions >= 0
    similar_nodes = ids[positions[found]]
    similar_scores = distances[found]
    return similar_nodes, similar_scores

# step 2: find nodes with similar degree_centrality in the community network
def social_similarity_search(node_id, k2, node_type='Actors'):
    """Find the nodes of one type whose degree centrality is closest to a node's.

    Parameters
    ----------
    node_id : hashable
        Reference node; must be a key of ``degree_centralities``.
    k2 : int
        Maximum number of nodes to return.
    node_type : str, default 'Actors'
        Only nodes whose ``"type"`` attribute equals this value are considered.

    Returns
    -------
    similar_nodes : list
        Up to ``k2`` node ids, ordered by increasing centrality gap. The
        reference node itself is included when it has the requested type
        (its gap is 0).
    similar_scores : list of float
        Absolute difference between each returned node's degree centrality
        and the reference node's.

    Raises
    ------
    KeyError
        If ``node_id`` has no entry in ``degree_centralities`` or a graph node
        lacks a ``"type"`` attribute.
    """
    # A set makes the membership test below O(1); with a list the filter was
    # quadratic in the number of nodes.
    candidates = {node for node in G.nodes() if G.nodes[node]["type"] == node_type}
    reference = degree_centralities[node_id]

    gaps = [
        (node, abs(centrality - reference))
        for node, centrality in degree_centralities.items()
        if node in candidates
    ]
    # list.sort is stable, so ties keep the iteration order of degree_centralities.
    gaps.sort(key=lambda pair: pair[1])
    closest = gaps[:k2]

    similar_nodes = [node for node, _ in closest]
    similar_scores = [gap for _, gap in closest]
    return similar_nodes, similar_scores

# combine the two similarity search results
def combined_search(query, k1=5, k2=2, type='Actors'):
    """Combine semantic and social similarity into one ranked candidate list.

    For each of the ``k1`` nodes semantically closest to ``query``, look up
    the ``k2`` nodes of the requested type with the most similar degree
    centrality, and collect them without duplicates.

    Parameters
    ----------
    query : object
        Profile passed to ``semantic_similarity_search``.
    k1 : int, default 5
        Number of semantic matches to expand.
    k2 : int, default 2
        Number of centrality-similar nodes fetched per semantic match.
    type : str, default 'Actors'
        Node type forwarded to ``social_similarity_search``. (The name
        shadows the builtin but is kept because it is part of the public
        signature.)

    Returns
    -------
    nodes : list
        Unique node ids in the order they were first encountered.
    scores : list
        For each node, the semantic distance of the match that first
        produced it plus its centrality gap to that match (lower is better).
    """
    nodes = []
    scores = []
    seen = set()

    semantic_nodes, semantic_scores = semantic_similarity_search(query, k1)
    for anchor, semantic_score in zip(semantic_nodes, semantic_scores):
        # Forward the requested node type; previously the argument was
        # ignored and the search always fell back to the default type.
        neighbours, centrality_gaps = social_similarity_search(anchor, k2, node_type=type)
        for neighbour, gap in zip(neighbours, centrality_gaps):
            if neighbour in seen:
                # Keep only the first occurrence so nodes and scores stay aligned.
                continue
            seen.add(neighbour)
            nodes.append(neighbour)
            scores.append(semantic_score + gap)
    return nodes, scores

In [9]:
# Example profile used as the search query in the cells below.
# NOTE(review): the key spellings ('marrige', 'workstatue', ...) are kept
# exactly as they are because the whole dict is stringified and embedded;
# presumably they mirror the attribute names used in the node profiles — confirm.
example_input = dict(
    age='18-35',
    gender='Female',
    marrige='Yes',
    withkids='No',
    student='Yes',
    workstatue='Freelancer',
    residentinneighbor='No',
    educationlevel='Undergraduate',
)
# k1: semantic matches to expand; k2: centrality-similar nodes per match.
k1, k2 = 5, 2


In [10]:
# Semantic step on its own: nearest nodes to the example profile and their distances.
similar_nodes, simiilar_scores = semantic_similarity_search(query=example_input, k1=k1)
print("semantic_similarity_search:", similar_nodes, simiilar_scores)

semantic_similarity_search: [ 267  768 1054  231  268] [0.49987638 0.6176311  0.6176311  0.6176311  0.6176311 ]


In [245]:
# Full pipeline: semantic matches expanded by degree-centrality similarity.
nodes, scores = combined_search(example_input, k1=k1, k2=k2)
print("Similar nodes: ", nodes)
print("Scores: ", scores)

Similar nodes:  [12, 469, 457, 768, 686, 702, 681, 464]
Scores:  [0.49987637996673584, 0.49987637996673584, 0.6176310777664185, 0.6176310777664185, 0.6176310777664185, 0.6176310777664185, 0.6176310777664185, 0.6176310777664185]
