In [None]:
# Required Libraries
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
# from msb import Balance
# from msb.utils import frustration_count, label_clusters

#%matplotlib inline
from gensim.models import Word2Vec

# for network analysis
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# for visualization
import matplotlib.pyplot as plt

# Paths
ROOT = Path(".").absolute().parent
DATA = ROOT/r"C:\Users\Admin\PhD Projects\ai_heider\notebooks\data"
# DATA = ROOT / r"C:\Users\krishnadas\Projects\PhD Project\ai_heider\notebooks\data"
FIGS = DATA/'figs'
FIGS.mkdir(exist_ok=True)

In [None]:
def get_next_node(G, current, previous, p, q, weight_key:str):
    alphas = []
    neighbors = list(G.neighbors(current))
    for neighbor in neighbors:
        weight = abs(G[current][neighbor][weight_key])
        if neighbor == previous:
            alpha = weight* 1/p
        elif G.has_edge(neighbor, previous):
            alpha = weight
        else:
            alpha = weight * 1/q
        alphas.append(alpha)
    probs = [alpha/sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors,1,p=probs)[0]
    return next

def biased_random_walk(G, start_node, walk_length, p, q, weight_key:str):
    walk = [start_node]
    # for i in range(walk_length-1):
    while len(walk) < walk_length:
        current = walk[-1]
        neighbors = list(G.neighbors(current)) 
        if not neighbors:
            break
        if len(walk)==1:
            # neighbors = list(G.neighbors(current))
            next = np.random.choice(neighbors)
        else:
            previous = walk[-2]
            next = get_next_node(G, current, previous, p, q, weight_key)
        walk.append(next)
    return walk

def simulate_walks(G, num_walks, walk_length, p, q, weight_key:str='Rating', method='node2vec'):
    """Simulate random walks on the graph G."""
    walks = []
    nodes = list(G.nodes)
    for _ in range(num_walks):
        np.random.shuffle(nodes) # Shuffle nodes to ensure randomness
        for node in nodes:
            if method == 'node2vec':
                walk = biased_random_walk(G, node, walk_length, p, q, weight_key)
            elif method == 'triadic':
                walk = triadic_biased_random_walk(G, node, walk_length)
            else:
                raise ValueError("Method must be 'node2vec' or 'triadic'")
            walks.append(walk)
    return walks

def is_triad_balanced(s_uv, s_uw, s_vw):
    """Return True if triad (u,v,w) is balanced."""
    product = s_uv * s_uw * s_vw
    return product > 0

def triadic_balance_bias(G, u, v, weight_key='Rating'):
    """Return a score based on how many balanced triads edge (u,v) would form."""
    balanced = 0
    unbalanced = 0

    common_neighbors = set(G.neighbors(u)).intersection(G.neighbors(v))
    for w in common_neighbors:
        try:
            # Use the sign attributes that we add to the graph
            s_uv = G[u][v][weight_key]
            s_uw = G[u][w][weight_key]
            s_vw = G[v][w][weight_key]
            if is_triad_balanced(s_uv, s_uw, s_vw):
                balanced += 1
            else:
                unbalanced += 1
        except KeyError:
            continue  # if any sign is missing, skip

    return balanced + 1  # +1 to ensure nonzero score (base bias)

def triadic_biased_random_walk(G, start_node, num_steps):
    walk = [start_node]
    current = start_node

    for _ in range(num_steps):
        neighbors = list(G.neighbors(current))
        if not neighbors:
            break

        weights = [triadic_balance_bias(G, current, nbr) for nbr in neighbors]
        total_weight = sum(weights)
        if total_weight == 0:
            # If all weights are zero, use uniform distribution
            probabilities = [1/len(neighbors)] * len(neighbors)
        else:
            probabilities = [w / total_weight for w in weights]

        next_node = np.random.choice(neighbors, 1, p=probabilities)[0]
        walk.append(next_node)
        current = next_node

    return walk

def save_model(dimension, walk_length, num_walks, model, save_to, prefix):
    d = dimension
    w = walk_length
    n = num_walks
    embedding_filename = f"{prefix}_embeddings_d{d}_w{w}_n{n}.txt"
    model_filename = f"{prefix}_model_d{d}_w{w}_n{n}.pkl"
    model.wv.save_word2vec_format(str(save_to/embedding_filename))
    model.save(str(save_to/model_filename))
    print(f"Model and embeddings saved to {str(save_to)}")

def load_model(model_filename, embedding_path):
    if Path(embedding_path/model_filename).exists():
        print(f"Loading model: {model_filename}")
        with Path(embedding_path/model_filename).open('rb') as file:
            model = pickle.load(file)
    else:
        raise FileNotFoundError(f"Model file {model_filename} not found in {embedding_path}")
    return model


## Data preprocessing

In [None]:
# import data
df = pd.read_csv(r"DATA\bitcoinotc\soc-sign-bitcoinotc.csv", header=None)
# cols = ["SOURCE","TARGET","RATING","TIME"]
df.rename(columns={0:"Source",1:"Target",2:"Rating",3:"Time"}, inplace=True)
df.drop(columns=["Time"], inplace=True)
df.head()

Unnamed: 0,Source,Target,Rating
0,6,2,4
1,6,5,2
2,1,15,1
3,4,3,7
4,13,16,8


In [None]:
# Assuming df is your DataFrame containing the data
G = nx.from_pandas_edgelist(df, source='Source', target='Target', edge_attr='Rating')

# Add sign attributes for triadic balance calculations
# Convert ratings to binary signs: positive = +1, negative/zero = -1
for u, v, d in G.edges(data=True):
    d['sign'] = 1 if d['Rating'] > 0 else -1

print(f"Number of nodes: {nx.number_of_nodes(G)}")
print(f"Number of edges: {nx.number_of_edges(G)}")
print(f"Number of positive edges: {sum(1 for u, v, d in G.edges(data=True) if d['sign'] > 0)}")
print(f"Number of negative edges: {sum(1 for u, v, d in G.edges(data=True) if d['sign'] < 0)}")
nx.number_of_nodes(G)

5881

## Edge embedding

Compare two edge embedding methods

In [None]:
# Main Execution

# Parameters
num_walks = 1
walk_length = 100
p = 1.0
q = 0.5
embedding_dim = 64

# Generate walks
walks_nv = simulate_walks(G, num_walks=num_walks, walk_length=walk_length, p=p, q=q, method='node2vec')
# Convert nodes to strings (required by gensim)
walks_nv = [[str(node) for node in walk] for walk in walks_nv]

In [None]:
# Parameters
num_walks = 1
walk_length = 100
p = 1.0
q = 0.5
embedding_dim = 64
walks_heider = simulate_walks(G, num_walks=num_walks, walk_length=walk_length, p=p, q=q, method='triadic')
# Convert nodes to strings (required by gensim)
walks_heider = [[str(node) for node in walk] for walk in walks_heider]

In [10]:
model_nv = Word2Vec(sentences=walks_nv, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4, epochs=10)


In [11]:
model_heider = Word2Vec(sentences=walks_heider, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4, epochs=10)

In [13]:
# create node embedding matrix
def create_node_embedding_matrix(model, nodes):
    node_emb_matrix = []
    for node in nodes:
        if node in model.wv:
            node_emb_matrix.append(model.wv[node])
        else:
            # If the node is not in the model, use a zero vector
            node_emb_matrix.append(np.zeros(model.vector_size))
    return np.array(node_emb_matrix)

# create edge embedding matrix
# Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
def create_edge_embedding_matrix(model, edges_list, emb_matrix):
    embs = []
    for edge in edges_list:
        source, target = edge
        if source in model.wv and target in model.wv:
            source_emb = emb_matrix[model.wv.key_to_index[source]]
            target_emb = emb_matrix[model.wv.key_to_index[target]]
            edge_emb = np.multiply(source_emb, target_emb) # hadamard product
            embs.append(edge_emb)
    embs = np.array(embs)
    return embs

In [14]:
# create a train and test set for edges 
test_size = 0.2
# Split the edges into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Rating']), df['Rating'], 
                                                    test_size=test_size, random_state=42)

# Create train-set edge labels: > 0 = 1, <0 = -1, = 0 = 0
y_train_labels = y_train.apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
# Create test-set edge labels
y_test_labels = y_test.apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

In [17]:
# Create node embedding matrix
node_emb_matrix_nv = create_node_embedding_matrix(model_nv, G.nodes)
node_emb_matrix_heider = create_node_embedding_matrix(model_heider, G.nodes)
# Create edge emb matrix for the train set
edge_emb_train_nv = create_edge_embedding_matrix(model_nv,X_train.values, node_emb_matrix_nv)
edge_emb_train_heider = create_edge_embedding_matrix(model_heider,X_train.values, node_emb_matrix_heider)
# Create edge emb matrix for the test set 
edge_emb_test_nv = create_edge_embedding_matrix(model_nv,X_test.values, node_emb_matrix_nv)
edge_emb_test_heider = create_edge_embedding_matrix(model_heider, X_test.values, node_emb_matrix_heider)



In [20]:
# train the model to predict the edge sign
edge_classifier_nv = LogisticRegression(max_iter=100, random_state=4)
edge_classifier_nv.fit(edge_emb_train_nv, y_train_labels)

edge_classifier_heider = LogisticRegression(max_iter=100, random_state=4)
edge_classifier_heider.fit(edge_emb_train_heider, y_train_labels)


In [21]:
# Evaluate the model
train_accuracy_nv = accuracy_score(y_train_labels, edge_classifier_nv.predict(edge_emb_train_nv))
test_accuracy_nv = accuracy_score(y_test_labels, edge_classifier_nv.predict(edge_emb_test_nv))
train_accuracy_heider = accuracy_score(y_train_labels, edge_classifier_heider.predict(edge_emb_train_heider))
test_accuracy_heider = accuracy_score(y_test_labels, edge_classifier_heider.predict(edge_emb_test_heider))
print("Node2Vec Model Performance:")
print(f"Train Accuracy: {train_accuracy_nv:.4f}")
print(f"Test Accuracy: {test_accuracy_nv:.4f}")
print("\nHeider Model Performance:")
print(f"Train Accuracy: {train_accuracy_heider:.4f}")   
print(f"Test Accuracy: {test_accuracy_heider:.4f}")

Node2Vec Model Performance:
Train Accuracy: 0.9005
Test Accuracy: 0.8972

Heider Model Performance:
Train Accuracy: 0.9006
Test Accuracy: 0.8972
