In [1]:
# Required Libraries
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib as mpl
import matplotlib.patheffects as pe
import matplotlib.pyplot as plt
import seaborn as sns
# from msb import Balance
# from msb.utils import frustration_count, label_clusters

#%matplotlib inline
from gensim.models import Word2Vec

# for network analysis
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# import igraph as ig
# from msb import Balance
# from msb.utils import frustration_count, label_clusters
# for visualization
import matplotlib.pyplot as plt

# For network analysis
# import community

# Paths
ROOT = Path(".").absolute().parent
DATA = ROOT/r"C:\Users\Admin\PhD Projects\ai_heider\notebooks\data"
# DATA = ROOT / r"C:\Users\krishnadas\Projects\PhD Project\ai_heider\notebooks\data"
FIGS = DATA/'figs'
FIGS.mkdir(exist_ok=True)

In [2]:
# import data
df = pd.read_csv(r"DATA\bitcoinotc\soc-sign-bitcoinotc.csv", header=None)
# cols = ["SOURCE","TARGET","RATING","TIME"]
df.rename(columns={0:"Source",1:"Target",2:"Rating",3:"Time"}, inplace=True)
df.head()

Unnamed: 0,Source,Target,Rating,Time
0,6,2,4,1289242000.0
1,6,5,2,1289242000.0
2,1,15,1,1289243000.0
3,4,3,7,1289245000.0
4,13,16,8,1289254000.0


In [3]:
# As for now we are not looking at temporal changes, we can drop the time column
df.drop(columns=["Time"], inplace=True)
df.head()

Unnamed: 0,Source,Target,Rating
0,6,2,4
1,6,5,2
2,1,15,1
3,4,3,7
4,13,16,8


In [4]:
df['Rating'].value_counts()

Rating
 1     20048
 2      5562
 3      2561
-10     2413
 5      1268
 4       967
 10      765
-1       601
 8       277
 6       265
 7       208
-2       182
-5       179
 9       108
-3        91
-8        31
-4        27
-9        20
-7        14
-6         5
Name: count, dtype: int64

In [5]:
target_column = 'Rating'
df_btc_vars = df.drop(columns=[target_column])
df_btc_vars.head()

Unnamed: 0,Source,Target
0,6,2
1,6,5
2,1,15
3,4,3
4,13,16


In [None]:
def get_next_node(G, current, previous, p, q, weight_key:str):
    alphas = []
    neighbors = list(G.neighbors(current))
    for neighbor in neighbors:
        weight = abs(G[current][neighbor][weight_key])
        if neighbor == previous:
            alpha = weight* 1/p
        elif G.has_edge(neighbor, previous):
            alpha = weight
        else:
            alpha = weight * 1/q
        alphas.append(alpha)
    probs = [alpha/sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors,1,p=probs)[0]
    return next

def biased_random_walk(G, start_node, walk_length, p, q, weight_key:str):
    walk = [start_node]
    # for i in range(walk_length-1):
    while len(walk) < walk_length:
        current = walk[-1]
        neighbors = list(G.neighbors(current)) 
        if not neighbors:
            break
        if len(walk)==1:
            # neighbors = list(G.neighbors(current))
            next = np.random.choice(neighbors)
        else:
            previous = walk[-2]
            next = get_next_node(G, current, previous, p, q, weight_key)
        walk.append(next)
    return walk

def simulate_walks(G, num_walks, walk_length, p, q):
    walks = []
    nodes = list(G.nodes)
    for _ in range(num_walks):
        np.random.shuffle(nodes) # Shuffle nodes to ensure randomness
        for node in nodes:
            walk = biased_random_walk(G, node, walk_length, p, q)
            walks.append(walk)
    return walks

In [7]:
G = nx.from_pandas_edgelist(df, source='Source', target='Target',edge_attr='Rating')
nx.number_of_nodes(G)

5881

In [None]:
# get the number of nodes and edges
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print(f"average degree: {2*num_edges/num_nodes:.2f}")

In [None]:
# density of the graph
density = nx.density(G)
print(f"Density of the graph: {density:.4f}")

# network diameter
try:
    diameter = nx.diameter(G)
    print(f"Diameter of the graph: {diameter}")
except nx.NetworkXError:
    print("Graph is not connected")

## Node embedding

In [51]:
# Main Execution

# Parameters
num_walks = 2
walk_length = 150
p = 1.0
q = 0.5
embedding_dim = 64

# Generate walks
walks = simulate_walks(G, num_walks=num_walks, walk_length=walk_length, p=p, q=q)


In [37]:

# Convert nodes to strings (required by gensim)
# walks = [map(str, walk) for walk in walks]
# Train Word2Vec
model = Word2Vec(sentences=walks, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4, epochs=10)


In [None]:
# save the embeddings
# embeddings = {str(node): model.wv[str(node)] for node in G.nodes}
# Save the model
# model.save('bitcoinotc_word2vec.model')

In [None]:
class VecConfig:
    embedding_dim: int = 64
    walk_length: int = 100
    num_walks: int = 2
    window: int = 10
    min_count: int = 1

class NodeEmbedding:
    def __init__(self):
        self.config = VecConfig()
        self.model = None
    



In [38]:
# def fit(walks):
#     if not walks:
#         raise Exception("No walks provided for training the model.")
#     model = Word2Vec(sentences=walks, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4, epochs=10)
#     return model

def save_model(dimension, walk_length, num_walks, model, save_to, prefix):
    d = dimension
    w = walk_length
    n = num_walks
    embedding_filename = f"{prefix}_embeddings_d{d}_w{w}_n{n}.txt"
    model_filename = f"{prefix}_model_d{d}_w{w}_n{n}.pkl"
    model.wv.save_word2vec_format(str(save_to/embedding_filename))
    model.save(str(save_to/model_filename))
    print(f"Model and embeddings saved to {str(save_to)}")


In [39]:
save_model(embedding_dim, walk_length, num_walks, model, DATA, "bitcoinotc")

Model and embeddings saved to C:\Users\Admin\PhD Projects\ai_heider\notebooks\data


In [40]:

def load_model(model_filename, embedding_path):
    if Path(embedding_path/model_filename).exists():
        print(f"Loading model: {model_filename}")
        with Path(embedding_path/model_filename).open('rb') as file:
            model = pickle.load(file)
    else:
        raise FileNotFoundError(f"Model file {model_filename} not found in {embedding_path}")
    return model


In [41]:
# Load the model

model = load_model(f"bitcoinotc_model_d{embedding_dim}_w{walk_length}_n{num_walks}.pkl", DATA)

Loading model: bitcoinotc_model_d64_w150_n1.pkl


In [None]:
with open(DATA/'bitcoinotc_embeddings_d64_w100_n1.txt', 'r') as emb_file:
    embeddings = emb_file.readlines()

Embeddings were written to a txt file with the format:

[
node_count vector_dimension,

first_node_name value1_1 value1_2 ... value1_64

second_node_name value2_1 value2_2 ... value2_64

...

]

So the first element in the list is just information about the content hence not required. 

In [None]:
embeddings[:3]

In [None]:
# align the embeddings with target nodes
def align_embeddings_target(df, embedding_file):
    vectors = []
    with Path(DATA/embedding_file).open('r') as file:
        results = file.readlines()
        for line in results[1:]:
            parts = line.strip().split()
            node = parts[0]
            vector = np.array([float(x) for x in parts[1:]])
            vectors.append((node, vector))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    list(embeddings.values()), list(embeddings.keys()), test_size=0.2, random_state=42
)

## Edge embeddings

In [42]:
# Load the model
model = load_model(f"bitcoinotc_model_d{embedding_dim}_w{walk_length}_n{num_walks}.pkl", DATA)

Loading model: bitcoinotc_model_d64_w150_n1.pkl


In [43]:
model.wv.key_to_index[1532] # gives the index corresponding to the node 1532


5317

In [44]:
for word, index in list(model.wv.key_to_index.items()):
    print(f"Word: {word}, Index: {index}")

Word: 1810, Index: 0
Word: 2125, Index: 1
Word: 35, Index: 2
Word: 4172, Index: 3
Word: 2642, Index: 4
Word: 2028, Index: 5
Word: 905, Index: 6
Word: 3744, Index: 7
Word: 2067, Index: 8
Word: 1, Index: 9
Word: 2266, Index: 10
Word: 1386, Index: 11
Word: 2296, Index: 12
Word: 25, Index: 13
Word: 2388, Index: 14
Word: 1953, Index: 15
Word: 7, Index: 16
Word: 2045, Index: 17
Word: 1018, Index: 18
Word: 1383, Index: 19
Word: 2934, Index: 20
Word: 1334, Index: 21
Word: 3897, Index: 22
Word: 832, Index: 23
Word: 13, Index: 24
Word: 3988, Index: 25
Word: 257, Index: 26
Word: 4559, Index: 27
Word: 1352, Index: 28
Word: 4291, Index: 29
Word: 4197, Index: 30
Word: 1565, Index: 31
Word: 1396, Index: 32
Word: 135, Index: 33
Word: 2691, Index: 34
Word: 1201, Index: 35
Word: 2351, Index: 36
Word: 1363, Index: 37
Word: 2017, Index: 38
Word: 3790, Index: 39
Word: 3791, Index: 40
Word: 2498, Index: 41
Word: 3789, Index: 42
Word: 3795, Index: 43
Word: 3793, Index: 44
Word: 3756, Index: 45
Word: 4635, In

In [None]:
# create node embedding matrix
def create_node_embedding_matrix(model, nodes):
    node_emb_matrix = []
    for node in nodes:
        if node in model.wv:
            node_emb_matrix.append(model.wv[node])
        else:
            # If the node is not in the model, use a zero vector
            node_emb_matrix.append(np.zeros(model.vector_size))
    return np.array(node_emb_matrix)

# create edge embedding matrix
# Edge embedding for (v1, v2) = hadamard product of node embeddings for v1, v2
def create_edge_embedding_matrix(edges_list, emb_matrix):
    embs = []
    for edge in edges_list:
        source, target = edge
        if source in model.wv and target in model.wv:
            source_emb = emb_matrix[model.wv.key_to_index[source]]
            target_emb = emb_matrix[model.wv.key_to_index[target]]
            edge_emb = np.multiply(source_emb, target_emb) # hadamard product
            embs.append(edge_emb)
    embs = np.array(embs)
    return embs

In [46]:
# create a train and test set for edges 
test_size = 0.2
# Split the edges into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Rating']), df['Rating'], 
                                                    test_size=test_size, random_state=42)

# Create train-set edge labels: > 0 = 1, <0 = -1, = 0 = 0
y_train_labels = y_train.apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
# Create test-set edge labels
y_test_labels = y_test.apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

In [47]:
# Create node embedding matrix
node_embedding_matrix = create_node_embedding_matrix(model, G.nodes)
# Create edge embedding matrix for the train set
edge_embedding_train = create_edge_embedding_matrix(X_train.values, node_embedding_matrix)
# Create edge embedding matrix for the test set 
edge_embedding_test = create_edge_embedding_matrix(X_test.values, node_embedding_matrix)


## Train the model

In [48]:
# train the model to predict the edge sign
edge_classifier = LogisticRegression(max_iter=100, random_state=4)
edge_classifier.fit(edge_embedding_train, y_train_labels)


In [50]:
# Evaluate the model
train_accuracy = accuracy_score(y_train_labels, edge_classifier.predict(edge_embedding_train))
test_accuracy = accuracy_score(y_test_labels, edge_classifier.predict(edge_embedding_test))
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Train Accuracy: 0.9006
Test Accuracy: 0.8972


In [None]:
# plot the network G
def plot_network(G, pos=None, node_size=50, edge_color='gray', with_labels=False):
    if pos is None:
        pos = nx.spring_layout(G)
    plt.figure(figsize=(12, 12))
    nx.draw_networkx(G, pos, node_size=node_size, edge_color=edge_color, with_labels=with_labels)
    plt.title("Network Graph")
    plt.axis('off')
    plt.show()
# Plot the network
plot_network(G, with_labels=False, node_size=10, edge_color='gray')

In [None]:
# visulaize the graph for the entire set. #
# show positive edges in blue, negative edges in red, and predicted edges in dashed green
def visualize_graph(G, pos=None, node_size=50, edge_color='gray', with_labels=False, 
                        positive_edges=None, negative_edges=None, predicted_edges=None):
    if pos is None:
        pos = nx.spring_layout(G)
    plt.figure(figsize=(10, 10))
    
    # Draw the main graph
    nx.draw_networkx(G, pos, node_size=node_size, edge_color=edge_color, with_labels=with_labels)
    
    # Draw positive edges
    if positive_edges:
        nx.draw_networkx_edges(G, pos, edgelist=positive_edges, edge_color='blue', width=2)
    
    # Draw negative edges
    if negative_edges:
        nx.draw_networkx_edges(G, pos, edgelist=negative_edges, edge_color='red', width=2)
    
    # Draw predicted edges
    if predicted_edges:
        nx.draw_networkx_edges(G, pos, edgelist=predicted_edges, edge_color='green', style='dashed', width=2)
    
    plt.title("Network Graph with Edge Types")
    plt.axis('off')
    plt.show()
# Prepare edge lists for visualization
positive_edges = [(u, v) for u, v, d in G.edges(data=True) if d['Rating'] > 0]
negative_edges = [(u, v) for u, v, d in G.edges(data=True) if d['Rating'] < 0]
predicted_edges = [(u, v) for u, v in zip(X_test['Source'], X_test['Target']) if edge_classifier.predict([create_edge_embedding_matrix([(u, v)], node_embedding_matrix)[0]])[0] == 1]
# or edge_classifier.predict([create_edge_embedding_matrix([(u, v)], node_embedding_matrix)[0]])[0] == -1]
# Visualize the graph with edge types
visualize_graph(G, positive_edges=positive_edges, negative_edges=negative_edges, predicted_edges=predicted_edges, 
                node_size=10, edge_color='gray', with_labels=False)


