In [48]:
# Required Libraries
from pathlib import Path
import random
import numpy as np
import pandas as pd
import igraph as ig
import networkx as nx
import matplotlib as mpl
import matplotlib.patheffects as pe
import matplotlib.pyplot as plt
import seaborn as sns
# from msb import Balance
# from msb.utils import frustration_count, label_clusters

#%matplotlib inline
from gensim.models import Word2Vec

# for network analysis
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import igraph as ig
# from msb import Balance
# from msb.utils import frustration_count, label_clusters
# for visualization
import matplotlib.pyplot as plt

# For network analysis
import community

# Paths
ROOT = Path(".").absolute().parent
# DATA = ROOT/r"C:\Users\Admin\PhD Projects\ai_heider\notebooks\data"
DATA = ROOT / r"C:\Users\krishnadas\Projects\PhD Project\ai_heider\notebooks\data"
FIGS = DATA/'figs'
FIGS.mkdir(exist_ok=True)

In [10]:
# import data
df = pd.read_csv(r"C:\Users\krishnadas\Projects\PhD Project\ai_heider\notebooks\data\bitcoinotc\soc-sign-bitcoinotc.csv", 
                header=None)
# cols = ["SOURCE","TARGET","RATING","TIME"]
df.rename(columns={0:"Source",1:"Target",2:"Rating",3:"Time"}, inplace=True)
df.head()

Unnamed: 0,Source,Target,Rating,Time
0,6,2,4,1289242000.0
1,6,5,2,1289242000.0
2,1,15,1,1289243000.0
3,4,3,7,1289245000.0
4,13,16,8,1289254000.0


In [11]:
# As for now we are not looking at temporal changes, we can drop the time column
df.drop(columns=["Time"], inplace=True)
df.head()

Unnamed: 0,Source,Target,Rating
0,6,2,4
1,6,5,2
2,1,15,1
3,4,3,7
4,13,16,8


In [15]:
df['Rating'].value_counts()

Rating
 1     20048
 2      5562
 3      2561
-10     2413
 5      1268
 4       967
 10      765
-1       601
 8       277
 6       265
 7       208
-2       182
-5       179
 9       108
-3        91
-8        31
-4        27
-9        20
-7        14
-6         5
Name: count, dtype: int64

In [None]:
def get_next_node(G, current, previous, p, q, weight_key:str):
    alphas = []
    neighbors = list(G.neighbors(current))
    for neighbor in neighbors:
        weight = abs(G[current][neighbor][weight_key])
        if neighbor == previous:
            alpha = weight* 1/p
        elif G.has_edge(neighbor, previous):
            alpha = weight
        else:
            alpha = weight * 1/q
        alphas.append(alpha)
    probs = [alpha/sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors,1,p=probs)[0]
    return next

def biased_random_walk(G, start_node, walk_length, p, q):
    walk = [start_node]
    # for i in range(walk_length-1):
    while len(walk) < walk_length:
        current = walk[-1]
        neighbors = list(G.neighbors(current)) 
        if not neighbors:
            break
        if len(walk)==1:
            # neighbors = list(G.neighbors(current))
            next = np.random.choice(neighbors)
        else:
            previous = walk[-2]
            next = get_next_node(G, current, previous, p, q, 'Rating')
        walk.append(next)
    return walk

def simulate_walks(G, num_walks, walk_length, p, q):
    walks = []
    nodes = list(G.nodes)
    for _ in range(num_walks):
        np.random.shuffle(nodes)
        for node in nodes:
            walk = biased_random_walk(G, node, walk_length, p, q)
            walks.append(walk)
    return walks

In [13]:
G = nx.from_pandas_edgelist(df, source='Source', target='Target',edge_attr='Rating')
nx.number_of_nodes(G)

5881

In [25]:
# get the number of nodes and edges
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
print(f"average degree: {2*num_edges/num_nodes:.2f}")

average degree: 7.31


In [33]:
# density of the graph
density = nx.density(G)
print(f"Density of the graph: {density:.4f}")

# network diameter
try:
    diameter = nx.diameter(G)
    print(f"Diameter of the graph: {diameter}")
except nx.NetworkXError:
    print("Graph is not connected")

Density of the graph: 0.0012
Graph is not connected


In [39]:
# Main Execution

# Parameters
num_walks = 1
walk_length = 100
p = 1.0
q = 0.5
embedding_dim = 64

# Generate walks
walks = simulate_walks(G, num_walks=num_walks, walk_length=walk_length, p=p, q=q)
# Convert nodes to strings (required by gensim)
walks = [[str(node) for node in walk] for walk in walks]
# Train Word2Vec
model = Word2Vec(sentences=walks, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4, epochs=10)


In [40]:
# save the embeddings
# embeddings = {str(node): model.wv[str(node)] for node in G.nodes}
# Save the model
model.save('bitcoinotc_word2vec.model')

In [44]:
# embeddings
# Load the model
model = Word2Vec.load('bitcoinotc_word2vec.model')
# Get embeddings for all nodes
embeddings = {str(node): model.wv[str(node)] for node in G.nodes}
print(f"Embedding array shape: {model.wv.vectors.shape}")

Embedding array shape: (5881, 64)


In [None]:
# PCA_model = sklearn.decomposition.PCA(n_components=2)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    list(embeddings.values()), list(embeddings.keys()), test_size=0.2, random_state=42
)

In [50]:
ML_model = LogisticRegression(max_iter=100, random_state=42)
ML_model.fit(X_train, y_train)
# Predictions
y_pred = ML_model.predict(X_test)
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)
# AUC score
# from sklearn.metrics import roc_auc_score
# auc_score = roc_auc_score(y_test, ML_model.predict_proba(X_test), multi_class='ovr')
# print(f"AUC Score: {auc_score:.4f}")

Accuracy: 0.0000
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
              precision    recall  f1-score   support

         100       0.00      0.00      0.00       1.0
        1001       0.00      0.00      0.00       0.0
        1003       0.00      0.00      0.00       1.0
         101       0.00      0.00      0.00       0.0
        1011       0.00      0.00      0.00       1.0
        1013       0.00      0.00      0.00       0.0
        1016       0.00      0.00      0.00       1.0
        1017       0.00      0.00      0.00       0.0
        1020       0.00      0.00      0.00       1.0
        1028       0.00      0.00      0.00       1.0
        1029       0.00      0.00      0.00       1.0
        1042       0.00      0.00      0.00       0.0
        1044       0.00      0.00      0.00       1.0
        1046       0.00      0.00      0.00       1.0
        105

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
