In [11]:
import pandas as pd
import networkx as nx

# Load the dataset directly from a CSV file
file_path = 'soc-sign-bitcoinalpha.csv'
data = pd.read_csv(file_path, header=None, names=['source', 'target', 'rating', 'time'])

# Display the first few rows of the data to understand its structure
print("Initial data preview:")
print(data.head())

# Check for any missing values in the dataset
print("\nMissing values in each column:")
print(data.isnull().sum())

# If there are missing values, you might choose to drop them or fill them
# For example, if missing values are few, you could drop them
# data.dropna(inplace=True)

# Alternatively, if 'rating' or 'time' have missing values, decide on a filling strategy
# For example:
# data['rating'].fillna(data['rating'].mean(), inplace=True)
# data['time'].fillna(method='ffill', inplace=True)

# Check data types and convert them if necessary
print("\nData types before conversion:")
print(data.dtypes)

data['time'] = pd.to_datetime(data['time'], unit='s')  # Assuming 'time' is a UNIX timestamp
print("\nData types after conversion:")
print(data.dtypes)

# Summary statistics for numerical columns
print("\nSummary statistics for numerical columns:")
print(data.describe())

# Creating a network graph from the DataFrame
G = nx.from_pandas_edgelist(data, 'source', 'target', edge_attr=True)

# Basic network statistics
print("\nNetwork statistics:")
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

# Save the cleaned data back to a CSV, if no missing values or after handling them
output_file_path = 'cleaned_soc-sign-bitcoinalpha.csv'
data.to_csv(output_file_path, index=False)

print("Processed data has been saved to CSV.")


Initial data preview:
   source  target  rating        time
0    7188       1      10  1407470400
1     430       1      10  1376539200
2    3134       1      10  1369713600
3    3026       1      10  1350014400
4    3010       1      10  1347854400

Missing values in each column:
source    0
target    0
rating    0
time      0
dtype: int64

Data types before conversion:
source    int64
target    int64
rating    int64
time      int64
dtype: object

Data types after conversion:
source             int64
target             int64
rating             int64
time      datetime64[ns]
dtype: object

Summary statistics for numerical columns:
             source        target        rating                           time
count  24186.000000  24186.000000  24186.000000                          24186
mean     864.029314   1051.093815      1.463946  2012-09-08 10:04:14.825105408
min        1.000000      1.000000    -10.000000            2010-11-08 05:00:00
25%       58.000000     66.000000      1.0000

In [10]:
import pandas as pd
import networkx as nx
from networkx.algorithms.link_prediction import jaccard_coefficient

# Load the dataset
file_path = 'cleaned_soc-sign-bitcoinalpha.csv'
data = pd.read_csv(file_path, header=None, names=['source', 'target', 'rating', 'time'])

# Convert the DataFrame to a graph
G = nx.from_pandas_edgelist(data, 'source', 'target', create_using=nx.Graph())

# Split data into train and test sets
# Here, we randomly remove 10% of edges for testing
test_ratio = 0.1
edges = list(G.edges())
num_test = int(len(edges) * test_ratio)

# Make sure the graph remains connected
test_edges = []
for i in range(num_test):
    edge = edges[i]
    G.remove_edge(*edge)
    if nx.is_connected(G):
        test_edges.append(edge)
    else:
        G.add_edge(*edge)

# Link prediction using Jaccard Coefficient
pred_jaccard = list(jaccard_coefficient(G))
pred_jaccard.sort(key=lambda x: x[2], reverse=True)  # Sort by score

# Evaluate the model: Check how many of the top ranked predictions are actually in the test set
top_k = 100
hits = sum(1 for u, v, p in pred_jaccard[:top_k] if (u, v) in test_edges or (v, u) in test_edges)
print(f"Accuracy of the top {top_k} predictions: {hits/top_k:.2%}")

# Save predictions to a file
predictions_df = pd.DataFrame(pred_jaccard, columns=['source', 'target', 'score'])
predictions_df.to_csv('link_predictions.csv', index=False)

print("Link prediction completed and results saved.")


Accuracy of the top 100 predictions: 0.00%
Link prediction completed and results saved.


In [12]:
import pandas as pd
import networkx as nx
from networkx.algorithms.link_prediction import jaccard_coefficient

# Load the dataset
file_path = 'cleaned_soc-sign-bitcoinalpha.csv'
data = pd.read_csv(file_path, header=None, names=['source', 'target', 'rating', 'time'])

# Convert the DataFrame to a graph
G = nx.from_pandas_edgelist(data, 'source', 'target', create_using=nx.Graph())

# Example of how to handle splits, assuming the script does this
# For the sake of demonstration, let's randomly remove 10% of edges for testing
import random
random.seed(42)  # For reproducibility
edges = list(G.edges())
random.shuffle(edges)

test_ratio = 0.1
num_test = int(len(edges) * test_ratio)
train_edges = edges[num_test:]
test_edges = edges[:num_test]

# Remove test edges from graph
G_train = G.copy()
G_train.remove_edges_from(test_edges)

# Ensure the graph remains connected; this example doesn't handle this but should be considered in a real scenario
# Calculate Jaccard Coefficients on the training graph
predictions = list(jaccard_coefficient(G_train))
predictions.sort(key=lambda x: x[2], reverse=True)  # Sort by Jaccard scores

# Evaluate predictions
# Let's consider the top-k predictions and see how many are actually in the test set
top_k = 100
hits = 0
for u, v, p in predictions[:top_k]:
    if (u, v) in test_edges or (v, u) in test_edges:
        hits += 1

print(f"Accuracy of the top {top_k} predictions: {hits/top_k:.2%}")

# Save the predictions to a CSV file
predictions_df = pd.DataFrame(predictions, columns=['source', 'target', 'score'])
predictions_output_path = '/mnt/data/jaccard_predictions.csv'
predictions_df.to_csv(predictions_output_path, index=False)

print("Link prediction completed and results saved to:", predictions_output_path)


Accuracy of the top 100 predictions: 0.00%


OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [14]:
pip install dgl

Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0mm0:01[0mm
Installing collected packages: dgl
Successfully installed dgl-2.1.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import itertools
from sklearn.metrics import roc_auc_score
import scipy.sparse as sp
import dgl.function as fn


# Load the BitcoinAlpha dataset
file_path = 'soc-sign-bitcoinalpha.csv'
data = pd.read_csv(file_path, header=None, names=['source', 'target', 'rating', 'time'])

# Create a graph
edges = data[['source', 'target']].to_numpy()
g = dgl.graph((edges[:, 0], edges[:, 1]))
g = dgl.add_self_loop(g)  # Optionally add self-loops

# Since the dataset may not have node features, we create synthetic features (e.g., one-hot encoded)
num_nodes = g.number_of_nodes()
g.ndata['feat'] = torch.eye(num_nodes)

# Define the GraphSAGE model
from dgl.nn import SAGEConv

class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

# Dot product predictor
class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return g.edata['score'][:, 0]

# Prepare training and testing sets
u, v = g.edges()
eids = np.random.permutation(g.number_of_edges())
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]

# Negative sampling
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]

# Training loop
model = GraphSAGE(num_nodes, 16)  # num_nodes is used as in_feats for synthetic features
pred = DotPredictor()
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

for e in range(100):
    h = model(g, g.ndata['feat'])
    pos_score = pred(g, h)  # Predict on full graph for simplicity
    neg_score = pred(g, h)  # Same here, simplify for example purposes
    loss = F.binary_cross_entropy_with_logits(torch.cat([pos_score, neg_score]),
                                              torch.cat([torch.ones(pos_score.shape[0]),
                                                         torch.zeros(neg_score.shape[0])]))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if e % 5 == 0:
        print(f'Epoch {e}, Loss: {loss.item()}')

# Evaluation (simplified, just to demonstrate)
with torch.no_grad():
    pos_score = pred(g, h)
    neg_score = pred(g, h)
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = np.concatenate([np.ones(pos_score.shape[0]), np.zeros(neg_score.shape[0])])
    auc = roc_auc_score(labels, scores)
    print('AUC:', auc)


Epoch 0, Loss: 0.73162841796875
Epoch 5, Loss: 0.6965652108192444
Epoch 10, Loss: 0.6936313509941101
Epoch 15, Loss: 0.6932875514030457
Epoch 20, Loss: 0.6933138370513916
Epoch 25, Loss: 0.6933148503303528
Epoch 30, Loss: 0.6932670474052429
Epoch 35, Loss: 0.6932210922241211
Epoch 40, Loss: 0.693195104598999
Epoch 45, Loss: 0.6931819319725037
Epoch 50, Loss: 0.6931743621826172
Epoch 55, Loss: 0.6931692361831665
Epoch 60, Loss: 0.6931655406951904
Epoch 65, Loss: 0.6931628584861755
Epoch 70, Loss: 0.6931607723236084
Epoch 75, Loss: 0.693159282207489
Epoch 80, Loss: 0.6931580305099487
Epoch 85, Loss: 0.6931570768356323
Epoch 90, Loss: 0.6931563019752502
Epoch 95, Loss: 0.6931557655334473
AUC: 0.5


In [17]:
pip install python-igraph


Collecting python-igraph
  Downloading python_igraph-0.11.4-py3-none-any.whl (9.1 kB)
Collecting igraph==0.11.4 (from python-igraph)
  Downloading igraph-0.11.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: igraph, python-igraph
  Attempting uninstall: igraph
    Found existing installation: igraph 0.11.3
    Uninstalling igraph-0.11.3:
      Successfully uninstalled igraph-0.11.3
Successfully installed igraph-0.11.4 python-igraph-0.11.4
Note: you may need to restart the kernel to use updated packages.


ValueError: invalid literal for int() with base 10: '{'