In [1]:
import ast
import pandas as pd
import numpy as np

# Load the raw tables. nodes.csv is read without a header row; its first
# column becomes the index (used as the node id below) and the remaining
# column is exposed as 'node_attribute'.
nodes_df = pd.read_csv('nodes.csv' , header = None , names=['node_attribute'], index_col=0)
edges_df = pd.read_csv('edges.csv')

# Parse every node's attribute string into a Python literal. The result is
# consumed via .items() below, so each literal is expected to be a mapping.
nodes_list = [
    (node_id, ast.literal_eval(raw_attribute))
    for node_id, raw_attribute in zip(nodes_df.index, nodes_df['node_attribute'])
]

# Iterate the two columns directly instead of DataFrame.iterrows(): iterrows
# builds one Series per row, which is slow and does not preserve per-column
# dtypes (From/To could be upcast if edges.csv carries other columns).
edges_list = list(zip(edges_df['From'], edges_df['To']))

number_of_days = 1448  # length of every per-node vector

# Expand each mapping into a dense float32 vector of length number_of_days:
# every key is used as a position in the vector and its value is stored
# there; positions that never appear as a key stay at 0.
nodes_list_vec = []
for index, dict_node in nodes_list:
    vec = np.zeros(number_of_days, dtype=np.float32)
    for day, count in dict_node.items():
        vec[day] = count

    nodes_list_vec.append((index, vec))

import networkx as nx


# Directed graph: one node per (node_id, vector) pair, with the vector stored
# under the node attribute 'x', plus one edge per (From, To) pair.
Graph = nx.DiGraph()
Graph.add_nodes_from(
    (node_id, {'x': node_attr_vec}) for node_id, node_attr_vec in nodes_list_vec
)
Graph.add_edges_from(edges_list)

print(f"Number of nodes: {Graph.number_of_nodes()}")
print(f"Number of edges: {Graph.number_of_edges()}")


Number of nodes: 6600
Number of edges: 50897


In [2]:
import torch
from torch_geometric.utils import from_networkx

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert the networkx graph into a PyG data object.
data = from_networkx(Graph)

# Build the node-feature matrix explicitly from each node's 'x' attribute
# (one row per node, in Graph.nodes iteration order) and attach it to `data`.
node_feature_rows = [
    torch.tensor(node_attrs['x'], dtype=torch.float32)
    for _node, node_attrs in Graph.nodes(data=True)
]
X = torch.stack(node_feature_rows).to(device)
data.x = X
data = data.to(device)

print(data)
print(f"Data is on device: {data.x.device}")


  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  from .autonotebook import tqdm as notebook_tqdm
  data_dict[key] = torch.as_tensor(value)


Data(x=[6600, 1448], edge_index=[2, 50897])
Data is on device: cuda:0


In [3]:
phase1_score = []  # Phase 1 score per node; position i belongs to the i-th (node_id, vector) pair of the latest phase1() call
def phase1(node_list_vec, start_day = 0 , current_day=1448):
    """Score how unusual each node's value on `current_day` is versus its history.

    For every ``(node_id, vector)`` pair the history window is
    ``vector[start_day:current_day]`` and the score is the z-score
    ``(vector[current_day] - mean(window)) / std(window)``. The score is 0
    when the window is empty or its standard deviation is not positive.

    Args:
        node_list_vec: iterable of ``(node_id, vector)`` pairs; each vector
            must support ``len``, slicing and integer indexing.
        start_day: first index of the history window (inclusive).
        current_day: index of the day being scored; also the exclusive end
            of the history window. It must be a valid index into every
            vector, so the default of 1448 only works for vectors longer
            than 1448 entries.

    Returns:
        A new list with one score per input pair, in input order.

    Raises:
        IndexError: if `current_day` is not a valid index for some vector.
            The module-level `phase1_score` is left untouched in that case.

    Side effects:
        Replaces the contents of the module-level `phase1_score` list (in
        place) with the scores of this call, so repeated calls no longer
        pile up stale entries behind the fresh ones.
    """
    scores = []
    for node_id, node_attr_vec in node_list_vec:
        n_days = len(node_attr_vec)
        if not -n_days <= current_day < n_days:
            raise IndexError(
                f"current_day={current_day} is out of range for node {node_id} "
                f"with {n_days} days"
            )

        history = node_attr_vec[start_day:current_day]
        if len(history) == 0:
            # No history to compare against: same score as the zero-variance
            # case, without numpy's empty-slice RuntimeWarnings.
            scores.append(0)
            continue

        mean = np.mean(history)
        std = np.std(history)
        today_score = (node_attr_vec[current_day] - mean) / std if std > 0 else 0
        scores.append(today_score)

    # Slice assignment keeps the same list object, so existing references to
    # phase1_score observe the new scores.
    phase1_score[:] = scores
    return scores

# Score day index 1447 of every node against its history over day indices 0..1446.
phase1(nodes_list_vec, start_day=0, current_day=1447)   

In [6]:
from torch_geometric.utils import negative_sampling

phase2_score = [] # Intended to hold every node's phase 2 score, with the list index corresponding to the node id; nothing in this cell fills it yet

import torch 
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GraphSAGE(torch.nn.Module):
    """Stack of SAGEConv layers with ReLU and dropout between them.

    The stack is ``in_channels -> hidden_channels``, then ``num_layers - 2``
    layers of ``hidden_channels -> hidden_channels``, then
    ``hidden_channels -> out_channels``. Because the first and last layers
    are always created, any ``num_layers`` below 2 still yields two layers.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: width of the intermediate representations.
        out_channels: size of each output embedding.
        num_layers: requested number of SAGEConv layers.
        dropout: dropout probability applied after every layer except the last.
    """

    def __init__(self, in_channels, hidden_channels, out_channels , num_layers=3 , dropout = 0.3):
        super().__init__()
        self.dropout = dropout

        # Layers are created in input -> hidden -> output order.
        layers = [SAGEConv(in_channels, hidden_channels)]
        layers.extend(
            SAGEConv(hidden_channels, hidden_channels) for _ in range(num_layers - 2)
        )
        layers.append(SAGEConv(hidden_channels, out_channels))
        self.convs = torch.nn.ModuleList(layers)

    def forward(self, x, edge_index):
        """Return the output of the final layer for features `x` and edges `edge_index`."""
        *inner_layers, output_layer = self.convs
        for layer in inner_layers:
            activated = F.relu(layer(x, edge_index))
            # Dropout is only active while the module is in training mode.
            x = F.dropout(activated, p=self.dropout, training=self.training)
        return output_layer(x, edge_index)

# Layer sizes: the input width comes from the data, the other two are fixed here.
in_channels = data.num_node_features
hidden_channels, out_channels = 128, 64

# Build the model on the selected device and optimise all of its parameters with Adam.
model = GraphSAGE(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
).to(device)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)


def train():
    """Run one optimisation step on the module-level model and data.

    Existing edges are treated as positive pairs and an equal number of
    sampled non-edges as negative pairs. A pair is scored by the dot product
    of its two node embeddings; the loss is
    ``-mean(logsigmoid(pos)) - mean(logsigmoid(-neg))``.

    Returns:
        Tuple ``(loss_value, z)``: the loss as a Python float and the
        embeddings computed in this step (before the parameter update).
    """
    def pair_scores(embeddings, edges):
        # Dot product between the embeddings of each edge's two endpoints.
        return (embeddings[edges[0]] * embeddings[edges[1]]).sum(dim=1)

    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)

    pos_edge_index = data.edge_index
    # Draw as many negative pairs as there are positive edges.
    neg_edge_index = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge_index.size(1),
    ).to(device)

    pos_loss = F.logsigmoid(pair_scores(z, pos_edge_index)).mean()
    neg_loss = F.logsigmoid(-pair_scores(z, neg_edge_index)).mean()

    loss = -pos_loss - neg_loss
    loss.backward()
    optimizer.step()

    return loss.item(), z

print(f"Training on device: {next(model.parameters()).device}")

# Train for 200 epochs and report the loss on every 10th one.
for epoch in range(1, 201):
    loss, embeddings = train()
    if not epoch % 10:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Recompute the embeddings in eval mode (dropout inactive) without tracking gradients.
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)

# Persist the final embeddings to disk.
torch.save(z, 'final_embeddings.pt')
print("\nEmbeddings saved to 'final_embeddings.pt'")

Training on device: cuda:0
Epoch: 010, Loss: 3.7006
Epoch: 020, Loss: 1.4405
Epoch: 030, Loss: 1.4223
Epoch: 040, Loss: 1.4160
Epoch: 050, Loss: 1.4053
Epoch: 060, Loss: 1.4091
Epoch: 070, Loss: 1.4090
Epoch: 080, Loss: 1.3777
Epoch: 090, Loss: 1.3536
Epoch: 100, Loss: 1.3382
Epoch: 110, Loss: 1.3326
Epoch: 120, Loss: 1.3286
Epoch: 130, Loss: 1.3271
Epoch: 140, Loss: 1.3299
Epoch: 150, Loss: 1.3212
Epoch: 160, Loss: 1.3275
Epoch: 170, Loss: 1.3209
Epoch: 180, Loss: 1.3226
Epoch: 190, Loss: 1.3174
Epoch: 200, Loss: 1.3388
Final embeddings:

Embeddings shape: torch.Size([6600, 64])
Sample of first 5 nodes:
tensor([[ 9.2174e-02,  1.1157e-01, -1.8774e-02, -1.0982e-01,  1.9369e-02,
          5.2376e-02, -2.6877e-02, -1.5991e-03, -1.9877e-02,  6.9069e-02,
          4.5941e-02, -3.6317e-02,  8.2629e-02,  1.0024e-01, -9.3068e-02,
         -1.8002e-02, -2.1818e-02, -1.0936e-01, -1.0695e-01, -8.0348e-02,
         -1.1923e-01,  1.2103e-01,  2.2726e-02, -7.2719e-03, -1.0334e-02,
          1.1574e-

In [48]:
import pandas as pd 
import ast
from collections import defaultdict

def historical_pattern(current_day = 200, csv_file = 'node_day_recipients.csv'):
    """Print and return how often each of today's recipients shows up in a node's history.

    The CSV must provide a ``node_id`` column and a ``day_recipients_str``
    column whose cells are Python literals; each parsed cell is indexed by
    day and yields the recipients of that day (recipients are assumed to be
    hashable, e.g. integer ids).

    For every node that has at least one recipient on `current_day`, all
    recipient entries of days ``0..current_day`` (the current day included)
    are tallied. For each recipient of `current_day`:

    * ``score_recipent`` is the number of tallied entries equal to it;
    * ``score_not_recipent`` is the number of tallied entries that differ.

    Args:
        current_day: index of the day whose recipients are scored.
        csv_file: path of the CSV file to read.

    Returns:
        A list with one ``(node_id, scores)`` tuple per reported node, in
        file order, where ``scores`` is a list of
        ``(recipient, score_recipent, score_not_recipent)`` tuples in the
        order the recipients appear on `current_day`.
    """
    # Local import: the surrounding cell only imports defaultdict.
    from collections import Counter

    df = pd.read_csv(csv_file)
    parsed_recipients = df["day_recipients_str"].apply(ast.literal_eval)

    results = []
    for node, day_recipients in zip(df['node_id'], parsed_recipients):
        # Skip nodes without data for current_day or without recipients on it.
        if current_day >= len(day_recipients) or len(day_recipients[current_day]) == 0:
            continue

        recipients_today = day_recipients[current_day]

        # One pass over the history replaces the pairwise comparison of every
        # past entry with every current recipient. The generator ensures each
        # entry is counted as an element, whatever container holds a day.
        occurrences = Counter(
            past_recipient
            for past_day in range(current_day + 1)
            for past_recipient in day_recipients[past_day]
        )
        total_occurrences = sum(occurrences.values())

        node_scores = [
            (recipient, occurrences[recipient], total_occurrences - occurrences[recipient])
            for recipient in recipients_today
        ]

        print(f"Node {node}:")
        for recipient, score_recipent, score_not_recipent in node_scores:
            print(f"  Recipient {recipient}: score_recipent={score_recipent}, score_not_recipent={score_not_recipent}")

        results.append((node, node_scores))

    return results
            
       

# Run the analysis with the function's default current_day (200).
historical_pattern()

Node 847:
  Recipient 5508: score_recipent=2, score_not_recipent=0
Node 1311:
  Recipient 2761: score_recipent=3, score_not_recipent=19
  Recipient 5508: score_recipent=10, score_not_recipent=12
Node 1847:
  Recipient 679: score_recipent=1, score_not_recipent=44
  Recipient 1069: score_recipent=3, score_not_recipent=42
  Recipient 1460: score_recipent=3, score_not_recipent=42
  Recipient 3042: score_recipent=7, score_not_recipent=38
  Recipient 3454: score_recipent=2, score_not_recipent=43
  Recipient 4043: score_recipent=2, score_not_recipent=43
  Recipient 5355: score_recipent=2, score_not_recipent=43
Node 2208:
  Recipient 2275: score_recipent=1, score_not_recipent=96
  Recipient 2525: score_recipent=1, score_not_recipent=96
  Recipient 3401: score_recipent=1, score_not_recipent=96
  Recipient 3798: score_recipent=6, score_not_recipent=91
Node 2810:
  Recipient 1327: score_recipent=2, score_not_recipent=12
Node 3862:
  Recipient 1789: score_recipent=3, score_not_recipent=53
  Recipi