In [11]:
import ast
import pandas as pd
import numpy as np

# Load the node table (first CSV column becomes the index, the second holds a
# Python-literal string) and the edge table.
nodes_df = pd.read_csv('nodes.csv', header=None, names=['node_attribute'], index_col=0)
edges_df = pd.read_csv('edges.csv')

# (node index, parsed attribute dict) for every row of the node table.
nodes_list = []
for index, row in nodes_df.iterrows():
    nodes_list.append((index, ast.literal_eval(row.node_attribute)))

# (source, target) pairs taken from the `From` / `To` columns.
edges_list = []
for index, row in edges_df.iterrows():
    edges_list.append((row.From, row.To))

# Expand each sparse {day: count} dict into a dense float32 vector with one
# slot per day; days that are absent from the dict stay at 0.
number_of_days = 1448
nodes_list_vec = []
for index, dict_node in nodes_list:
    vec = np.zeros(number_of_days, dtype=np.float32)
    for day in dict_node:
        vec[day] = dict_node[day]
    nodes_list_vec.append((index, vec))

import networkx as nx


# Directed graph: every node carries its dense per-day vector as attribute `x`.
Graph = nx.DiGraph()
Graph.add_nodes_from(
    (node_id, {'x': node_attr_vec}) for node_id, node_attr_vec in nodes_list_vec
)
Graph.add_edges_from(edges_list)

print(f"Number of nodes: {nx.number_of_nodes(Graph)}")
print(f"Number of edges: {nx.number_of_edges(Graph)}")



Number of nodes: 6600
Number of edges: 50897


In [12]:
import torch
from torch_geometric.utils import from_networkx

# Convert to PyG data
# Build the PyG `Data` object from the networkx graph.
data = from_networkx(Graph)

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Feature matrix: one row per node, in `Graph.nodes` iteration order, taken
# from each node's `x` attribute.
node_features = [
    torch.tensor(attr['x'], dtype=torch.float32)
    for _, attr in Graph.nodes(data=True)
]
X = torch.stack(node_features).to(device)

data.x = X
data = data.to(device)

print(data)
print(f"Data is on device: {data.x.device}")


Data(x=[6600, 1448], edge_index=[2, 50897])
Data is on device: cuda:0


In [13]:
phase1_score = []  #Holds every node's phase 1 score , index number corresponds to node id
def phase1(node_list_vec, start_day = 0 , current_day=1448):
    """Fill ``phase1_score`` with a z-score of ``current_day`` for every node.

    For each ``(node_id, vector)`` pair the mean and standard deviation of
    ``vector[start_day:current_day]`` (``current_day`` itself excluded) are
    computed, and the score is ``(vector[current_day] - mean) / std``.  When
    the standard deviation is not positive (constant or empty window) the
    score is ``0``.

    Args:
        node_list_vec: iterable of ``(node_id, vector)`` pairs; scores are
            stored in the iteration order of this argument.
        start_day: first index of the history window.
        current_day: index being scored.  It must be a valid index into each
            vector, otherwise indexing raises ``IndexError``; for the
            1448-slot vectors built in this notebook the default of 1448 is
            out of range, so callers pass an explicit value.

    Returns:
        None.  Results are written to the module-level ``phase1_score`` list.
    """
    # Rebuild the shared list in place: appending without clearing would make a
    # second call stack a new set of scores after the old ones, so positions
    # would no longer line up with nodes.
    phase1_score.clear()

    for _, node_attr_vec in node_list_vec:
        history = node_attr_vec[start_day:current_day]

        # An empty window has no statistics; score it 0 directly instead of
        # going through numpy's NaN-producing (and warning-emitting) mean/std.
        if len(history) == 0:
            phase1_score.append(0)
            continue

        mean = np.mean(history)
        std = np.std(history)
        today_score = (node_attr_vec[current_day] - mean) / std if std > 0 else 0
        phase1_score.append(today_score)

# Score day index 1447 against the window [0, 1447); 1447 is the last slot of
# the 1448-element vectors built above.
phase1(nodes_list_vec, start_day=0, current_day=1447)

In [17]:
from torch_geometric.utils import negative_sampling

# One slot per loaded node (sized from `nodes_list`, like `phase3_score`,
# instead of a hard-coded node count).
phase2_score = [0] * len(nodes_list) #Holds every node's phase 2 score , index number corresponds to node id


import torch 
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Select the compute device for the model: CUDA when available, else CPU.
# This is the same expression used earlier when `data` was moved to the device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GraphSAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers producing one embedding per node.

    The first layer maps ``in_channels -> hidden_channels``, the last maps
    ``hidden_channels -> out_channels``, and ``num_layers - 2`` layers of
    ``hidden_channels -> hidden_channels`` sit in between (none when
    ``num_layers`` is 2 or less, so the stack always has at least two layers).
    Every layer except the last is followed by ReLU and dropout with
    probability ``dropout`` (active only in training mode).
    """

    def __init__(self, in_channels, hidden_channels, out_channels , num_layers=3 , dropout = 0.3):
        super().__init__()
        self.dropout = dropout

        # Channel width at every layer boundary; consecutive pairs are the
        # (input, output) sizes of the successive conv layers.
        middle_layers = max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * (middle_layers + 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [SAGEConv(width_in, width_out) for width_in, width_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x, edge_index):
        """Run ``x`` through all conv layers over ``edge_index``; return the last layer's output."""
        final_idx = len(self.convs) - 1
        for layer_idx, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if layer_idx < final_idx:
                # Non-linearity and dropout on every layer but the output one.
                x = F.dropout(F.relu(x), p=self.dropout, training=self.training)
        return x

# Layer widths: input width comes from the data, the other two are fixed here.
in_channels = data.num_node_features
hidden_channels = 128
out_channels = 64

# Model (moved to the selected device) and its Adam optimizer.
model = GraphSAGE(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    out_channels=out_channels,
)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)


def train():
    """Run one full-graph optimisation step of the link-prediction objective.

    Uses the module-level ``model``, ``optimizer``, ``data`` and ``device``.
    Every edge in ``data.edge_index`` is a positive example; an equal number
    of negative node pairs is drawn with ``negative_sampling``.  The loss is
    ``-mean(logsigmoid(z_u . z_v))`` over positives plus
    ``-mean(logsigmoid(-(z_u . z_v)))`` over negatives, where ``z`` are the
    embeddings produced by the model in training mode.

    Returns:
        tuple: ``(loss, z)`` with the loss as a Python float and the
        embeddings computed during this step, detached from the autograd
        graph.
    """
    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)

    pos_edge_index = data.edge_index
    neg_edge_index = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge_index.size(1),
    ).to(device)

    # Dot product of the two endpoint embeddings of each pair.
    pos_similarity = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1)
    pos_loss = F.logsigmoid(pos_similarity).mean()

    neg_similarity = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1)
    neg_loss = F.logsigmoid(-neg_similarity).mean()

    loss = -pos_loss - neg_loss
    loss.backward()
    optimizer.step()

    # Hand back a detached tensor: returning `z` itself would keep this step's
    # whole autograd graph alive for as long as the caller holds the result.
    return loss.item(), z.detach()

print(f"Training on device: {next(model.parameters()).device}")

# Epochs are numbered from 1, so the range has to stop at NUM_EPOCHS + 1 for
# the last epoch to actually run (and for its multiple-of-10 log line to show).
NUM_EPOCHS = 500
for epoch in range(1, NUM_EPOCHS + 1):
    loss, embeddings = train()
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Final embeddings: evaluation mode (dropout off) and no gradient tracking.
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)

# Save a CPU copy so the file can be loaded on a machine without the training
# device; the in-memory `z` used by later cells stays where it is.
torch.save(z.cpu(), 'final_embeddings.pt')
print("\nEmbeddings saved to 'final_embeddings.pt'")

Training on device: cuda:0
Epoch: 010, Loss: 2.6077
Epoch: 020, Loss: 1.5511
Epoch: 030, Loss: 1.4200
Epoch: 040, Loss: 1.3939
Epoch: 050, Loss: 1.3875
Epoch: 060, Loss: 1.3723
Epoch: 070, Loss: 1.3709
Epoch: 080, Loss: 1.3508
Epoch: 090, Loss: 1.3438
Epoch: 100, Loss: 1.3245
Epoch: 110, Loss: 1.3202
Epoch: 120, Loss: 1.3172
Epoch: 130, Loss: 1.3099
Epoch: 140, Loss: 1.3068
Epoch: 150, Loss: 1.2967
Epoch: 160, Loss: 1.2927
Epoch: 170, Loss: 1.2952
Epoch: 180, Loss: 1.2885
Epoch: 190, Loss: 1.2824
Epoch: 200, Loss: 1.2759
Epoch: 210, Loss: 1.2751
Epoch: 220, Loss: 1.2618
Epoch: 230, Loss: 1.2700
Epoch: 240, Loss: 1.2616
Epoch: 250, Loss: 1.2458
Epoch: 260, Loss: 1.2409
Epoch: 270, Loss: 1.2272
Epoch: 280, Loss: 1.2277
Epoch: 290, Loss: 1.2083
Epoch: 300, Loss: 1.2135
Epoch: 310, Loss: 1.2124
Epoch: 320, Loss: 1.1873
Epoch: 330, Loss: 1.2032
Epoch: 340, Loss: 1.1864
Epoch: 350, Loss: 1.1930
Epoch: 360, Loss: 1.1932
Epoch: 370, Loss: 1.1684
Epoch: 380, Loss: 1.1559
Epoch: 390, Loss: 1.173

In [None]:
import pandas as pd 
import ast
from collections import defaultdict

def historical_pattern(current_day =  220, weight_distribution=0.3):
    """Score every recipient that each node contacted on ``current_day``.

    Reads ``node_day_recipients.csv`` (columns ``node_id`` and
    ``day_recipients_str``; the latter is parsed with ``ast.literal_eval``
    into a per-day sequence of recipient lists) and stores, for every node
    with at least one recipient on ``current_day``, a list of
    ``(recipient, final_score)`` tuples in the module-level
    ``phase2_score[node]``.  All other slots are reset to ``0`` first, so the
    list only ever describes the day of the most recent call.

    For one recipient of the day:

    * ``historical_score`` is the number of times that recipient appears in
      the node's recipient lists for days ``0..current_day`` (today included)
      divided by the number of appearances of every other recipient in the
      same window, or ``1.0`` when there are no other recipients.
    * If ``historical_score == 1.0`` the final score is ``1.0``.
    * Otherwise the final score is
      ``weight_distribution * historical_score
      + (1 - weight_distribution) * (1 - cos)``, clamped to ``[0, 1]``, where
      ``cos`` is the cosine similarity of ``z[node]`` and ``z[recipient]``
      rescaled from ``[-1, 1]`` to ``[0, 1]``.

    Uses the module-level ``z`` (embeddings), ``F`` and ``phase2_score``.
    NOTE(review): ``z`` is indexed directly with node ids, which assumes row
    ``i`` of ``z`` belongs to node id ``i`` - confirm against how the graph
    was converted.

    Args:
        current_day: index of the day to score.
        weight_distribution: weight of the historical score in the blend.

    Returns:
        None.
    """
    csv_file = 'node_day_recipients.csv'

    df = pd.read_csv(csv_file)
    df["day_recipients_str"] = df["day_recipients_str"].apply(ast.literal_eval)

    # Forget results of any earlier call: nodes that are inactive on this day
    # must not keep another day's scores (get_phase2_nodes treats every
    # non-empty slot as "scored").
    phase2_score[:] = [0] * len(phase2_score)

    for node, day_recipients in zip(df['node_id'], df['day_recipients_str']):
        # Skip nodes with no data for this day or with nothing sent on it.
        if current_day >= len(day_recipients) or len(day_recipients[current_day]) == 0:
            continue

        recipients_today = day_recipients[current_day]

        # One pass over the history (days 0..current_day): how often each
        # recipient appears, and how many appearances there are in total.
        # The count of "other" recipients for r is then total - count[r],
        # which is what comparing every past entry against r would yield.
        occurrences = defaultdict(int)
        total_occurrences = 0
        for past_day in range(current_day + 1):
            for past_recipient in day_recipients[past_day]:
                occurrences[past_recipient] += 1
                total_occurrences += 1

        day_scores = []
        for recipient in recipients_today:
            matches = occurrences.get(recipient, 0)
            mismatches = total_occurrences - matches

            if mismatches == 0:
                historical_score = 1.0
            else:
                historical_score = matches / mismatches

            # A historical score of exactly 1 is kept as is.
            if historical_score == 1.0:
                final_score = 1.0
            else:
                try:
                    # Cosine similarity between the node's and the
                    # recipient's embeddings.
                    cos = F.cosine_similarity(
                        z[node].unsqueeze(0),
                        z[recipient].unsqueeze(0)
                    ).item()

                    cos = (cos + 1) / 2  # Normalize [-1,1] to [0,1]

                    # Blend the historical score with embedding dissimilarity.
                    final_score = (weight_distribution * historical_score +
                                   (1 - weight_distribution) * (1 - cos))

                    # Ensure score is between 0 and 1
                    final_score = max(0.0, min(1.0, final_score))

                except (IndexError, ValueError) as e:
                    # Fall back to the historical score alone if the
                    # embeddings cannot be looked up or compared.
                    print(f"Warning: Error calculating cosine similarity for nodes {node} and {recipient}: {e}")
                    final_score = historical_score

            day_scores.append((recipient, final_score))

        # Store in phase2_score at the node's index position
        phase2_score[node] = day_scores

# Call the function for a single day
historical_pattern(current_day=930, weight_distribution=0.3)

# Show only the nodes that received a phase 2 score for that day.
for i in range(len(nodes_list)):
    if not phase2_score[i]:
        continue
    print(f"Node {i} - Phase 2 Score: {phase2_score[i]}")


# Helper used by phase 3: list the node ids that currently hold a phase 2 score
def get_phase2_nodes():
    """Return, in ascending order, the positions of ``phase2_score`` whose entry is truthy."""
    return [node_id for node_id, entry in enumerate(phase2_score) if entry]

5
9
160
217
402
412
432
609
700
737
839
855
868
895
910
922
1014
1049
1341
1453
1454
1540
1576
1588
1601
1648
1661
1673
1705
1843
1847
1905
1922
1924
1932
2065
2074
2208
2229
2231
2278
2289
2290
2291
2322
2384
2512
2572
2671
2705
2719
2786
2866
3039
3060
3078
3107
3118
3197
3257
3296
3424
3440
3583
3608
3725
3747
3759
3774
3777
3803
3839
3848
3896
3994
4013
4021
4082
4270
4312
4360
4361
4382
4390
4411
4412
4473
4515
4577
4689
4850
4913
4915
5133
5166
5213
5214
5276
5301
5363
5547
5595
5603
5635
5811
5850
5897
5910
5982
5989
6179
6226
6337
6385
6422
Node 5 - Phase 2 Score: [(2719, 1.0)]
Node 9 - Phase 2 Score: [(2719, 1.0)]
Node 160 - Phase 2 Score: [(2186, 0.03728409347947964), (3456, 0.04761391518529781), (4108, 0.041304190158844)]
Node 217 - Phase 2 Score: [(2208, 0.9377031743526458)]
Node 402 - Phase 2 Score: [(3028, 1.0)]
Node 412 - Phase 2 Score: [(412, 0.0024291706591965216), (1244, 0.06416020317922665), (1404, 0.08724431532328245), (1455, 0.08183941923770584), (1787, 0.222014700

In [None]:
import community.community_louvain as community_louvain
from collections import defaultdict

# Phase 3 score per node, indexed by node id; filled in place by
# phase3_community_normalization.
phase3_score = [0] * len(nodes_list)

def phase3_community_normalization(Graph, random_state=None):
    """Score each phase-2 node by how many other phase-2 nodes share its community.

    The graph is made undirected and partitioned with Louvain
    (``community_louvain.best_partition``).  For every node returned by
    ``get_phase2_nodes()`` the score written to ``phase3_score[node]`` is the
    number of *other* such nodes in the same community, or ``0`` when the
    node is missing from the partition.  Every slot is reset to ``0`` first,
    so scores from an earlier call never linger.

    Args:
        Graph: networkx graph to partition.
        random_state: optional seed forwarded to ``best_partition`` to make
            the partition repeatable.  When ``None`` (default) the argument
            is not passed at all, so the call is identical to the unseeded
            one.

    Returns:
        tuple: ``(phase3_score, partition)`` - the module-level score list
        and the node -> community-id mapping.
    """
    # Start from a clean slate; the list is mutated in place so existing
    # references to it stay valid.
    phase3_score[:] = [0] * len(phase3_score)

    # Step 1: Undirected graph
    undirected_graph = Graph.to_undirected()

    # Step 2: Louvain partition (seeded only on request)
    louvain_kwargs = {} if random_state is None else {"random_state": random_state}
    partition = community_louvain.best_partition(undirected_graph, **louvain_kwargs)

    # Step 3: Nodes to evaluate - those that hold a phase 2 score
    nodes_to_eval = set(get_phase2_nodes())

    # Step 4: Count evaluated nodes per community (-1 = not in the partition)
    community_count = defaultdict(int)
    for node in nodes_to_eval:
        community_count[partition.get(node, -1)] += 1

    # Step 5: Score = number of other evaluated nodes in the same community
    for node in nodes_to_eval:
        community_id = partition.get(node, -1)
        if community_id != -1:
            phase3_score[node] = community_count[community_id] - 1
        else:
            phase3_score[node] = 0  # if not found in partition

    return phase3_score, partition

# Bind the result: a bare call as the cell's last expression would echo the
# whole (score list, partition dict) tuple as cell output, and the partition
# would be lost.
phase3_score, partition = phase3_community_normalization(Graph)

# for i in range(len(nodes_list)):
#     if(phase3_score[i]):
#         print(f"Node {i} - Phase 3 Score: {phase3_score[i]}")






Node 5 - Phase 2 Score: {930: [(2719, 1.0)]}
Node 9 - Phase 2 Score: {930: [(2719, 1.0)]}
Node 160 - Phase 2 Score: {930: [(2186, 0.030809416751231045), (3456, 0.03557901022848019), (4108, 0.03346597671508789)]}
Node 217 - Phase 2 Score: {930: [(2208, 0.9032444834709167)]}
Node 402 - Phase 2 Score: {930: [(3028, 1.0)]}
Node 412 - Phase 2 Score: {930: [(412, 0.0024291497975708503), (1244, 0.015189873417721518), (1404, 0.0069500877231848035), (1455, 0.01386667274627365), (1787, 0.015189873417721518), (2271, 0.01125), (2719, 0.01986228917507415), (3118, 0.016182761629925498), (3594, 0.019359423955784567), (4032, 0.015189873417721518), (4308, 0.013865546218487396), (4365, 0.013865546218487396), (4509, 0.016405730685101278), (5437, 0.015189873417721518), (5982, 0.016152178486691246), (6399, 0.015189873417721518), (6402, 0.016622300115589315)]}
Node 432 - Phase 2 Score: {930: [(835, 0.025678096597011275), (1208, 0.029620005075748152), (6424, 0.030315823738391583), (6517, 0.030315823738391583