In [2]:
import ast
import pandas as pd
import numpy as np

# Load the raw tables. nodes.csv is read without a header row: column 0 becomes the
# index and the remaining column holds a dict literal stored as text. edges.csv
# supplies the From/To columns.
nodes_df = pd.read_csv('nodes.csv', header=None, names=['node_attribute'], index_col=0)
edges_df = pd.read_csv('edges.csv')

# Parse every node's attribute text into a Python dict, keeping (node id, dict) pairs.
nodes_list = []
for node_key, node_row in nodes_df.iterrows():
    nodes_list.append((node_key, ast.literal_eval(node_row.node_attribute)))

# One (From, To) tuple per edge row.
edges_list = []
for _, edge_row in edges_df.iterrows():
    edges_list.append((edge_row.From, edge_row.To))

number_of_days = 1448

# Expand each sparse {day: count} dict into a dense float32 vector with one slot per day.
nodes_list_vec = []
for node_key, day_counts in nodes_list:
    dense = np.zeros(number_of_days, dtype=np.float32)
    for day in day_counts:
        dense[day] = day_counts[day]
    nodes_list_vec.append((node_key, dense))

import networkx as nx


# Directed graph in which every node stores its dense per-day vector under the key "x".
Graph = nx.DiGraph()
Graph.add_nodes_from(
    (node_id, {"x": node_attr_vec}) for node_id, node_attr_vec in nodes_list_vec
)
Graph.add_edges_from(edges_list)

node_total = nx.number_of_nodes(Graph)
edge_total = nx.number_of_edges(Graph)
print(f"Number of nodes: {node_total}")
print(f"Number of edges: {edge_total}")



Number of nodes: 6600
Number of edges: 50897


In [3]:
import torch
from torch_geometric.utils import from_networkx

# Choose the compute device: GPU when CUDA is available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the PyG Data object from the NetworkX graph.
data = from_networkx(Graph)

# Stack the per-node "x" vectors (in Graph.nodes order) into one float32 matrix,
# overwrite data.x with it, then move the whole Data object to the device.
node_rows = []
for _, attr in Graph.nodes(data=True):
    node_rows.append(torch.tensor(attr['x'], dtype=torch.float32))
X = torch.stack(node_rows).to(device)
data.x = X
data = data.to(device)

print(data)
print(f"Data is on device: {data.x.device}")


  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  from .autonotebook import tqdm as notebook_tqdm
  data_dict[key] = torch.as_tensor(value)


Data(x=[6600, 1448], edge_index=[2, 50897])
Data is on device: cuda:0


In [56]:
def sigmoid_norm(x, alpha=0.5):
    """Squash a scalar score into (0, 1) with a logistic curve.

    Args:
        x: Scalar score to normalise.
        alpha: Slope of the curve; larger values make the transition steeper.

    Returns:
        ``1 / (1 + exp(-alpha * x))`` converted to a Python ``float``.
    """
    return float(1 / (1 + np.exp(-alpha * x)))


phase1_score = []  # Holds every node's normalized phase 1 score [0,1]

def phase1(node_list_vec, start_day=0, current_day=1448):
    """Score how unusual each node's value on ``current_day`` is versus its own history.

    For every ``(node_id, vector)`` pair the z-score of ``vector[current_day]``
    against the mean and population standard deviation of
    ``vector[start_day:current_day]`` is computed and passed through
    ``sigmoid_norm``. A constant (zero-variance) or empty history window yields
    a z-score of 0, i.e. a normalised score of 0.5.

    The module-level ``phase1_score`` list is replaced in place with the new
    scores, in the same order as ``node_list_vec``. Calling the function again
    therefore overwrites the previous results instead of appending to them, so
    position ``i`` always refers to the ``i``-th entry of ``node_list_vec``.

    Args:
        node_list_vec: Iterable of ``(node_id, 1-D numeric array)`` pairs.
        start_day: First index of the history window (inclusive).
        current_day: Index of the day being scored; also the exclusive end of
            the history window. It must be a valid index into every vector, so
            the default only works for vectors longer than 1448 entries.

    Raises:
        IndexError: If ``current_day`` lies beyond the end of a node's vector.
            ``phase1_score`` is left untouched in that case.
    """
    scores = []
    for node_id, node_attr_vec in node_list_vec:
        if current_day >= len(node_attr_vec):
            raise IndexError(
                f"current_day={current_day} is out of range for node {node_id} "
                f"(vector holds {len(node_attr_vec)} days)"
            )

        window = node_attr_vec[start_day:current_day]
        if len(window) == 0:
            # No history to compare against: treat the day as perfectly ordinary.
            today_score = 0
        else:
            mean = np.mean(window)
            std = np.std(window)
            today_score = (node_attr_vec[current_day] - mean) / std if std > 0 else 0

        # Apply sigmoid normalization directly to get [0,1]
        scores.append(sigmoid_norm(today_score, alpha=0.5))

    # Slice-assign so the shared list object is updated in place; the swap only
    # happens once every node has been scored successfully.
    phase1_score[:] = scores


# Score day 920 for every node against its history window starting at day 0.
phase1(nodes_list_vec, start_day=0, current_day=920)

# Report each node's score; entries that evaluate as falsy (exactly zero) are skipped.
for node_idx in range(len(nodes_list)):
    node_score = phase1_score[node_idx]
    if node_score:
        print(f"Node {node_idx} - Phase 1 Score: {node_score}")

Node 0 - Phase 1 Score: 0.4958767294883728
Node 1 - Phase 1 Score: 0.5
Node 2 - Phase 1 Score: 0.49446576833724976
Node 3 - Phase 1 Score: 0.5
Node 4 - Phase 1 Score: 0.9956005811691284
Node 5 - Phase 1 Score: 0.4958767294883728
Node 6 - Phase 1 Score: 0.5
Node 7 - Phase 1 Score: 0.5
Node 8 - Phase 1 Score: 0.5
Node 9 - Phase 1 Score: 0.49446576833724976
Node 10 - Phase 1 Score: 0.49285078048706055
Node 11 - Phase 1 Score: 0.4958767294883728
Node 12 - Phase 1 Score: 0.4958767294883728
Node 13 - Phase 1 Score: 0.5
Node 14 - Phase 1 Score: 0.4882948100566864
Node 15 - Phase 1 Score: 0.5
Node 16 - Phase 1 Score: 0.4958767294883728
Node 17 - Phase 1 Score: 0.5
Node 18 - Phase 1 Score: 0.4958767294883728
Node 19 - Phase 1 Score: 0.4958767294883728
Node 20 - Phase 1 Score: 0.5
Node 21 - Phase 1 Score: 0.4941657483577728
Node 22 - Phase 1 Score: 0.4940544068813324
Node 23 - Phase 1 Score: 0.5
Node 24 - Phase 1 Score: 0.4958767294883728
Node 25 - Phase 1 Score: 0.49174049496650696
Node 26 - Ph

In [5]:
from torch_geometric.utils import negative_sampling

# Holds every node's phase 2 score; the list index corresponds to the node id.
# Sized from the loaded node list (not a hard-coded count) so it cannot drift from the data.
phase2_score = [0] * len(nodes_list)


import torch 
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GraphSAGE(torch.nn.Module):
    """Stack of SAGEConv layers with ReLU and dropout between consecutive layers.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Width of the final embedding.
        num_layers: Requested depth. The stack always contains an input layer
            and an output layer, plus ``num_layers - 2`` hidden-to-hidden
            layers when that number is positive.
        dropout: Dropout probability applied after each non-final layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels , num_layers=3 , dropout = 0.3):
        super().__init__()

        # Consecutive layer widths: input -> hidden (repeated) -> output.
        extra_hidden = max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * (extra_hidden + 1) + [out_channels]

        layers = [SAGEConv(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        self.convs = torch.nn.ModuleList(layers)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Propagate ``x`` through every layer; the last layer's output is returned raw."""
        final_idx = len(self.convs) - 1
        for idx, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if idx != final_idx:
                # Non-linearity and dropout are applied to every layer except the last.
                x = F.dropout(F.relu(x), p=self.dropout, training=self.training)
        return x

import random

# Seed the Python and PyTorch RNGs before the model is created so that weight
# initialisation starts from the same state on every run.
# NOTE(review): GPU kernels and third-party samplers may add their own
# non-determinism - confirm if bit-exact reruns are required.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Input width follows the node feature matrix; the other widths are fixed here.
in_channels = data.num_node_features
hidden_channels = 128
out_channels = 64

model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr =0.01, weight_decay=5e-4)


def train():
    """Run one full-batch optimisation step on the module-level model.

    Every edge in ``data.edge_index`` is used as a positive example and an equal
    number of negative edges is drawn with ``negative_sampling``. An edge is
    scored by the dot product of its two endpoint embeddings, and the loss is
    the negated sum of the mean log-sigmoid of the positive scores and the mean
    log-sigmoid of the negated negative scores.

    Returns:
        tuple: ``(loss value as a Python float, embeddings computed in this step)``.
    """
    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)

    def edge_scores(edge_index):
        # Dot product of the two endpoint embeddings for every column of edge_index.
        src, dst = edge_index[0], edge_index[1]
        return (z[src] * z[dst]).sum(dim=1)

    pos_edge_index = data.edge_index
    neg_edge_index = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge_index.size(1),
    ).to(device)

    pos_loss = F.logsigmoid(edge_scores(pos_edge_index)).mean()
    neg_loss = F.logsigmoid(-edge_scores(neg_edge_index)).mean()

    loss = -pos_loss - neg_loss
    loss.backward()
    optimizer.step()

    return loss.item(), z

print(f"Training on device: {next(model.parameters()).device}")

# range(1, 500) yields epochs 1..499; the loss is reported on every 10th epoch.
for epoch in range(1, 500):
    loss, embeddings = train()
    if epoch % 10:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Recompute the embeddings in eval mode, without tracking gradients.
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)

# Persist the final embeddings to disk.
torch.save(z, 'final_embeddings.pt')
print("\nEmbeddings saved to 'final_embeddings.pt'")

Training on device: cuda:0
Epoch: 010, Loss: 3.1462
Epoch: 020, Loss: 1.4543
Epoch: 030, Loss: 1.4204
Epoch: 040, Loss: 1.4228
Epoch: 050, Loss: 1.3971
Epoch: 060, Loss: 1.3815
Epoch: 070, Loss: 1.3967
Epoch: 080, Loss: 1.3717
Epoch: 090, Loss: 1.3671
Epoch: 100, Loss: 1.3549
Epoch: 110, Loss: 1.3383
Epoch: 120, Loss: 1.3401
Epoch: 130, Loss: 1.3155
Epoch: 140, Loss: 1.3144
Epoch: 150, Loss: 1.3151
Epoch: 160, Loss: 1.2968
Epoch: 170, Loss: 1.2961
Epoch: 180, Loss: 1.3011
Epoch: 190, Loss: 1.2995
Epoch: 200, Loss: 1.3004
Epoch: 210, Loss: 1.2959
Epoch: 220, Loss: 1.2804
Epoch: 230, Loss: 1.2628
Epoch: 240, Loss: 1.2707
Epoch: 250, Loss: 1.2546
Epoch: 260, Loss: 1.2696
Epoch: 270, Loss: 1.2492
Epoch: 280, Loss: 1.2474
Epoch: 290, Loss: 1.2393
Epoch: 300, Loss: 1.2255
Epoch: 310, Loss: 1.2138
Epoch: 320, Loss: 1.2288
Epoch: 330, Loss: 1.2134
Epoch: 340, Loss: 1.1992
Epoch: 350, Loss: 1.1961
Epoch: 360, Loss: 1.1988
Epoch: 370, Loss: 1.1854
Epoch: 380, Loss: 1.1909
Epoch: 390, Loss: 1.185

In [None]:

import pandas as pd 
import ast
from collections import defaultdict

def historical_pattern(current_day =  220, weight_distribution=0.3):
    """Score each (sender, recipient) pair seen on ``current_day`` and store it in ``phase2_score``.

    Reads ``node_day_recipients.csv`` (columns ``node_id`` and
    ``day_recipients_str``). ``day_recipients_str`` is parsed with
    ``ast.literal_eval`` and then indexed by day; each per-day entry is iterated
    as a collection of recipient ids (presumably a list of lists - confirm
    against the file).

    For every node that has at least one recipient on ``current_day``:

    1. Recipient occurrences over days ``0..current_day`` (inclusive) are tallied.
    2. For each of today's recipients, ``historical_score`` is the number of
       occurrences of that recipient divided by the number of occurrences of
       all other recipients (``1.0`` when there are no others).
    3. A ``historical_score`` of exactly ``1.0`` is kept as the final score.
       Otherwise it is blended with ``1 - cos`` - where ``cos`` is the cosine
       similarity of the two nodes' embeddings in the module-level ``z``,
       rescaled to [0, 1] - and the blend is passed through ``sigmoid_norm``.

    The resulting list of ``(recipient, final_score)`` tuples is written to
    ``phase2_score[node]``. Entries of nodes without recipients on
    ``current_day`` are left untouched.

    Relies on the module-level names ``z``, ``F``, ``sigmoid_norm`` and
    ``phase2_score``.

    Args:
        current_day: Index of the day to score.
        weight_distribution: Weight of the historical score in the blend; the
            embedding term receives ``1 - weight_distribution``.
    """
    from collections import Counter  # local import: only this function needs it

    csv_file = 'node_day_recipients.csv'

    df = pd.read_csv(csv_file)
    df["day_recipients_str"] = df["day_recipients_str"].apply(ast.literal_eval)

    for index, row in df.iterrows():
        node = row['node_id']
        day_recipients = row['day_recipients_str']

        # Skip nodes that have no entry, or no recipients, for the requested day.
        if current_day >= len(day_recipients) or len(day_recipients[current_day]) == 0:
            continue

        recipients_today = day_recipients[current_day]

        # Tally every recipient occurrence up to and including current_day once,
        # instead of re-scanning the whole history for each of today's recipients.
        seen = Counter()
        for past_day in range(current_day + 1):
            for past_recipient in day_recipients[past_day]:
                seen[past_recipient] += 1
        total_seen = sum(seen.values())

        day_scores = []
        for recipient in recipients_today:
            matches = seen[recipient]
            mismatches = total_seen - matches

            # Get historical pattern score
            if mismatches == 0:
                historical_score = 1.0
            else:
                historical_score = matches / mismatches

            if historical_score == 1.0:
                # If historical score is 1, keep it as is
                final_score = 1.0
            else:
                try:
                    # Get embeddings for current node and recipient
                    node_embedding = z[node]
                    recipient_embedding = z[recipient]

                    # Calculate cosine similarity using PyTorch
                    cos = F.cosine_similarity(
                        node_embedding.unsqueeze(0),
                        recipient_embedding.unsqueeze(0)
                    ).item()

                    cos = (cos+1)/2  #Normalize [-1,1] to [0,1]

                    # Combine historical pattern and cosine similarity using weight
                    raw_score = (weight_distribution * historical_score +
                                 (1 - weight_distribution) * (1-cos))

                except (IndexError, ValueError) as e:
                    # Fallback if there's an issue with embeddings
                    print(f"Warning: Error calculating cosine similarity for nodes {node} and {recipient}: {e}")
                    raw_score = historical_score

                # Only blended scores are squashed; doing this unconditionally would
                # overwrite the 1.0 branch and read a raw_score that was never set.
                final_score = sigmoid_norm(raw_score, alpha=0.5)

            day_scores.append((recipient, final_score))

        # Store in phase2_score at the node's index position
        phase2_score[node] = day_scores

# Score a single day, then list every node that ended up with a non-empty phase 2 result.
historical_pattern(current_day=220, weight_distribution=0.3)
for node_idx in range(len(nodes_list)):
    node_result = phase2_score[node_idx]
    if node_result:
        print(f"Node {node_idx} - Phase 2 Score: {node_result}")


Node 1540 - Phase 2 Score: [(1327, 0.5068447806513057), (2766, 0.507053739865132), (5809, 0.5044845401689202)]
Node 2208 - Phase 2 Score: [(1320, 0.5034501671798508)]
Node 3348 - Phase 2 Score: [(5342, 0.5159791782683044)]
Node 3627 - Phase 2 Score: [(735, 0.5060736914718051), (810, 0.5079265555815667), (1412, 0.5344098661435672), (2884, 0.5726064145183206), (3608, 0.5129788354817125), (3701, 0.5071019463902707), (3860, 0.5126587939594184), (3946, 0.5095044783231462), (4360, 0.5075708355934385), (4392, 0.5725665074494821), (4437, 0.5085541034964647), (5224, 0.5107824535677131), (5228, 0.5108660955454895), (5422, 0.5727657831967894), (5972, 0.5237610721060774), (6530, 0.5106965798140493)]
Node 5213 - Phase 2 Score: [(3223, 0.5013256811327415), (6337, 0.5039957187929812)]
Node 5419 - Phase 2 Score: [(1327, 0.5013358326397749)]
Node 6055 - Phase 2 Score: [(729, 0.5014904758091296), (4912, 0.501559717269562)]
Node 6337 - Phase 2 Score: [(2291, 0.5058683140724467), (4043, 0.5198911495788887

[1540, 2208, 3348, 3627, 5213, 5419, 6055, 6337]

In [None]:
import numpy as np
import community.community_louvain as community_louvain
from collections import defaultdict

# Per-node phase 3 scores, indexed by node id; 0 marks a node that has not been scored.
phase3_score = [0] * len(nodes_list)

def phase3_community_normalization(Graph):
    """Score each phase 2 sender by how much it stays inside its own Louvain community.

    The graph is converted to an undirected copy and partitioned with
    ``community_louvain.best_partition``. For every index of the module-level
    ``phase2_score`` list that holds a non-empty result, the fraction of its
    recipients that share the sender's community is computed. Each sender's
    fraction is then compared with the mean fraction over all such senders and
    mapped to ``1 / (1 + exp(0.5 * (fraction - mean)))``, so a fraction above the
    mean gives a score below 0.5 and vice versa.

    Results are written to ``phase3_score[sender]``; entries of other nodes are
    left unchanged. Senders missing from the partition are skipped. When no
    sender qualifies, the function returns without touching ``phase3_score``.

    Args:
        Graph: NetworkX graph whose node ids match the indices of ``phase2_score``.
    """
    # Step 1: partition an undirected copy of the graph into communities.
    undirected_graph = Graph.to_undirected()
    partition = community_louvain.best_partition(undirected_graph)

    # Step 2: collect the same-community fraction of every sender with phase 2 results.
    all_fraction_same = []
    senders = []

    for sender, rec_scores in enumerate(phase2_score):
        if not rec_scores:
            continue

        c_send = partition.get(sender, -1)
        recipients = [r for r, _ in rec_scores]
        if c_send == -1:
            continue

        same_comm_count = sum(1 for r in recipients if partition.get(r, -1) == c_send)
        # rec_scores is non-empty here, so the division is always defined.
        all_fraction_same.append(same_comm_count / len(recipients))
        senders.append(sender)

    # Nothing to normalise: bail out instead of taking the mean of an empty list.
    if not all_fraction_same:
        return

    # Step 3: compute the population mean across all senders.
    mean_fraction = np.mean(all_fraction_same)

    # Step 4: apply the inverted sigmoid relative to the population mean.
    for sender, fraction_same in zip(senders, all_fraction_same):
        phase3_score[sender] = float(1 / (1 + np.exp(0.5 * (fraction_same - mean_fraction))))

    
# Populate phase3_score for every sender that has phase 2 results.
phase3_community_normalization(Graph)





Node 1540 - Phase 3 Score: 0.47852883753932274
Node 2208 - Phase 3 Score: 0.47852883753932274
Node 3348 - Phase 3 Score: 0.47852883753932274
Node 3627 - Phase 3 Score: 0.5869964297463166
Node 5213 - Phase 3 Score: 0.47852883753932274
Node 5419 - Phase 3 Score: 0.47852883753932274
Node 6055 - Phase 3 Score: 0.47852883753932274
Node 6337 - Phase 3 Score: 0.5409238721893666


In [117]:
# Per-node list of (recipient, combined score) tuples, indexed by node id.
final_score = [[] for _ in range(len(nodes_list))]

def aggregate_score(w2=0.6, w1=0.2, w3=0.2):
    """Blend the three phase scores into one score per (node, recipient) pair.

    For every node with a non-empty ``phase2_score`` entry, each recipient's
    phase 2 score is combined with the node's phase 1 and phase 3 scores as
    ``w2 * phase2 + w1 * phase1 + w3 * phase3``. A missing phase 1 score counts
    as 0.0; a missing or non-positive phase 3 score counts as 1.

    ``final_score[node_id]`` is replaced on every call, so running the function
    again overwrites earlier results instead of appending duplicates. Nodes with
    a non-empty result are printed.

    Args:
        w2: Weight of the phase 2 (per-recipient) score.
        w1: Weight of the phase 1 score.
        w3: Weight of the phase 3 score.
    """
    for node_id in range(len(nodes_list)):
        combined = []
        entry = phase2_score[node_id]
        if entry:
            p1_score = phase1_score[node_id] if node_id < len(phase1_score) else 0.0
            p3_score = phase3_score[node_id] if node_id < len(phase3_score) and phase3_score[node_id] > 0 else 1

            for recipient, score in entry:
                final_score_node = w2 * score + w1 * p1_score + w3 * p3_score
                combined.append((recipient, final_score_node))

        # Assign rather than extend so repeated calls stay idempotent.
        final_score[node_id] = combined

        if combined:
            print(node_id, combined)

    # call
# Combine the three phase scores using the default weights; non-empty results are printed.
aggregate_score()

1540 [(1327, 0.5223280639649016), (2766, 0.5224534394931974), (5809, 0.5209119196754703)]
2208 [(1320, 0.5037263544757787)]
3348 [(5342, 0.4996782904333552)]
3627 [(735, 0.518230835131434), (810, 0.5193425535972909), (1412, 0.5352325399344912), (2884, 0.5581504689593433), (3608, 0.5223739215373784), (3701, 0.5188477880825133), (3860, 0.5221818966240019), (3946, 0.5202893072422385), (4360, 0.5191291216044139), (4392, 0.5581265247180401), (4437, 0.5197190823462297), (5224, 0.5210560923889788), (5228, 0.5211062775756445), (5422, 0.5582460901664245), (5972, 0.5288432635119973), (6530, 0.5210045681367804)]
5213 [(3223, 0.4856507943363132), (6337, 0.48725281693245703)]
5419 [(1327, 0.49015421662137726)]
6055 [(729, 0.47879274813418093), (4912, 0.47883429301044034)]
6337 [(2291, 0.5053869016352324), (4043, 0.5138006029390976)]
