In [10]:
import ast
import pandas as pd
import numpy as np

# Load the raw tables. nodes.csv has no header row: its first column becomes the
# index (used as the node id below) and 'node_attribute' holds a dict literal.
nodes_df = pd.read_csv('nodes.csv' , header = None , names=['node_attribute'], index_col=0)
edges_df = pd.read_csv('edges.csv')

# Iterate over the columns directly instead of DataFrame.iterrows(): iterrows
# builds a Series per row (slow) and may upcast the row's values to a common dtype.
nodes_list = [
    (node_id, ast.literal_eval(raw_attribute))
    for node_id, raw_attribute in zip(nodes_df.index, nodes_df['node_attribute'])
]
edges_list = list(zip(edges_df['From'], edges_df['To']))

# Expand each node's sparse {day: count} dict into a dense vector with one slot per day.
number_of_days = 1448
nodes_list_vec = []
for index, dict_node in nodes_list:
    vec = np.zeros(number_of_days, dtype=np.float32)
    for day, count in dict_node.items():
        vec[day] = count
    nodes_list_vec.append((index, vec))

import networkx as nx


# Directed graph: every listed node stores its per-day vector under the 'x' attribute.
Graph = nx.DiGraph()
Graph.add_nodes_from(
    (node_id, {'x': node_attr_vec}) for node_id, node_attr_vec in nodes_list_vec
)
Graph.add_edges_from(edges_list)

print(f"Number of nodes: {Graph.number_of_nodes()}")
print(f"Number of edges: {Graph.number_of_edges()}")


Number of nodes: 6600
Number of edges: 50897


In [11]:
import torch
from torch_geometric.utils import from_networkx

# Build the PyG Data object from the NetworkX graph
data = from_networkx(Graph)

# Pick the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Re-assemble the node feature matrix explicitly as float32 (one row per node,
# in Graph's node iteration order) and overwrite data.x with it
feature_rows = [
    torch.tensor(node_attrs['x'], dtype=torch.float32)
    for _node, node_attrs in Graph.nodes(data=True)
]
X = torch.stack(feature_rows).to(device)
data.x = X
data = data.to(device)

print(data)
print(f"Data is on device: {data.x.device}")


  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  from .autonotebook import tqdm as notebook_tqdm
  data_dict[key] = torch.as_tensor(value)


Data(x=[6600, 1448], edge_index=[2, 50897])
Data is on device: cuda:0


In [12]:
phase1_score = []  # Holds every node's phase 1 score; position i belongs to the i-th entry of the scored node list


def phase1(node_list_vec, start_day = 0 , current_day=1448):
    """Score every node's activity on ``current_day`` as a z-score against its history.

    For each ``(node_id, vector)`` pair the mean and standard deviation of
    ``vector[start_day:current_day]`` (the current day itself is excluded) are
    computed, and the score is ``(vector[current_day] - mean) / std``. A node
    whose history window is empty or has zero (or NaN) deviation scores 0.0.

    The module-level ``phase1_score`` list is replaced in place with the new
    scores, so calling the function again never leaves stale or duplicated
    entries, and nothing is written if scoring fails part-way through.

    Args:
        node_list_vec: iterable of ``(node_id, vector)`` pairs.
        start_day: first day (inclusive) of the history window.
        current_day: index of the day being scored; must be a valid index into
            every vector. Note that the default (1448) is out of range for
            vectors of length 1448, so such callers must pass it explicitly.

    Raises:
        IndexError: if ``current_day`` is past the end of a node's vector.
    """
    scores = []
    for _node_id, node_attr_vec in node_list_vec:
        if current_day >= len(node_attr_vec):
            raise IndexError(
                f"current_day={current_day} is out of range for a vector of length {len(node_attr_vec)}"
            )

        history = node_attr_vec[start_day:current_day]
        # Skip numpy's mean/std on an empty window: they would only emit warnings and yield NaN.
        std = np.std(history) if len(history) > 0 else 0.0
        if std > 0:
            today_score = float((node_attr_vec[current_day] - np.mean(history)) / std)
        else:
            today_score = 0.0
        scores.append(today_score)

    # Publish only after every node was scored; slice assignment keeps the same list object.
    phase1_score[:] = scores

# Score the last recorded day against all earlier days; the day index is derived
# from number_of_days (1448 -> 1447) instead of repeating it as a magic number.
phase1(nodes_list_vec, start_day=0, current_day=number_of_days - 1)

In [57]:
from torch_geometric.utils import negative_sampling

# Holds every node's phase 2 score, index number corresponds to node id.
# One independent empty dict per node; entries are keyed by day when filled in.
phase2_score = [{} for _ in nodes_list_vec]

import torch 
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Prefer CUDA when PyTorch can see a GPU; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class GraphSAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers that maps node features to node embeddings.

    Args:
        in_channels: size of each input node feature vector.
        hidden_channels: width of every intermediate layer.
        out_channels: size of the returned embedding.
        num_layers: total number of ``SAGEConv`` layers; must be >= 1. With a
            single layer the model maps ``in_channels`` straight to
            ``out_channels``.
        dropout: dropout probability applied after every layer except the last.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels , num_layers=3 , dropout = 0.3):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")

        # Layer widths in -> hidden -> ... -> hidden -> out give exactly
        # num_layers transitions (previously num_layers < 2 still built two layers).
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [SAGEConv(width_in, width_out) for width_in, width_out in zip(widths[:-1], widths[1:])]
        )
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return the output of the last layer for node features ``x`` and ``edge_index``."""
        # ReLU + dropout follow every layer but the last, which returns raw embeddings.
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.convs[-1](x, edge_index)

# Seed Python's `random`, NumPy and PyTorch before any weights are created so that
# initialisation, dropout and negative sampling repeat across "Restart & Run All".
# (GPU kernels may still introduce small run-to-run differences.)
from torch_geometric import seed_everything

SEED = 42
seed_everything(SEED)

# Model dimensions: the input width follows the node feature matrix.
in_channels = data.num_node_features
hidden_channels = 128 
out_channels = 64 

model = GraphSAGE(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr =0.01, weight_decay=5e-4)


def train():
    """Run one full-batch optimisation step of the edge-reconstruction objective.

    Uses the module-level ``model``, ``optimizer`` and ``data``. Every edge in
    ``data.edge_index`` is treated as a positive pair and an equal number of
    negative pairs is drawn with ``negative_sampling``. The loss is
    ``-mean(logsigmoid(z_u . z_v)) - mean(logsigmoid(-(z_a . z_b)))`` over the
    positive pairs ``(u, v)`` and negative pairs ``(a, b)``.

    Returns:
        tuple: ``(loss, z)`` where ``loss`` is a Python float and ``z`` holds
        the node embeddings computed in this step (before the parameter
        update), detached so callers that keep them do not hold on to
        autograd state.
    """
    model.train()
    optimizer.zero_grad()
    z = model(data.x, data.edge_index)

    pos_edge_index = data.edge_index
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edge_index.size(1),
    ).to(z.device)  # follow the embeddings' device rather than a global

    def mean_log_sigmoid(edge_index, sign):
        """Mean log-sigmoid of the signed dot product between the endpoints of each edge."""
        similarity = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=1)
        return F.logsigmoid(sign * similarity).mean()

    pos_loss = mean_log_sigmoid(pos_edge_index, 1)
    neg_loss = mean_log_sigmoid(neg_edge_index, -1)

    loss = -pos_loss - neg_loss
    loss.backward()
    optimizer.step()

    return loss.item(), z.detach()

print(f"Training on device: {next(model.parameters()).device}")

# Epochs are numbered 1..499; the loss is reported on every tenth epoch.
for epoch in range(1, 500):
    loss, embeddings = train()
    if epoch % 10 != 0:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Final embeddings: one inference-mode forward pass (dropout off, no gradients).
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)

# Persist the embeddings to disk
torch.save(z, 'final_embeddings.pt')
print("\nEmbeddings saved to 'final_embeddings.pt'")

Training on device: cuda:0
Epoch: 010, Loss: 4.8065
Epoch: 020, Loss: 1.6864
Epoch: 030, Loss: 1.4552
Epoch: 040, Loss: 1.4511
Epoch: 050, Loss: 1.4159
Epoch: 060, Loss: 1.4161
Epoch: 070, Loss: 1.4002
Epoch: 080, Loss: 1.3952
Epoch: 090, Loss: 1.4006
Epoch: 100, Loss: 1.3963
Epoch: 110, Loss: 1.3859
Epoch: 120, Loss: 1.3834
Epoch: 130, Loss: 1.3886
Epoch: 140, Loss: 1.3778
Epoch: 150, Loss: 1.3679
Epoch: 160, Loss: 1.3573
Epoch: 170, Loss: 1.3481
Epoch: 180, Loss: 1.3412
Epoch: 190, Loss: 1.3328
Epoch: 200, Loss: 1.3273
Epoch: 210, Loss: 1.3230
Epoch: 220, Loss: 1.3013
Epoch: 230, Loss: 1.3169
Epoch: 240, Loss: 1.3148
Epoch: 250, Loss: 1.3104
Epoch: 260, Loss: 1.3089
Epoch: 270, Loss: 1.3107
Epoch: 280, Loss: 1.3004
Epoch: 290, Loss: 1.2905
Epoch: 300, Loss: 1.2848
Epoch: 310, Loss: 1.2975
Epoch: 320, Loss: 1.2917
Epoch: 330, Loss: 1.2905
Epoch: 340, Loss: 1.2856
Epoch: 350, Loss: 1.2879
Epoch: 360, Loss: 1.2838
Epoch: 370, Loss: 1.2817
Epoch: 380, Loss: 1.2826
Epoch: 390, Loss: 1.278

In [59]:
import pandas as pd 
import ast
from collections import defaultdict

def _count_recipient_history(day_recipients, current_day):
    """Return (occurrences per recipient, total occurrences) over days 0..current_day inclusive."""
    occurrences = defaultdict(int)
    for past_day in range(current_day + 1):
        for past_recipient in day_recipients[past_day]:
            occurrences[past_recipient] += 1
    return occurrences, sum(occurrences.values())


def _clamp_unit(value):
    """Clip ``value`` into the closed interval [0, 1]."""
    return max(0.0, min(1.0, value))


def historical_pattern(current_day =  920, weight_distribution=0.3,
                       csv_file='node_day_recipients.csv', embeddings=None, scores=None):
    """Score every (node, recipient) pair active on ``current_day``.

    For each row of ``csv_file`` (columns ``node_id`` and ``day_recipients_str``,
    the latter a literal indexable by day and yielding that day's recipients)
    that has at least one recipient on ``current_day``:

    * ``historical_score`` = (occurrences of the recipient on days
      0..current_day) / (occurrences of all *other* recipients in that window),
      or 1.0 when there are no other occurrences.
      NOTE(review): the window includes ``current_day`` itself, so today's
      recipients always count at least once - confirm this is intended.
    * If ``historical_score`` is exactly 1.0 the final score is 1.0.
    * Otherwise the final score is
      ``w * historical_score + (1 - w) * (1 - cos01)`` clipped to [0, 1], where
      ``cos01`` is the cosine similarity of the two node embeddings rescaled
      from [-1, 1] to [0, 1] and ``w`` is ``weight_distribution``.
    * If the embedding lookup fails, the historical score alone is used,
      clipped to [0, 1] like every other score.

    Results are written to ``scores[node][current_day]`` as a list of
    ``(recipient, final_score)`` tuples in the order of that day's recipients.

    Args:
        current_day: index of the day to score.
        weight_distribution: weight ``w`` of the historical term.
        csv_file: path of the recipients CSV.
        embeddings: tensor indexed by node id; defaults to the module-level ``z``.
        scores: per-node list of dicts to fill; defaults to the module-level
            ``phase2_score``.
    """
    if embeddings is None:
        embeddings = z
    if scores is None:
        scores = phase2_score

    df = pd.read_csv(csv_file)
    parsed_recipients = df["day_recipients_str"].apply(ast.literal_eval)

    for node, day_recipients in zip(df['node_id'], parsed_recipients):
        # Only nodes that actually have recipients on the requested day are scored.
        if not (current_day < len(day_recipients) and len(day_recipients[current_day]) > 0):
            continue

        recipients_today = day_recipients[current_day]
        # One counting pass replaces comparing every past recipient with every
        # current recipient; the resulting per-recipient totals are identical.
        occurrences, total_occurrences = _count_recipient_history(day_recipients, current_day)

        day_scores = []
        for recipient in recipients_today:
            seen_recipient = occurrences.get(recipient, 0)
            seen_others = total_occurrences - seen_recipient
            historical_score = 1.0 if seen_others == 0 else seen_recipient / seen_others

            if historical_score == 1.0:
                # A historical score of exactly 1 is kept as is.
                final_score = 1.0
            else:
                try:
                    cos = F.cosine_similarity(
                        embeddings[node].unsqueeze(0),
                        embeddings[recipient].unsqueeze(0),
                    ).item()
                    cos = (cos + 1) / 2  # Normalize [-1,1] to [0,1]

                    final_score = _clamp_unit(
                        weight_distribution * historical_score
                        + (1 - weight_distribution) * (1 - cos)
                    )
                except (IndexError, ValueError) as e:
                    # Fallback if there's an issue with embeddings; clamp so the
                    # fallback obeys the same [0, 1] range as the blended score.
                    print(f"Warning: Error calculating cosine similarity for nodes {node} and {recipient}: {e}")
                    final_score = _clamp_unit(historical_score)

            day_scores.append((recipient, final_score))

        # Store at the node's index position, keyed by day.
        scores[node][current_day] = day_scores

# Score one day for every node (results land in phase2_score)
historical_pattern(current_day=920, weight_distribution=0.3)


# Show the container size, then every node that received at least one score
print(f"phase2_score length: {len(phase2_score)}")
for node_id, node_scores in enumerate(phase2_score):
    if not node_scores:  # skip nodes without data
        continue
    print(f"Node {node_id}: {node_scores}")



phase2_score length: 6600
Node 171: {920: [(602, 0.00287441056799318), (1023, 0.004330333467559296), (1041, 0.003948359357159069), (1049, 0.002059462925662165), (1086, 0.00410860729254899), (1404, 0.0036040050325572585), (1588, 0.0038240866530346185), (1859, 0.003064003991683531), (2293, 0.004425282832468902), (2409, 0.0011841631065177226), (2719, 0.013578047043775795), (2746, 0.004908009670065634), (3042, 0.0010880744586753152), (3296, 0.0013576901088523173), (4033, 0.003733801471926), (4041, 0.0036789986946129723), (4172, 0.013283946687095101), (4442, 0.004549851590506965), (4825, 0.0013164761273757272), (4840, 0.007878125331693072), (4943, 0.004903664761893684), (5095, 0.006191027014037412), (5284, 0.006637429505101143), (5859, 0.0038542206824945754), (5870, 0.004811822157255642), (5921, 0.0013399245946303657), (5982, 0.00942204883772201), (6179, 0.0089713334150917), (6185, 0.004648572468495631), (6241, 0.004485889792816018)]}
Node 253: {920: [(171, 0.007956426695128468), (180, 0.00