In [10]:
import ast
import pandas as pd
import numpy as np

# Read the two input tables. nodes.csv has no header row: its first column
# becomes the index and the remaining column is named 'node_attribute'.
nodes_df = pd.read_csv('nodes.csv', header=None, names=['node_attribute'], index_col=0)
edges_df = pd.read_csv('edges.csv')


# Pair every node index with its attribute literal (parsed by ast.literal_eval)
# and collect each edge row as a (From, To) tuple.
nodes_list = [
    (node_index, ast.literal_eval(attribute_text))
    for node_index, attribute_text in nodes_df['node_attribute'].items()
]
edges_list = list(zip(edges_df['From'], edges_df['To']))

number_of_days = 1448  # length of every per-node vector built below


def _to_dense_vector(day_counts):
    """Scatter a {day: count} mapping into a float32 vector of length number_of_days."""
    dense = np.zeros(number_of_days, dtype=np.float32)
    for day in day_counts:
        dense[day] = day_counts[day]
    return dense


# One (node index, dense vector) pair per parsed node, in the order of nodes_list.
nodes_list_vec = [
    (node_index, _to_dense_vector(day_counts))
    for node_index, day_counts in nodes_list
]

import networkx as nx


# Directed graph: every node carries its dense vector under the attribute 'x'.
Graph = nx.DiGraph()
Graph.add_nodes_from(
    (node_id, {'x': node_attr_vec}) for node_id, node_attr_vec in nodes_list_vec
)
Graph.add_edges_from(edges_list)

# Report the graph size.
node_total = Graph.number_of_nodes()
edge_total = Graph.number_of_edges()
print(f"Number of nodes: {node_total}")
print(f"Number of edges: {edge_total}")


Number of nodes: 6600
Number of edges: 50897


In [11]:
import torch
from torch_geometric.utils import from_networkx

# Pick the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert the NetworkX graph into a PyG data object.
data = from_networkx(Graph)

# Stack the per-node 'x' attributes (in Graph.nodes order) into one float32
# matrix, place it on the device, and use it as the node feature matrix.
per_node_features = [
    torch.tensor(node_attrs['x'], dtype=torch.float32)
    for _node, node_attrs in Graph.nodes(data=True)
]
X = torch.stack(per_node_features).to(device)
data.x = X
data = data.to(device)

print(data)
print(f"Data is on device: {data.x.device}")


  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  import torch_geometric.typing
  from .autonotebook import tqdm as notebook_tqdm
  data_dict[key] = torch.as_tensor(value)


Data(x=[6600, 1448], edge_index=[2, 50897])
Data is on device: cuda:0


In [12]:
# Holds every node's phase 1 score; list position i is the score of the i-th
# entry of the node list given to phase1 (expected to correspond to node id).
phase1_score = []


def phase1(node_list_vec, start_day=0, current_day=1448):
    """Fill ``phase1_score`` with one z-score per node for ``current_day``.

    For every ``(node_id, vector)`` pair the score is
    ``(vector[current_day] - mean) / std`` where mean and std are taken over
    ``vector[start_day:current_day]`` (``current_day`` itself is excluded from
    the window). A node whose window is empty or has zero standard deviation
    scores ``0.0``.

    The module-level ``phase1_score`` list is replaced in place, so calling the
    function again (e.g. re-running the cell) never leaves stale or duplicated
    entries behind. Nothing is returned.

    Args:
        node_list_vec: iterable of ``(node_id, vector)`` pairs.
        start_day: first index of the history window.
        current_day: index of the day being scored; must be a valid index of
            every vector.

    Raises:
        IndexError: if ``current_day`` is not a valid index of a vector. In that
            case ``phase1_score`` is left untouched.
    """
    scores = []
    for node_id, node_attr_vec in node_list_vec:
        n_days = len(node_attr_vec)
        if not -n_days <= current_day < n_days:
            raise IndexError(
                f"current_day={current_day} is out of range for node {node_id} "
                f"with {n_days} days"
            )

        history = node_attr_vec[start_day:current_day]
        if len(history) == 0:
            # No history to compare against; avoids mean/std of an empty slice.
            scores.append(0.0)
            continue

        mean = np.mean(history)
        std = np.std(history)
        if std > 0:
            scores.append(float((node_attr_vec[current_day] - mean) / std))
        else:
            scores.append(0.0)

    # Slice assignment keeps the same list object that other cells reference.
    phase1_score[:] = scores

# Score day index 1447 against the history window that starts at day 0.
phase1(node_list_vec=nodes_list_vec, current_day=1447, start_day=0)

In [20]:
from torch_geometric.utils import negative_sampling

# Holds every node's phase 2 score: one (initially empty) dict per entry of
# nodes_list_vec, where the list position corresponds to the node id.
phase2_score = [{} for _ in nodes_list_vec]

import torch 
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class GraphSAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers with ReLU and dropout between them.

    The stack contains exactly ``num_layers`` convolutions:
    ``in_channels -> hidden_channels -> ... -> hidden_channels -> out_channels``.
    With ``num_layers=1`` it is a single ``in_channels -> out_channels`` layer.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of every intermediate representation.
        out_channels: size of the returned node embeddings.
        num_layers: number of convolution layers (must be >= 1).
        dropout: dropout probability applied after every layer except the last.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_channels, hidden_channels, out_channels , num_layers=3 , dropout = 0.3):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")

        # Layer widths from input to output; consecutive pairs define one layer
        # each, so the number of layers always equals num_layers.
        widths = [in_channels] + [hidden_channels] * (num_layers - 1) + [out_channels]
        self.convs = torch.nn.ModuleList(
            [SAGEConv(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )

        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return the output of the last layer for features ``x`` and ``edge_index``."""
        for conv in self.convs[:-1]:
            x = conv(x, edge_index)
            x = F.relu(x)
            # Dropout is only active while the module is in training mode.
            x = F.dropout(x, p=self.dropout, training=self.training)
        # The final layer is applied without activation or dropout.
        return self.convs[-1](x, edge_index)

# Layer sizes: the input width comes from the data, the other two are fixed.
hidden_channels, out_channels = 128, 64
in_channels = data.num_node_features

# Build the network on the selected device and an Adam optimizer over its parameters.
model = GraphSAGE(in_channels, hidden_channels, out_channels)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


def train():
    """Run one optimisation step on the module-level model and return (loss, embeddings).

    The loss is the negative mean log-sigmoid of the embedding dot product over
    the graph's edges, plus the negative mean log-sigmoid of the negated dot
    product over an equal number of edges drawn by ``negative_sampling``.
    """
    model.train()
    optimizer.zero_grad()
    embeddings = model(data.x, data.edge_index)

    positives = data.edge_index
    negatives = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=positives.size(1),
    ).to(device)

    def edge_scores(edges):
        # Dot product between the embeddings of the two endpoints of each edge.
        first, second = edges[0], edges[1]
        return (embeddings[first] * embeddings[second]).sum(dim=1)

    pos_term = F.logsigmoid(edge_scores(positives)).mean()
    neg_term = F.logsigmoid(-edge_scores(negatives)).mean()

    loss = -pos_term - neg_term
    loss.backward()
    optimizer.step()

    return loss.item(), embeddings

print(f"Training on device: {next(model.parameters()).device}")

# Epochs are numbered 1..499; the loss is reported on every tenth epoch.
for epoch in range(1, 500):
    loss, embeddings = train()
    if epoch % 10:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

# Recompute the embeddings in evaluation mode without tracking gradients.
model.eval()
with torch.no_grad():
    z = model(data.x, data.edge_index)

# saving file
torch.save(z, 'final_embeddings.pt')
print("\nEmbeddings saved to 'final_embeddings.pt'")

Training on device: cuda:0
Epoch: 010, Loss: 3.1615
Epoch: 020, Loss: 1.4410
Epoch: 030, Loss: 1.4042
Epoch: 040, Loss: 1.4076
Epoch: 050, Loss: 1.3979
Epoch: 060, Loss: 1.3922
Epoch: 070, Loss: 1.3935
Epoch: 080, Loss: 1.3909
Epoch: 090, Loss: 1.3875
Epoch: 100, Loss: 1.3856
Epoch: 110, Loss: 1.3882
Epoch: 120, Loss: 1.3826
Epoch: 130, Loss: 1.3810
Epoch: 140, Loss: 1.3799
Epoch: 150, Loss: 1.3759
Epoch: 160, Loss: 1.3734
Epoch: 170, Loss: 1.3749
Epoch: 180, Loss: 1.3723
Epoch: 190, Loss: 1.3721
Epoch: 200, Loss: 1.3649
Epoch: 210, Loss: 1.3640
Epoch: 220, Loss: 1.3609
Epoch: 230, Loss: 1.3544
Epoch: 240, Loss: 1.3462
Epoch: 250, Loss: 1.3380
Epoch: 260, Loss: 1.3287
Epoch: 270, Loss: 1.3217
Epoch: 280, Loss: 1.3113
Epoch: 290, Loss: 1.2973
Epoch: 300, Loss: 1.2824
Epoch: 310, Loss: 1.2740
Epoch: 320, Loss: 1.2717
Epoch: 330, Loss: 1.2732
Epoch: 340, Loss: 1.2637
Epoch: 350, Loss: 1.2540
Epoch: 360, Loss: 1.2588
Epoch: 370, Loss: 1.2495
Epoch: 380, Loss: 1.2530
Epoch: 390, Loss: 1.253

In [None]:
import pandas as pd 
import ast
from collections import defaultdict

def historical_pattern(current_day = 200, csv_file = 'node_day_recipients.csv', scores = None):
    """Score each node's recipients on ``current_day`` against its own history.

    The CSV must provide the columns ``node_id`` and ``day_recipients_str``;
    the latter is parsed with ``ast.literal_eval`` and indexed by day, each
    entry being the collection of recipients for that day.

    For a node that has at least one recipient on ``current_day``, every such
    recipient gets ``matches / non_matches`` where both counts run over all
    recipient occurrences of days ``0..current_day`` (the current day
    included). When there are no non-matching occurrences the score is ``1``.

    The resulting ``[(recipient, score), ...]`` list is stored under the key
    ``current_day`` in the node's dict, keeping whatever that dict already
    holds for other days. Nodes without recipients on ``current_day`` are left
    untouched. Nothing is returned.

    Args:
        current_day: index of the day to score.
        csv_file: path of the CSV file to read.
        scores: list of per-node dicts to update, indexed by node id; defaults
            to the module-level ``phase2_score``.
    """
    target = phase2_score if scores is None else scores

    df = pd.read_csv(csv_file)
    parsed_recipients = df["day_recipients_str"].apply(ast.literal_eval)

    for node, day_recipients in zip(df['node_id'], parsed_recipients):
        if current_day >= len(day_recipients) or len(day_recipients[current_day]) == 0:
            continue
        recipients_today = day_recipients[current_day]

        # One pass over the history: how often each recipient occurred and how
        # many occurrences there were overall. Assumes recipients are hashable
        # (e.g. integer ids) -- TODO confirm against the data.
        occurrences = defaultdict(int)
        total_occurrences = 0
        for past_day in range(current_day + 1):
            for past_recipient in day_recipients[past_day]:
                occurrences[past_recipient] += 1
                total_occurrences += 1

        day_scores = []
        for recipient in recipients_today:
            matches = occurrences[recipient]
            non_matches = total_occurrences - matches
            score = 1 if non_matches == 0 else matches / non_matches
            day_scores.append((recipient, score))

        # Add this day's scores without discarding other days already stored.
        target[node][current_day] = day_scores


# Score recipients for day index 200 (the function's default day).
historical_pattern(current_day=200)



{200: [(5508, 1)]}
{200: [(2761, 0.15789473684210525), (5508, 0.8333333333333334)]}
{200: [(679, 0.022727272727272728), (1069, 0.07142857142857142), (1460, 0.07142857142857142), (3042, 0.18421052631578946), (3454, 0.046511627906976744), (4043, 0.046511627906976744), (5355, 0.046511627906976744)]}
{200: [(2275, 0.010416666666666666), (2525, 0.010416666666666666), (3401, 0.010416666666666666), (3798, 0.06593406593406594)]}
{200: [(1327, 0.16666666666666666)]}
{200: [(1789, 0.05660377358490566), (1828, 0.09803921568627451), (3223, 0.07692307692307693), (4072, 0.19148936170212766), (4834, 0.05660377358490566), (5581, 0.09803921568627451)]}
{200: [(3442, 0.0625)]}
{200: [(5053, 0.03180212014134275)]}
{200: [(5508, 0.6666666666666666)]}
{200: [(1327, 1)]}
{200: [(1311, 0.05517241379310345), (5656, 0.05517241379310345)]}
{200: [(4072, 0.05747126436781609), (4912, 0.013774104683195593), (6521, 0.027932960893854747)]}
