- node 2개로 pair로만 이루어진 여러 개의 graph를 dataset으로 하기

In [1]:
import numpy as np
from scipy.special import expit as sigmoid
import igraph as ig
import random
import argparse
import os
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

import data_generation as dg

In [7]:
n = random.choice([500, 1000])
d = 2
s0 = 1
graph_type = 'ER'
linear_sem_type = 'exp'
nonlinear_sem_type = 'mlp'
sem_type = 'linear'
save_dir = '/home/jina/reprod/data/simple'


In [None]:
# for _ in range(10000):
#     dg.generate_data(n=n, d=d, s0=s0, graph_type=graph_type, linear_sem_type=linear_sem_type, nonlinear_sem_type=nonlinear_sem_type, type=sem_type, save_dir=save_dir)

In [None]:
folder_path = save_dir
for files in os.listdir(folder_path):
    if files.startswith('00000'):
        print(files)
        file_path = os.path.join(save_dir, files)
        for csv in os.listdir(file_path):
            if csv.startswith('B_true'):
                csv_path = os.path.join(file_path, csv)
                dg.visualize_dag(csv_path)

In [1]:
import pickle

pk = '/home/jina/reprod/data/pickle/simple'
with open(pk, 'rb') as f:
    train_data = pickle.load(f)

pk_v = '/home/jina/reprod/data/pickle/simple_test'
with open(pk_v, 'rb') as f:
    valid_data = pickle.load(f)

# train-test split
# train_data = data[:2400]
# valid_data = data[2400:]

In [None]:
# train_data = train_data[1000:]

In [5]:
len(train_data)

2400

In [6]:
len(valid_data)

600

In [6]:
from torch_geometric.nn import SAGEConv, BatchNorm
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np 
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers=3, residual=True):
        super(GraphSAGE, self).__init__()
        self.num_layers = num_layers
        self.residual = residual
        
        # Create convolutional layers dynamically based on num_layers
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        
        # First layer
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        self.bns.append(BatchNorm(hidden_channels))
        
        # Hidden layers
        for _ in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
            self.bns.append(BatchNorm(hidden_channels))
        
        # Last layer
        self.convs.append(SAGEConv(hidden_channels, out_channels))
        
        # Linear layer for dimension matching if needed
        if residual and in_channels != out_channels:
            self.dim_match = nn.Linear(in_channels, out_channels)
        else:
            self.dim_match = None

    def encode(self, x, edge_index):
        x_res = x  # For potential residual connection
        
        for i in range(self.num_layers - 1):
            x = self.convs[i](x, edge_index)
            x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=0.3, training=self.training)
        
        # Apply the last convolutional layer without activation for output
        x = self.convs[-1](x, edge_index)
        
        # Residual connection after the last hidden layer if enabled
        if self.residual and self.num_layers > 2:
            if self.dim_match is not None:
                x_res = self.dim_match(x_res)  # Adjust dimension if necessary
            x += x_res

        return x
    
    def decode(self, z, edge_index):
        z_product = torch.sum(z[edge_index[0]] * z[edge_index[1]], dim=1)
        return z_product / (torch.norm(z[edge_index[0]], dim=1) * torch.norm(z[edge_index[1]], dim=1) + 1e-6)  

    def forward(self, x, edge_index):
        z = self.encode(x, edge_index)
        return z

In [11]:

# Random seed setting
def seed_everything(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

seed_everything(47)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, optimizer, loss initialization
in_channels = train_data[0].num_features
hidden_channels = 64  # Increased hidden channel size
out_channels = 32
model = GraphSAGE(in_channels, hidden_channels, out_channels, num_layers=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)  # Adjusted learning rate for stability
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
criterion = nn.BCEWithLogitsLoss()  # Using BCEWithLogitsLoss since sigmoid is not applied in decode

def train(train_data):
    model.train()
    total_loss = 0
    total_samples = 0
    all_scores = []
    all_labels = []
    for data in train_data:
        data = data.to(device)
        optimizer.zero_grad()
        edge_index = data.edge_index
        z = model(data.x.float(), edge_index)
        # print(z.shape)
        data.y[data.y==2] = 1
        
        score = model.decode(z, edge_index)

        all_scores.append(score)
        all_labels.append(data.y)

        loss = criterion(score, data.y)
        loss.backward()
        # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item() * (data.y.size(0))
        total_samples += data.y.size(0)
        
    all_scores = torch.cat(tuple(all_scores), dim=0).detach().cpu().numpy()
    all_labels = torch.cat(tuple(all_labels), dim=0).detach().cpu().numpy()
    predictions = (torch.sigmoid(torch.tensor(all_scores)) > 0.5).int().numpy()

    accuracy = accuracy_score(all_labels, predictions)
    f1 = f1_score(all_labels, predictions)

    return total_loss / total_samples, accuracy, f1

# Testing loop
def test(valid_data):
    model.eval()
    all_scores = []
    all_labels = []
    with torch.no_grad():
        for data in valid_data:
            data = data.to(device)
            edge_index = data.edge_index
            data.y[data.y==2] = 1
            z = model(data.x.float(), edge_index)

            out = model.decode(z, edge_index).view(-1).sigmoid()
            
            # Append results to all_scores and all_labels
            all_scores.extend(out.cpu().numpy())
            all_labels.extend(data.y.view(-1).cpu().numpy())  # Assuming data.y is the label tensor

    # Ensure labels are binary
    if len(np.unique(all_labels)) > 2:
        raise ValueError("Expected binary labels but found multi-class labels. Check your data.")
        
    # Calculate metrics
    auc = roc_auc_score(all_labels, all_scores, average="macro")
    predictions = [1 if score > 0.5 else 0 for score in all_scores]
    accuracy = accuracy_score(all_labels, predictions)
    f1 = f1_score(all_labels, predictions)

    return auc, accuracy, f1, predictions, all_labels

# Run training for all epochs
epochs = 200
for epoch in range(epochs):
    train_loss, train_accuracy, train_f1 = train(train_data)
    scheduler.step(train_loss)  # Update learning rate based on training loss
    if epoch % 10 == 0:
        auc, accuracy, f1, predicted, all_labels = test(valid_data)
        print(f'Epoch {epoch}, Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}, Test AUC: {auc:.4f}, Test Accuracy: {accuracy:.4f}, Test F1 Score: {f1:.4f}')


# Final Test
auc, accuracy, f1, predicted, all_labels = test(valid_data)
print(f'Final Test AUC: {auc:.4f}, Test Accuracy: {accuracy:.4f}, Test F1 Score: {f1:.4f}')




Epoch 0, Loss: 0.6423, Train Accuracy: 0.6542, Train F1 Score: 0.6835, Test AUC: 0.3098, Test Accuracy: 0.5035, Test F1 Score: 0.6214
Epoch 10, Loss: 0.6412, Train Accuracy: 0.6525, Train F1 Score: 0.6709, Test AUC: 0.3790, Test Accuracy: 0.5440, Test F1 Score: 0.6622
Epoch 20, Loss: 0.6210, Train Accuracy: 0.6849, Train F1 Score: 0.7014, Test AUC: 0.4270, Test Accuracy: 0.5110, Test F1 Score: 0.6203
Epoch 30, Loss: 0.6238, Train Accuracy: 0.6769, Train F1 Score: 0.6905, Test AUC: 0.4074, Test Accuracy: 0.5170, Test F1 Score: 0.6293
Epoch 40, Loss: 0.6084, Train Accuracy: 0.7010, Train F1 Score: 0.7156, Test AUC: 0.4485, Test Accuracy: 0.5405, Test F1 Score: 0.6302
Epoch 50, Loss: 0.6047, Train Accuracy: 0.7005, Train F1 Score: 0.7101, Test AUC: 0.3793, Test Accuracy: 0.5080, Test F1 Score: 0.6000
Epoch 60, Loss: 0.5884, Train Accuracy: 0.7171, Train F1 Score: 0.7326, Test AUC: 0.3766, Test Accuracy: 0.5035, Test F1 Score: 0.5975
Epoch 70, Loss: 0.5433, Train Accuracy: 0.7867, Train F1

KeyboardInterrupt: 

In [12]:
for a, b in zip(predicted, all_labels):
    print(a, b)

0 1.0
0 1.0
1 0.0
0 1.0
1 0.0
1 1.0
1 0.0
1 0.0
0 0.0
1 1.0
1 0.0
0 1.0
1 0.0
0 0.0
0 1.0
1 0.0
1 0.0
1 0.0
0 0.0
1 1.0
1 1.0
1 0.0
0 1.0
0 1.0
0 0.0
1 1.0
1 0.0
1 1.0
0 0.0
0 1.0
1 1.0
0 0.0
0 1.0
0 1.0
1 0.0
0 1.0
1 0.0
1 1.0
0 1.0
0 1.0
1 1.0
1 0.0
1 1.0
0 1.0
1 1.0
0 1.0
1 1.0
1 0.0
1 0.0
1 0.0
0 0.0
1 1.0
1 0.0
0 0.0
1 1.0
1 1.0
0 0.0
1 0.0
1 1.0
1 1.0
0 1.0
0 1.0
1 0.0
0 1.0
1 0.0
1 0.0
0 0.0
1 0.0
0 1.0
1 0.0
1 0.0
1 1.0
1 1.0
0 0.0
1 1.0
0 0.0
1 0.0
1 0.0
0 1.0
0 1.0
1 1.0
1 0.0
0 0.0
1 0.0
1 1.0
0 1.0
1 1.0
1 0.0
0 0.0
1 1.0
0 0.0
1 0.0
1 1.0
1 1.0
0 1.0
1 1.0
0 0.0
0 1.0
1 0.0
1 1.0
0 0.0
1 0.0
1 1.0
0 1.0
1 1.0
0 0.0
1 0.0
1 0.0
0 0.0
0 1.0
1 1.0
0 0.0
0 1.0
1 1.0
0 1.0
1 1.0
1 0.0
1 0.0
0 0.0
1 0.0
1 1.0
0 1.0
1 0.0
1 1.0
1 0.0
1 1.0
0 0.0
1 1.0
1 1.0
1 0.0
0 0.0
1 0.0
1 1.0
1 0.0
0 0.0
0 0.0
1 1.0
0 1.0
1 1.0
1 0.0
1 0.0
1 1.0
1 0.0
1 1.0
1 0.0
1 0.0
1 1.0
1 0.0
0 0.0
1 1.0
1 0.0
1 1.0
1 1.0
1 0.0
1 1.0
1 1.0
1 0.0
1 1.0
0 0.0
0 0.0
0 1.0
0 1.0
1 1.0
1 0.0
0 0.0
1 0.0
0 0.

In [13]:
# Convert to numpy arrays for counting values and cast to integer type
predicted_flat = np.array(predicted).astype(int)
all_labels_flat = np.array(all_labels).astype(int)

# Count values
count_predicted = np.bincount(predicted_flat)
count_labels = np.bincount(all_labels_flat)

# Print counts
print("Predicted counts:", count_predicted)
print("Actual labels counts:", count_labels)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(all_labels_flat, predicted_flat)
precision = precision_score(all_labels_flat, predicted_flat)
recall = recall_score(all_labels_flat, predicted_flat)
f1 = f1_score(all_labels_flat, predicted_flat)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

tn, fp, fn, tp = confusion_matrix(all_labels_flat, predicted_flat).ravel()
print(tn, fp, fn, tp)

Predicted counts: [ 755 1245]
Actual labels counts: [1017  983]
Accuracy: 0.5240
Precision: 0.5124
Recall: 0.6490
F1 Score: 0.5727
410 607 345 638


In [7]:
# Random seed setting
def seed_everything(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

In [8]:
import torch
import torch.nn as nn
from torch_geometric.loader import DataLoader
from model import GraphSAGE
from sklearn.metrics import f1_score
from collections import Counter

batch_size = 4
epochs = 20
num_layers = 3
lr = 0.0001
num_neighbor = 2
threshold = 0.5
# seed_everything(47)
seed_everything(4)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)

loader_type = {
    "train": train_loader,
    "test": valid_loader
}

node_dim = train_data[0].x.size(1)
edge_dim = train_data[0].edge_attr.size(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
model = GraphSAGE(node_dim, edge_dim, num_layers=num_layers, output_class=2, device=device, num_samples=None).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# criterion = nn.CrossEntropyLoss() 

best_test_f1 = 0
best_test_acc = 0
best_test_loss = 9999


for epoch in range(epochs):

    for mode in ["train", "test"]:
        running_loss = 0
        running_acc = 0
        total_samples = 0
        all_predictions = []
        all_labels = []

        if mode == 'train':
            model.train()
        elif mode == 'test':
            model.eval()
        
        for data in loader_type[mode]:
         
            data.x = data.x.float()
            data.y = data.y.long()

            data.y[data.y == 2] = 1

            # class_counts = Counter(data.y.numpy())
            # print(class_counts)
            # total_count = sum(class_counts.values())
            # print(total_count)
            # class_weights = {cls: total_count / count for cls, count in class_counts.items()}
            # weights = torch.tensor([class_weights[cls] for cls in range(3)], dtype=torch.float).to(device)

            data = data.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(mode == 'train'):
                logits = model(data)
                # if epoch == 9:
                #     print(torch.sigmoid(logits))
                preds = (torch.sigmoid(logits) > threshold).float()
                # pos_weight = torch.tensor([3.0]).to(device)
                # loss = focal_loss(logits, data.y.view(-1, 1).float())
                loss = nn.BCEWithLogitsLoss()(logits, data.y.view(-1, 1).float())
                # loss = nn.CrossEntropyLoss()(logits, data.y)
                # loss = nn.CrossEntropyLoss(weight=weights)(logits, data.y)
                # loss = F1Loss(classes=3)(logits, data.y)
                # loss = WeightedF1Loss(classes=3)(logits, data.y, class_weights=weights)
                
                # _, preds = torch.max(logits, 1)

                if mode == 'train':
                    loss.backward()
                    # 각 파라미터의 기울기를 출력
                    # print(f"Epoch {epoch}: Gradients")
                    # for name, param in model.named_parameters():
                    #     if param.grad is not None:
                    #         print(f"  Layer: {name} | Gradient Mean: {param.grad.mean()} | Gradient Std: {param.grad.std()}")
    
                    optimizer.step()


                all_predictions.extend(preds.cpu().numpy())
                all_labels.extend(data.y.cpu().numpy())

            running_loss += loss.item() * data.y.size(0)
            # print(torch.sum(preds == data.y), data.y.size(0))
            # running_acc += torch.sum(preds == data.y).item()
            running_acc += torch.sum(preds.squeeze() == data.y).item()
            total_samples += data.y.size(0)

        epoch_loss = running_loss / total_samples
        epoch_acc = running_acc / total_samples
        epoch_f1 = f1_score(all_labels, all_predictions, average='binary')
        print(f'epoch: {epoch+1}, {mode} loss: {epoch_loss:.3f}, acc: {epoch_acc:.3f} f1: {epoch_f1:.3f}')

        if mode == 'test' and best_test_f1 < epoch_f1:
            best_test_f1 = epoch_f1

        if mode == 'test' and best_test_acc < epoch_acc:
            best_test_acc = epoch_acc

        if mode == 'test' and best_test_loss > epoch_loss:
            # file_name = f'epoch{epoch+1:04d}_{str(epoch_loss)[:5].replace(".","")}'
            # save_model(model, saved_dir, f'{file_name}.pth')
            best_test_loss = epoch_loss

print("training end!!")
print(f"best f1: {best_test_f1}, best acc: {best_test_acc}, best loss: {best_test_loss}")

epoch: 1, train loss: 0.354, acc: 0.886 f1: 0.879
epoch: 1, test loss: 0.080, acc: 0.978 f1: 0.978
epoch: 2, train loss: 0.081, acc: 0.979 f1: 0.979
epoch: 2, test loss: 0.043, acc: 0.987 f1: 0.987
epoch: 3, train loss: 0.050, acc: 0.987 f1: 0.987
epoch: 3, test loss: 0.036, acc: 0.992 f1: 0.992
epoch: 4, train loss: 0.050, acc: 0.986 f1: 0.986
epoch: 4, test loss: 0.021, acc: 0.992 f1: 0.992
epoch: 5, train loss: 0.037, acc: 0.988 f1: 0.989
epoch: 5, test loss: 0.025, acc: 0.993 f1: 0.993
epoch: 6, train loss: 0.030, acc: 0.990 f1: 0.990
epoch: 6, test loss: 0.021, acc: 0.993 f1: 0.993
epoch: 7, train loss: 0.030, acc: 0.991 f1: 0.991
epoch: 7, test loss: 0.017, acc: 0.995 f1: 0.995
epoch: 8, train loss: 0.022, acc: 0.993 f1: 0.993
epoch: 8, test loss: 0.022, acc: 0.993 f1: 0.993
epoch: 9, train loss: 0.019, acc: 0.994 f1: 0.994
epoch: 9, test loss: 0.014, acc: 0.995 f1: 0.995
epoch: 10, train loss: 0.020, acc: 0.994 f1: 0.994
epoch: 10, test loss: 0.020, acc: 0.993 f1: 0.993
epoch: 1

In [9]:
for a, b in zip(all_predictions, all_labels):
    print(a, b)

[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0

In [10]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 정확도 계산
accuracy = accuracy_score(all_labels, all_predictions)

# F1 점수 계산 (binary classification이므로 average='binary')
f1 = f1_score(all_labels, all_predictions)

# 혼동 행렬 계산
tn, fp, fn, tp = confusion_matrix(all_labels, all_predictions).ravel()

# 결과 출력
print(f"Accuracy: {accuracy:}")
print(f"F1 Score: {f1:}")
print("Confusion Matrix:")
print(tn, fp, fn, tp)


Accuracy: 0.9933333333333333
F1 Score: 0.9933110367892977
Confusion Matrix:
299 0 4 297


In [None]:
import torch
import torch.nn as nn
from torch_geometric.loader import DataLoader
from model import GraphSAGE
from sklearn.metrics import f1_score

batch_size = 4
epochs = 50
num_layers = 3
lr = 0.0001
num_neighbor = 2
threshold = 0.5
seed_everything(4)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

loader_type = {
    "train": train_loader,
    "test": valid_loader
}

node_dim = train_data[0].x.size(1)
edge_dim = train_data[0].edge_attr.size(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphSAGE(node_dim, edge_dim, num_layers=num_layers, output_class=2, device=device, num_samples=None).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

best_test_f1 = 0
best_test_acc = 0
best_test_loss = 9999

for epoch in range(epochs):
    # To store predictions and labels
    saved_predictions = {"train": [], "test": []}
    saved_labels = {"train": [], "test": []}
    
    for mode in ["train", "test"]:
        running_loss = 0
        running_acc = 0
        total_samples = 0
        all_predictions = []
        all_labels = []

        if mode == 'train':
            model.train()
        else:
            model.eval()
        
        for data in loader_type[mode]:
            data.x = data.x.float()
            data.y = data.y.long()
            data.y[data.y == 2] = 1  # Optional: Map class 2 to class 1

            data = data.to(device)
            optimizer.zero_grad()

            with torch.set_grad_enabled(mode == 'train'):
                logits = model(data)
                preds = (torch.sigmoid(logits) > threshold).float()
                loss = nn.BCEWithLogitsLoss()(logits, data.y.view(-1, 1).float())

                if mode == 'train':
                    loss.backward()
                    optimizer.step()

                all_predictions.extend(preds.cpu().numpy())
                all_labels.extend(data.y.cpu().numpy())

            running_loss += loss.item() * data.y.size(0)
            running_acc += torch.sum(preds.squeeze() == data.y).item()
            total_samples += data.y.size(0)

        epoch_loss = running_loss / total_samples
        epoch_acc = running_acc / total_samples
        epoch_f1 = f1_score(all_labels, all_predictions, average='binary')

        print(f'epoch: {epoch+1}, {mode} loss: {epoch_loss:.3f}, acc: {epoch_acc:.3f}, f1: {epoch_f1:.3f}')

        if mode == 'test' and best_test_f1 < epoch_f1:
            best_test_f1 = epoch_f1

        if mode == 'test' and best_test_acc < epoch_acc:
            best_test_acc = epoch_acc

        if mode == 'test' and best_test_loss > epoch_loss:
            best_test_loss = epoch_loss

        # Save predictions and labels
        saved_predictions[mode].extend(all_predictions)
        saved_labels[mode].extend(all_labels)

print("training end!!")
print(f"best f1: {best_test_f1}, best acc: {best_test_acc}, best loss: {best_test_loss}")

# Print saved predictions and labels after training
# print("\nTrain Predictions:", saved_predictions["train"])
# print("\nTrain Labels:", saved_labels["train"])
# print("\nTest Predictions:", saved_predictions["test"])
# print("\nTest Labels:", saved_labels["test"])


epoch: 1, train loss: 0.692, acc: 0.519, f1: 0.455
epoch: 1, test loss: 0.684, acc: 0.508, f1: 0.000
epoch: 2, train loss: 0.647, acc: 0.606, f1: 0.575
epoch: 2, test loss: 0.514, acc: 0.756, f1: 0.671
epoch: 3, train loss: 0.586, acc: 0.666, f1: 0.599
epoch: 3, test loss: 0.397, acc: 0.841, f1: 0.806
epoch: 4, train loss: 0.547, acc: 0.697, f1: 0.618
epoch: 4, test loss: 0.391, acc: 0.842, f1: 0.808
epoch: 5, train loss: 0.520, acc: 0.721, f1: 0.647
epoch: 5, test loss: 0.339, acc: 0.871, f1: 0.849
epoch: 6, train loss: 0.498, acc: 0.744, f1: 0.674
epoch: 6, test loss: 0.304, acc: 0.919, f1: 0.910
epoch: 7, train loss: 0.475, acc: 0.764, f1: 0.703
epoch: 7, test loss: 0.304, acc: 0.907, f1: 0.896
epoch: 8, train loss: 0.468, acc: 0.770, f1: 0.713
epoch: 8, test loss: 0.265, acc: 0.940, f1: 0.935
epoch: 9, train loss: 0.462, acc: 0.773, f1: 0.715
epoch: 9, test loss: 0.273, acc: 0.918, f1: 0.909
epoch: 10, train loss: 0.455, acc: 0.780, f1: 0.726
epoch: 10, test loss: 0.233, acc: 0.942

In [5]:
for a, b in zip(saved_predictions['train'], saved_labels['train']):
    print(a, b)

[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 1
[0.] 0
[0.] 1
[0.] 0
[0.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 1
[0.] 1
[0.] 1
[0.] 0
[0.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 1
[0.] 0
[0.] 0
[0.] 1
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 1
[1.] 1
[1.] 1
[0.] 1
[0.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 1
[1.] 0
[0.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0

In [6]:
for a, b in zip(saved_predictions['test'], saved_labels['test']):
    print(a, b)

[0.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[0.] 0
[1.] 1
[1.] 1
[0.] 0
[1.] 1
[0.] 0
[0.] 0
[1.] 1

In [7]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# 정확도 계산
accuracy = accuracy_score(saved_labels['test'], saved_predictions['test'])

# F1 점수 계산 (binary classification이므로 average='binary')
f1 = f1_score(saved_labels['test'], saved_predictions['test'])

# 혼동 행렬 계산
tn, fp, fn, tp = confusion_matrix(saved_labels['test'], saved_predictions['test']).ravel()

# 결과 출력
print(f"Accuracy: {accuracy:}")
print(f"F1 Score: {f1:}")
print("Confusion Matrix:")
print(tn, fp, fn, tp)

Accuracy: 0.9555
F1 Score: 0.9525839104954715
Confusion Matrix:
1017 0 89 894


In [9]:
import pickle

pk = '/home/jina/reprod/data/pickle/train_ER2'
# pk = '/home/jina/reprod/data/pickle/train_ER'
# pk = '/home/jina/reprod/data/pickle/dense_test'
# pk = '/home/jina/reprod/data/pickle/train_1'
# pk = '/home/jina/reprod/data/pickle/new_1'
with open(pk, 'rb') as f:
    train_data = pickle.load(f)

# valid_data = train_data
# pk_v = '/home/jina/reprod/data/pickle/valid_ER'
pk_v = '/home/jina/reprod/data/pickle/valid_1'
# pk_v = '/home/jina/reprod/data/pickle/new_1'
with open(pk_v, 'rb') as f:
    valid_data = pickle.load(f)

In [10]:
x = train_data[1].x

In [11]:
edge_index = train_data[1].edge_index

In [12]:
edge_features = train_data[1].edge_attr
num_nodes = 100

In [13]:
x.shape, edge_index.shape, edge_features.shape, num_nodes

(torch.Size([100, 11]), torch.Size([2, 4950]), torch.Size([4950, 100]), 100)

In [14]:
import torch

v = edge_index[0, :] # 0
u = edge_index[1, :] # 1
v_node_features = x[v] # [1, 11]
u_node_features = x[u] # [1, 11]

message = torch.cat([v_node_features, u_node_features, edge_features], dim=-1) # [1, 122]
message_dim = message.size(1) # 122
node_messages = torch.zeros((num_nodes, message_dim)) # [2, 122]

In [15]:
unique_v, counts_v = torch.unique(v, return_counts=True)
unique_v, counts_v

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
         72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
         90, 91, 92, 93, 94, 95, 96, 97, 98]),
 tensor([99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83, 82,
         81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64,
         63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
         45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
         27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10,
          9,  8,  7,  6,  5,  4,  3,  2,  1]))

In [21]:
sampled_edges = []
num_neighbor = 1

for node, count in zip(unique_v, counts_v):
    neighbors_idx = (v == node).nonzero(as_tuple=False).squeeze()
    if neighbors_idx.dim() == 0:
        neighbors_idx = neighbors_idx.unsqueeze(0)

    if count > num_neighbor:
        sampled_neighbors = neighbors_idx[torch.randperm(neighbors_idx.size(0))[:num_neighbor]]
        sampled_edges.append(sampled_neighbors)




In [22]:
sampled_edges = [edge for edge in sampled_edges if edge.numel() > 0]

In [23]:
sampled_edges

[tensor([69]),
 tensor([190]),
 tensor([232]),
 tensor([349]),
 tensor([433]),
 tensor([548]),
 tensor([625]),
 tensor([733]),
 tensor([798]),
 tensor([940]),
 tensor([990]),
 tensor([1117]),
 tensor([1192]),
 tensor([1217]),
 tensor([1335]),
 tensor([1400]),
 tensor([1506]),
 tensor([1547]),
 tensor([1691]),
 tensor([1719]),
 tensor([1853]),
 tensor([1878]),
 tensor([1967]),
 tensor([2042]),
 tensor([2102]),
 tensor([2181]),
 tensor([2275]),
 tensor([2336]),
 tensor([2400]),
 tensor([2498]),
 tensor([2576]),
 tensor([2644]),
 tensor([2704]),
 tensor([2764]),
 tensor([2866]),
 tensor([2873]),
 tensor([2973]),
 tensor([3017]),
 tensor([3065]),
 tensor([3161]),
 tensor([3223]),
 tensor([3247]),
 tensor([3319]),
 tensor([3376]),
 tensor([3420]),
 tensor([3472]),
 tensor([3548]),
 tensor([3597]),
 tensor([3668]),
 tensor([3700]),
 tensor([3740]),
 tensor([3813]),
 tensor([3825]),
 tensor([3895]),
 tensor([3926]),
 tensor([3999]),
 tensor([4041]),
 tensor([4053]),
 tensor([4121]),
 tensor([

In [28]:
v[sampled_edges]

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
        90, 91, 92, 93, 94, 95, 96, 97])

In [29]:
u[sampled_edges]

tensor([70, 93, 38, 59, 48, 69, 53, 69, 43, 95, 56, 95, 83, 22, 55, 36, 59, 18,
        81, 29, 84, 31, 43, 42, 27, 32, 53, 42, 35, 63, 72, 72, 65, 59, 96, 39,
        76, 58, 45, 81, 84, 50, 65, 66, 55, 53, 76, 73, 93, 75, 66, 91, 56, 80,
        66, 95, 94, 64, 91, 81, 64, 91, 71, 80, 71, 81, 68, 74, 96, 95, 87, 96,
        93, 86, 80, 85, 95, 89, 85, 90, 81, 97, 89, 99, 92, 92, 93, 89, 97, 96,
        99, 98, 98, 96, 96, 98, 99, 98])

In [30]:
message.shape, node_messages.shape

(torch.Size([4950, 122]), torch.Size([100, 122]))

In [24]:
node_messages = node_messages.index_add(0, v, message.float())
node_messages.shape

torch.Size([100, 122])

In [25]:
node_messages

tensor([[ 2.4219e+09, -8.2590e+08,  1.4028e+02,  ...,  9.0066e+01,
          1.5048e+03,  1.5048e+03],
        [ 1.9661e+06, -6.0616e+06,  1.3860e+02,  ...,  8.9542e+01,
          1.5194e+03,  1.5194e+03],
        [ 7.2882e+04, -2.2470e+05,  1.4001e+02,  ...,  8.8461e+01,
          1.4943e+03,  1.4943e+03],
        ...,
        [ 2.2930e+01, -1.7264e+01,  2.7799e+00,  ...,  1.7372e+00,
          1.7551e+01,  1.7551e+01],
        [ 1.9082e+12, -6.3658e+11,  1.3644e+00,  ...,  9.9956e-01,
          1.9708e+01,  1.9708e+01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

In [26]:
node_messages = node_messages.index_add(0, u, message.float())
node_messages

tensor([[ 2.4219e+09, -8.2590e+08,  1.4028e+02,  ...,  9.0066e+01,
          1.5048e+03,  1.5048e+03],
        [ 2.6429e+07, -1.4404e+07,  1.4002e+02,  ...,  9.0532e+01,
          1.5380e+03,  1.5380e+03],
        [ 2.4556e+07, -8.6290e+06,  1.4284e+02,  ...,  9.0449e+01,
          1.5325e+03,  1.5325e+03],
        ...,
        [ 4.5223e+12, -6.8071e+12,  1.3750e+02,  ...,  7.6668e+01,
          7.3239e+02,  7.3239e+02],
        [ 6.4305e+12, -7.4437e+12,  1.3748e+02,  ...,  9.0293e+01,
          1.5224e+03,  1.5224e+03],
        [ 6.4305e+12, -7.4437e+12,  1.3748e+02,  ...,  9.0468e+01,
          1.5350e+03,  1.5350e+03]])

In [27]:
num_neighbors = torch.bincount(v, minlength=num_nodes) + torch.bincount(u, minlength=num_nodes) # [2]
num_neighbors

tensor([99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
        99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
        99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
        99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
        99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
        99, 99, 99, 99, 99, 99, 99, 99, 99, 99])

In [28]:
node_messages /= (num_neighbors.unsqueeze(1) + 1e-6)
node_messages

tensor([[ 2.4463e+07, -8.3424e+06,  1.4170e+00,  ...,  9.0976e-01,
          1.5200e+01,  1.5200e+01],
        [ 2.6696e+05, -1.4550e+05,  1.4144e+00,  ...,  9.1446e-01,
          1.5535e+01,  1.5535e+01],
        [ 2.4804e+05, -8.7161e+04,  1.4428e+00,  ...,  9.1362e-01,
          1.5480e+01,  1.5480e+01],
        ...,
        [ 4.5680e+10, -6.8759e+10,  1.3889e+00,  ...,  7.7442e-01,
          7.3978e+00,  7.3978e+00],
        [ 6.4954e+10, -7.5189e+10,  1.3887e+00,  ...,  9.1205e-01,
          1.5377e+01,  1.5377e+01],
        [ 6.4954e+10, -7.5189e+10,  1.3887e+00,  ...,  9.1382e-01,
          1.5505e+01,  1.5505e+01]])

In [3]:
from torch_geometric.data import Data
import torch

def is_data_duplicate(data1, data2):
    # 노드 특징 비교
    if not torch.equal(data1.x, data2.x):
        return False
    # 엣지 인덱스 비교
    if not torch.equal(data1.edge_index, data2.edge_index):
        return False
    # 레이블 비교 (선택적)
    if hasattr(data1, 'y') and hasattr(data2, 'y'):
        if not torch.equal(data1.y, data2.y):
            return False
    return True

def find_duplicates(train_data, valid_data):
    duplicates = []
    for train_idx, train_item in enumerate(train_data):
        for valid_idx, valid_item in enumerate(valid_data):
            if is_data_duplicate(train_item, valid_item):
                duplicates.append((train_idx, valid_idx))
    return duplicates


duplicates = find_duplicates(train_data, valid_data)
print("Duplicate data indices:", duplicates)

Duplicate data indices: []
