# Implementacja sieci GCN

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch_geometric.nn import GCNConv

from torch_geometric.utils.convert import from_networkx

from src.trainer import get_default_trainer
from src.utils import evaluate_embeddings
from src.supervised import SupervisedNodeClassificationGNN
from src.dataset import EmailEUCore
from src.dataloader import GraphData

import networkx as nx
import pandas as pd
from pathlib import Path
from gensim.models import KeyedVectors


In [None]:
# Project-relative directories used by the cells below, plus the shared seed.
# Inputs: node embeddings and the graph/dataset files.
EMBBEDDINGS_PATH, DATA_PATH = Path("embbeddings"), Path("data")
# Outputs: training logs (also passed to %tensorboard) and result CSV files.
LOG_DIR, RESULTS_DIR = Path("logs"), Path("results")
# Passed as `random_state` to GraphData and evaluate_embeddings.
RANDOM_STATE = 345

In [None]:
%load_ext tensorboard
%tensorboard --logdir $LOG_DIR

In [None]:
class GCNModel(nn.Module):
    """Two-stage graph convolutional encoder.

    Applies ``GCNConv(in_dim, hidden_dim)`` and ``GCNConv(hidden_dim, out_dim)``
    in sequence, each followed by a ReLU. Because the final stage is also a
    ReLU, the returned values are non-negative.
    """

    def __init__(self, in_dim: int, hidden_dim: int, out_dim: int):
        super().__init__()
        # Sub-modules are registered in the order they are applied in forward().
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.act1 = nn.ReLU()
        self.conv2 = GCNConv(hidden_dim, out_dim)
        self.act2 = nn.ReLU()

    def forward(self, x, edge_index):
        """Run both conv/ReLU stages on ``x`` using ``edge_index``.

        Both arguments are forwarded unchanged to the ``GCNConv`` layers
        (presumably node features and graph connectivity in the
        torch_geometric convention -- confirm against the data loader).
        """
        hidden = self.act1(self.conv1(x, edge_index))
        return self.act2(self.conv2(hidden, edge_index))

In [None]:
def evaluate_gcn(trainer, model):
    """Score a fitted model on the test split and plot its embeddings.

    Relies on the notebook-level ``datamodule`` and ``RANDOM_STATE`` that are
    in scope when the function is called.

    Args:
        trainer: Object exposing ``test(...)`` and ``predict(...)``; in this
            notebook it is the trainer built by ``get_default_trainer``.
        model: The fitted model handed to ``trainer.test`` / ``trainer.predict``.

    Returns:
        The mapping returned by ``evaluate_embeddings``, extended in place
        with the ``"f1_test"`` and ``"auc_test"`` entries.
    """
    # Only the first entry of the test results is used.
    test_metrics = trainer.test(model=model, datamodule=datamodule, verbose=False)[0]
    auc_score = test_metrics["test/auc"]
    f1_score = test_metrics["test/f1"]

    print(f"Test f1 score: {f1_score}")

    # Likewise, only the first element of the prediction output is unpacked.
    z, y = trainer.predict(model=model, datamodule=datamodule)[0]
    fig, data = evaluate_embeddings(z=z, y=y, random_state=RANDOM_STATE)
    fig.suptitle(f"GCN - test AUC: {auc_score * 100.:.2f} [%]")
    plt.show()
    # Release the figure once it has been shown: this function is called many
    # times by the experiment loops, and unclosed figures accumulate in pyplot
    # on backends that do not close them automatically.
    # NOTE(review): assumes `fig` is a matplotlib Figure -- confirm against
    # evaluate_embeddings.
    plt.close(fig)

    data["f1_test"] = f1_score
    data["auc_test"] = auc_score

    return data

# Eksperymenty

## Wektory o rozmiarze 128

In [3]:
# Configuration of the 128-dimensional run; read as a global by train_gcn()
# and by the datamodule cell that follows.
hparams = {
    "num_epochs": 400,  # forwarded to get_default_trainer
    "hidden_dim": 64,  # width of the first GCN layer
    "emb_dim": 128,  # GCN output size; also selects the dataset/embedding files
    "lr": 3e-2,  # forwarded to SupervisedNodeClassificationGNN
    "model_name": "Supervised_GCN_128",  # forwarded to get_default_trainer
}

In [None]:
# Data module for the embedding size selected by hparams["emb_dim"]: the
# dataset root, the training TSV and the embedding file name are all derived
# from that value. The resulting `datamodule` is read as a global by
# train_gcn() and evaluate_gcn().
datamodule = GraphData(
    root_path=f"dataset_{hparams['emb_dim']}",
    graph_path=DATA_PATH / 'network.gpickle',
    dataset_path=DATA_PATH / f"train_data_{hparams['emb_dim']}.tsv",
    embbeddings_path=EMBBEDDINGS_PATH / f"embbeddings_{hparams['emb_dim']}.graphvectors",
    random_state=RANDOM_STATE
)

In [None]:
def train_gcn():
    """Build and fit a supervised GCN from the notebook-level configuration.

    Takes no arguments: the module-level ``hparams`` and ``datamodule`` are
    read at call time, so whichever values are currently defined are used.

    Returns:
        A ``(trainer, model)`` tuple: the trainer after ``fit`` has run and
        the fitted ``SupervisedNodeClassificationGNN``.
    """
    encoder = GCNModel(
        in_dim=datamodule.num_node_features,
        hidden_dim=hparams["hidden_dim"],
        out_dim=hparams["emb_dim"],
    )
    # The wrapper receives the encoder together with its output size.
    classifier = SupervisedNodeClassificationGNN(
        gnn=encoder,
        emb_dim=hparams["emb_dim"],
        num_classes=datamodule.num_classes,
        lr=hparams["lr"],
    )
    fitter = get_default_trainer(
        num_epochs=hparams["num_epochs"],
        model_name=hparams["model_name"],
    )
    fitter.fit(model=classifier, datamodule=datamodule)
    return fitter, classifier
    

In [None]:
# Fit a model with the `hparams` / `datamodule` currently defined above.
trainer, model = train_gcn()

In [None]:
# Evaluate the fitted model; the returned metrics become the cell output.
evaluate_gcn(trainer, model)

## Wektory o rozmiarze 64

In [None]:
# Configuration of the 64-dimensional run; replaces the previous global
# `hparams` read by train_gcn() and by the datamodule cell that follows.
hparams = {
    "num_epochs": 200,  # forwarded to get_default_trainer
    "hidden_dim": 128,  # width of the first GCN layer
    "emb_dim": 64,  # GCN output size; also selects the dataset/embedding files
    "lr": 5e-2,  # forwarded to SupervisedNodeClassificationGNN
    "model_name": "Supervised_GCN_64",  # forwarded to get_default_trainer
}

In [None]:
# Rebuild the data module for the new hparams["emb_dim"]; the dataset root,
# training TSV and embedding file names all follow that value. Replaces the
# global `datamodule` read by train_gcn() and evaluate_gcn().
datamodule = GraphData(
    root_path=f"dataset_{hparams['emb_dim']}",
    graph_path=DATA_PATH / 'network.gpickle',
    dataset_path=DATA_PATH / f"train_data_{hparams['emb_dim']}.tsv",
    embbeddings_path=EMBBEDDINGS_PATH / f"embbeddings_{hparams['emb_dim']}.graphvectors",
    random_state=RANDOM_STATE
)

In [None]:
def train_gcn():
    """Build and fit a supervised GCN from the notebook-level configuration.

    This re-declares ``train_gcn`` with the same body as the definition
    earlier in the notebook. It reads the module-level ``hparams`` and
    ``datamodule`` at call time.

    Returns:
        A ``(trainer, model)`` tuple: the trainer after ``fit`` has run and
        the fitted ``SupervisedNodeClassificationGNN``.
    """
    backbone = GCNModel(
        in_dim=datamodule.num_node_features,
        hidden_dim=hparams["hidden_dim"],
        out_dim=hparams["emb_dim"],
    )
    # The wrapper receives the backbone together with its output size.
    wrapped = SupervisedNodeClassificationGNN(
        gnn=backbone,
        emb_dim=hparams["emb_dim"],
        num_classes=datamodule.num_classes,
        lr=hparams["lr"],
    )
    runner = get_default_trainer(
        num_epochs=hparams["num_epochs"],
        model_name=hparams["model_name"],
    )
    runner.fit(model=wrapped, datamodule=datamodule)
    return runner, wrapped
    

In [None]:
# Fit a model with the 64-dimensional `hparams` / `datamodule` defined above.
trainer, model = train_gcn()

In [None]:
# Evaluate the fitted model and keep the returned metrics in `data`.
data = evaluate_gcn(trainer, model)

## Wpływ rozmiaru warstwy ukrytej

In [None]:
# Hidden-layer width sweep: for every width, train and evaluate ten models
# with 64-dimensional embeddings and store each run's metrics as one CSV row.
results_dict_exp = {
    "model_name": [],
    "f1_test": [],
    "auc_test": [],
    "silhoute": [],
    "davies-bouldin": [],
}


hidden_dims = [64, 128, 256, 512]
for hidden_dim in hidden_dims:
    # The configuration depends only on the width, so build it once per width
    # rather than once per repetition.
    hparams = {
        "num_epochs": 200,
        "hidden_dim": hidden_dim,
        "emb_dim": 64,
        "lr": 2e-2,
        "model_name": f"GCN_64_hd_{hidden_dim}",
    }
    for _ in range(10):
        # A fresh data module is created for every run, as in the other
        # experiment loops. evaluate_gcn() reads this global `datamodule`.
        datamodule = GraphData(
            root_path=f"dataset_{hparams['emb_dim']}",
            graph_path=DATA_PATH / 'network.gpickle',
            dataset_path=DATA_PATH / f"train_data_{hparams['emb_dim']}.tsv",
            embbeddings_path=EMBBEDDINGS_PATH / f"embbeddings_{hparams['emb_dim']}.graphvectors",
            random_state=RANDOM_STATE,
        )

        gnn = GCNModel(
            in_dim=datamodule.num_node_features,
            hidden_dim=hparams["hidden_dim"],
            out_dim=hparams["emb_dim"],
        )

        model = SupervisedNodeClassificationGNN(
            gnn=gnn,
            emb_dim=hparams["emb_dim"],
            num_classes=datamodule.num_classes,
            lr=hparams["lr"],
        )

        trainer = get_default_trainer(
            num_epochs=hparams["num_epochs"],
            model_name=hparams["model_name"],
        )

        trainer.fit(model=model, datamodule=datamodule)
        data = evaluate_gcn(trainer, model)

        results_dict_exp["model_name"].append(hparams["model_name"])
        for metric in ("f1_test", "auc_test", "silhoute", "davies-bouldin"):
            results_dict_exp[metric].append(data[metric])

results_df_exp = pd.DataFrame(results_dict_exp)
# to_csv does not create missing parent directories; make sure the output
# directory exists so the results of all training runs are not lost at the
# very last step.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
results_df_exp.to_csv(RESULTS_DIR / "gcn_experiment_hidden_dim.csv", index=False)

## Wpływ rozmiaru wektora reprezentacji

In [None]:
# Embedding-size sweep: for every embedding dimension, train and evaluate ten
# models and collect each run's metrics. "model_name" is filled in per
# dimension inside the loop.
hparams = {"num_epochs": 400, "lr": 2e-2, "hidden_dim": 32}
results_dict_exp = {
    column: []
    for column in ("model_name", "f1_test", "auc_test", "silhoute", "davies-bouldin")
}

embbeding_dims = [2, 4, 8, 16, 32]
for embb_dim in embbeding_dims:
    hparams["model_name"] = f"Supervised_GCN_{embb_dim}"
    for _ in range(10):
        # The dataset root, TSV and embedding file are selected by the
        # dimension. evaluate_gcn() reads this global `datamodule`.
        datamodule = GraphData(
            root_path=f"dataset_{embb_dim}",
            graph_path=DATA_PATH / 'network.gpickle',
            dataset_path=DATA_PATH / f"train_data_{embb_dim}.tsv",
            embbeddings_path=EMBBEDDINGS_PATH / f"embbeddings_{embb_dim}.graphvectors",
            random_state=RANDOM_STATE,
        )

        gnn = GCNModel(
            in_dim=datamodule.num_node_features,
            hidden_dim=hparams["hidden_dim"],
            out_dim=embb_dim,
        )

        model = SupervisedNodeClassificationGNN(
            gnn=gnn,
            emb_dim=embb_dim,
            num_classes=datamodule.num_classes,
            lr=hparams["lr"],
        )

        trainer = get_default_trainer(
            num_epochs=hparams["num_epochs"],
            model_name=hparams["model_name"],
        )

        trainer.fit(model=model, datamodule=datamodule)
        data = evaluate_gcn(trainer, model)

        # One row per run: the run label first, then the metrics in column order.
        results_dict_exp["model_name"].append(hparams["model_name"])
        for metric in ("f1_test", "auc_test", "silhoute", "davies-bouldin"):
            results_dict_exp[metric].append(data[metric])
        

In [None]:
# Persist the embedding-size sweep. to_csv does not create missing parent
# directories, so make sure the output directory exists before writing.
results_df = pd.DataFrame(results_dict_exp)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
results_df.to_csv(RESULTS_DIR / "gcn_stats_embbeddings.csv", index=False)

## Zebranie ostatecznych wyników

In [None]:
# Final configurations to benchmark: the 64- and 128-dimensional setups used
# earlier in the notebook. Each entry is used as `hparams` by the loop below.
params = [
    {
        "num_epochs": 200,
        "hidden_dim": 128,
        "emb_dim": 64,
        "lr": 5e-2,
        "model_name": "Supervised_GCN_64",
    },
    {
        "num_epochs": 400,
        "hidden_dim": 64,
        "emb_dim": 128,
        "lr": 3e-2,
        "model_name": "Supervised_GCN_128",
    },
]

In [None]:
# Final benchmark: train and evaluate ten models for every configuration in
# `params`, collecting each run's metrics as one row.
results_dict = {
    column: []
    for column in ("model_name", "f1_test", "auc_test", "silhoute", "davies-bouldin")
}

for hparams in params:
    for _ in range(10):
        # The dataset root, TSV and embedding file follow hparams["emb_dim"].
        # evaluate_gcn() reads this global `datamodule`.
        datamodule = GraphData(
            root_path=f"dataset_{hparams['emb_dim']}",
            graph_path=DATA_PATH / 'network.gpickle',
            dataset_path=DATA_PATH / f"train_data_{hparams['emb_dim']}.tsv",
            embbeddings_path=EMBBEDDINGS_PATH / f"embbeddings_{hparams['emb_dim']}.graphvectors",
            random_state=RANDOM_STATE,
        )

        gnn = GCNModel(
            in_dim=datamodule.num_node_features,
            hidden_dim=hparams["hidden_dim"],
            out_dim=hparams["emb_dim"],
        )

        model = SupervisedNodeClassificationGNN(
            gnn=gnn,
            emb_dim=hparams["emb_dim"],
            num_classes=datamodule.num_classes,
            lr=hparams["lr"],
        )

        trainer = get_default_trainer(
            num_epochs=hparams["num_epochs"],
            model_name=hparams["model_name"],
        )

        trainer.fit(model=model, datamodule=datamodule)
        data = evaluate_gcn(trainer, model)

        # One row per run: the run label first, then the metrics in column order.
        results_dict["model_name"].append(hparams["model_name"])
        for metric in ("f1_test", "auc_test", "silhoute", "davies-bouldin"):
            results_dict[metric].append(data[metric])

In [None]:
# Show the metrics collected by the final benchmark loop as the cell output.
results_dict

In [None]:
# Persist the final benchmark. to_csv does not create missing parent
# directories, so make sure the output directory exists before writing.
results_df = pd.DataFrame(results_dict)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
results_df.to_csv(RESULTS_DIR / "gcn_stats.csv", index=False)