In [2]:
import torch
from sklearn.metrics import f1_score
from OpenAttMultiGL.utils.dataset import dataset
from OpenAttMultiGL.utils.process import * 
import datetime
import errno
import os
import pickle
import random
from pprint import pprint
from sklearn.metrics import normalized_mutual_info_score, pairwise, f1_score
from sklearn.cluster import KMeans

import dgl


from dgl.data.utils import _get_dgl_url, download, get_download_dir
import numpy as np

def score(logits, labels, num_classes):
    """Compute accuracy, micro/macro F1 and NMI for a set of predictions.

    Args:
        logits: Tensor of per-class scores with the classes along dim 1; the
            predicted class of each row is the index of its largest entry.
        labels: Tensor of target labels, compared element-wise against the
            predicted class indices.
        num_classes: Forwarded unchanged to ``run_kmeans``.

    Returns:
        Tuple ``(sim, micro_f1, macro_f1, nmi)``, where ``sim`` is the
        fraction of predictions that equal their label.
    """
    # torch.max along a dim yields (values, indices); only the indices matter.
    predicted = torch.max(logits, dim=1)[1].long().cpu().numpy()
    truth = labels.cpu().numpy()

    hits = (predicted == truth).sum()
    sim = hits / len(predicted)

    f1_by_average = {
        average: f1_score(truth, predicted, average=average)
        for average in ("micro", "macro")
    }
    nmi = run_kmeans(truth, predicted, num_classes)

    return sim, f1_by_average["micro"], f1_by_average["macro"], nmi


def evaluate(model, g, features, labels, mask, loss_func, num_classes):
    """Run ``model`` in evaluation mode and score it on the masked entries.

    Args:
        model: Callable module invoked as ``model(g, features)``.
        g: Graph input passed straight through to ``model``.
        features: Feature input passed straight through to ``model``.
        labels: Target labels, indexed by ``mask``.
        mask: Index/mask selecting the rows of the logits and labels to score.
        loss_func: Called on the masked logits and labels; its result is not
            returned.
        num_classes: Forwarded to ``score``.

    Returns:
        Tuple ``(micro_f1, macro_f1, nmi, sim)``. Note that this order differs
        from the tuple returned by ``score``.
    """
    model.eval()
    with torch.no_grad():
        logits = model(g, features)

    selected_logits = logits[mask]
    selected_labels = labels[mask]

    # The loss is still evaluated, but its value is not part of the result.
    loss_func(selected_logits, selected_labels)

    sim, micro_f1, macro_f1, nmi = score(
        selected_logits, selected_labels, num_classes
    )
    return micro_f1, macro_f1, nmi, sim

def run_kmeans(y, y_pred, k):
    """Print and return the NMI between ``y`` and ``y_pred``.

    Despite its name, this function does not fit a K-means model: the
    clustering calls were already disabled, so the score is computed directly
    from the supplied predictions. Because that score is deterministic, it is
    computed once (instead of five identical repetitions) and the reported
    spread is always zero.

    Args:
        y: Reference labels.
        y_pred: Labels to compare against ``y``.
        k: Unused; kept so existing callers keep working.

    Returns:
        The normalized mutual information (arithmetic averaging), rounded to
        four decimal places, as a NumPy float.
    """
    raw_nmi = normalized_mutual_info_score(
        y, y_pred, average_method='arithmetic'
    )
    # Round to four decimals, matching the precision of the printed report.
    mean = np.float64(float("{:.4f}".format(raw_nmi)))
    std = 0.0  # a single deterministic measurement has no spread
    print('\t[Clustering] NMI: {:.4f} | {:.4f}'.format(mean, std))
    return mean

# Per-dataset file location and the adjacency-matrix keys, listed in the order
# the resulting graphs are handed to the model.
_HAN_DATASETS = {
    "amazon": (
        'OpenAttMultiGL/data/HAN/AMAZON/amazon.pkl',
        ("IVI", "IBI", "IOI"),
    ),
    "acm": (
        'OpenAttMultiGL/data/HAN/ACM/acm.mat',
        ("PAP", "PLP"),
    ),
    "dblp": (
        'OpenAttMultiGL/data/HAN/DBLP/dblp.pkl',
        ("PAP", "PPrefP", "PATAP"),
    ),
    "imdb": (
        'OpenAttMultiGL/data/HAN/IMDB/imdb.pkl',
        ("MDM", "MAM"),
    ),
}


def _load_metapath_graphs(dataname):
    """Load the raw data dict for ``dataname`` and build one graph per key."""
    try:
        path, adjacency_keys = _HAN_DATASETS[dataname]
    except KeyError:
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(
                dataname, sorted(_HAN_DATASETS)
            )
        ) from None

    if path.endswith(".mat"):
        data = sio.loadmat(path)
    else:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as handle:
            data = pkl.load(handle)

    gs = []
    for key in adjacency_keys:
        data[key] = sp.csr_matrix(data[key])
        gs.append(dgl.from_scipy(data[key]))
    return data, gs


def main(args):
    """Train a HAN model on the configured dataset and print test metrics.

    The dataset name is currently fixed to ``"dblp"``. Labels and features are
    read from the dataset file, the model is trained for ``args["num_epochs"]``
    full-batch steps, and the test-split metrics are printed.

    Args:
        args: Configuration dict. Keys read here: ``"device"``, ``"hetero"``,
            ``"hidden_units"``, ``"num_heads"``, ``"dropout"``,
            ``"patience"``, ``"lr"``, ``"weight_decay"``, ``"num_epochs"``.

    Raises:
        ValueError: If the dataset name has no entry in ``_HAN_DATASETS``.
    """
    dataname = "dblp"
    c = dataset(dataname)
    data, gs = _load_metapath_graphs(dataname)
    device = args["device"]

    # The class count comes from the loader's label matrix before the labels
    # are replaced by the ones stored in the dataset file.
    num_classes = c.gcn_labels.shape[1]
    c.gcn_labels = torch.from_numpy(data["label"]).long()
    # Column index of each non-zero entry; assumes exactly one non-zero per
    # row (one-hot labels) -- TODO confirm.
    c.gcn_labels = c.gcn_labels.nonzero()[:, 1]
    c.features = torch.from_numpy(data["feature"]).float()

    num_nodes = gs[0].num_nodes()
    train_mask = get_binary_mask(num_nodes, c.train_id)
    val_mask = get_binary_mask(num_nodes, c.valid_id)
    test_mask = get_binary_mask(num_nodes, c.test_id)

    if hasattr(torch, "BoolTensor"):
        train_mask = train_mask.bool()
        val_mask = val_mask.bool()
        test_mask = test_mask.bool()

    c.features = c.features.to(device)
    c.gcn_labels = c.gcn_labels.to(device)
    train_mask = train_mask.to(device)
    val_mask = val_mask.to(device)
    test_mask = test_mask.to(device)

    if args["hetero"]:
        # NOTE(review): this import path differs from the non-hetero one
        # (``OpenAttMultiGL.model.HAN``) -- confirm which package is correct.
        from OpenAttMultiGL.HAN.model_hetero import HAN

        # NOTE(review): ``gs`` is a list of homogeneous graphs in this script;
        # confirm the hetero model accepts that rather than one heterograph.
        model = HAN(
            meta_paths=[["pa", "ap"], ["pf", "fp"]],
            in_size=c.features.shape[1],
            hidden_size=args["hidden_units"],
            out_size=num_classes,
            num_heads=args["num_heads"],
            dropout=args["dropout"],
        ).to(device)
    else:
        from OpenAttMultiGL.model.HAN.model import HAN

        model = HAN(
            num_meta_paths=len(gs),
            in_size=c.features.shape[1],
            hidden_size=args["hidden_units"],
            out_size=num_classes,
            num_heads=args["num_heads"],
            dropout=args["dropout"],
        ).to(device)

    # ``gs`` is always a Python list here, so each graph is moved individually.
    gs = [graph.to(device) for graph in gs]

    # Per-epoch validation and early stopping are currently disabled, so the
    # stopper is created but not consulted inside the loop.
    stopper = EarlyStopping(patience=args["patience"])
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=args["lr"], weight_decay=args["weight_decay"]
    )

    for epoch in range(args["num_epochs"]):
        model.train()
        logits = model(gs, c.features)
        loss = loss_fcn(logits[train_mask], c.gcn_labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    test_micro_f1, test_macro_f1, nmi, sim = evaluate(
        model, gs, c.features, c.gcn_labels, test_mask, loss_fcn, num_classes
    )
    print(
        "Micro f1 {:.4f} | Macro f1 {:.4f} | NMI {:.4f} | SIM {:.4f}".format(
            test_micro_f1, test_macro_f1, nmi, sim
        )
    )


if __name__ == "__main__":
    import argparse

    # Script entry point: parse the command line, let ``setup`` turn the
    # parsed options into the full configuration dict, then run ``main``.
    # NOTE(review): ``setup`` is not defined in this file; presumably it is
    # provided by one of the star imports -- verify.
    parser = argparse.ArgumentParser("HAN")
    parser.add_argument(
        "-s", "--seed", type=int, default=1, help="Random seed"
    )
    parser.add_argument(
        "-ld", "--log-dir", type=str, default="results",
        help="Dir for saving training results",
    )
    parser.add_argument(
        "--hetero", action="store_true",
        help="Use metapath coalescing with DGL's own dataset",
    )
    # Placeholder option so the script also parses when launched from IPython.
    parser.add_argument(
        "-f", "--fff", default="1",
        help="a dummy argument to fool ipython",
    )

    args = vars(parser.parse_args())
    args = setup(args)

    main(args)

Created directory results/DBLP_2023-07-11_17-10-33
	[Clustering] NMI: 0.4319 | 0.0000
Micro f1 0.7001 | Macro f1 0.7113 | NMI 0.4319 | SIM 0.7001
