In [22]:
import random
import numpy as np
import torch
import itertools
import os
import pandas as pd
import scipy.io as sio

import dgl
from dgl.data import citation_graph, rdf, knowledge_graph
from dgl.utils import extract_node_subframes, set_new_frames
import dgl.function as fn
from dgl.data import DGLDataset
from dgl.data.utils import _get_dgl_url, download, save_graphs, load_graphs, \
    generate_mask_tensor, idx2mask
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score, normalized_mutual_info_score, adjusted_rand_score
from sklearn.feature_extraction.text import CountVectorizer






### UTILS FUNCTIONS

In [23]:
def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (plus the CUDA generator via
    ``torch.cuda.manual_seed`` when CUDA is available) and DGL.

    :param seed: int seed value handed to each library
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    dgl.seed(seed)
    
    
def accuracy(logits, labels):
    """Fraction of samples whose highest-scoring class matches the label.

    :param logits: tensor(N, C) prediction scores, N samples and C categories
    :param labels: tensor(N) correct labels
    :return: float accuracy
    """
    predicted = logits.argmax(dim=1)
    num_correct = (predicted == labels).sum().item()
    return num_correct * 1.0 / len(labels)


def micro_macro_f1_score(logits, labels):
    """Calculate Micro-F1 and Macro-F1 scores.

    The predicted class of each sample is the argmax over the category axis.

    :param logits: tensor(N, C) prediction scores, N is the number of samples, C is the number of categories
    :param labels: tensor(N) correct labels
    :return: (micro_f1, macro_f1) as computed by ``f1_score`` with
        ``average='micro'`` and ``average='macro'`` respectively
    """
    # detach + cpu make the conversion to NumPy work for tensors that live on
    # a GPU or are still attached to the autograd graph; both are no-ops for
    # plain CPU tensors.
    prediction = torch.argmax(logits, dim=1).long().detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')
    return micro_f1, macro_f1


def split_idx(samples, train_size, val_size, random_state=None):
    """Split samples into training, validation and test subsets.

    When the sizes are given as floats they are fractions of ALL samples and
    must satisfy:
    * 0 < train_size < 1
    * 0 < val_size < 1
    * train_size + val_size < 1

    A non-float ``val_size`` is forwarded to ``train_test_split`` unchanged.

    :param samples: sequence of samples (or indices) to split
    :param train_size: size of the training subset
    :param val_size: size of the validation subset
    :param random_state: forwarded to both ``train_test_split`` calls
    :return: (train, val, test)
    """
    train_part, remainder = train_test_split(
        samples, train_size=train_size, random_state=random_state
    )
    if isinstance(val_size, float):
        # val_size is relative to the full sample set; rescale it so that it
        # is relative to what is left after removing the training part.
        val_size = val_size * (len(samples) / len(remainder))
    val_part, test_part = train_test_split(
        remainder, train_size=val_size, random_state=random_state
    )
    return train_part, val_part, test_part

### GET THE ACM DATASET AND CONVERT TO A DGL GRAPH

`DGLDataset` is the base class for creating graph datasets in DGL; it defines a template that dataset classes follow. The following steps are executed automatically:

1. Check whether there is a dataset cache on disk (already processed and stored on the disk) by invoking `has_cache()`. If true, go to step 5.

2. Call `download()` to download the data if `url` is not None.

3. Call `process()` to process the data.

4. Call `save()` to save the processed dataset on disk and go to step 6.

5. Call `load()` to load the processed dataset from disk.

6. Done.

In [24]:
class ACMDataset(DGLDataset):
    """ACM dataset, only one heterogeneous graph

    Statistical data
    -----
    * Nodes: 17351 author, 4025 paper, 72 field
    * Edges: 13407 paper-author, 4025 paper-field
    * Number of categories: 3
    * paper vertex division: 808 train, 401 valid, 2816 test

    Attributes
    -----
    * num_classes: number of classes
    * metapaths: metapaths to use
    * predict_ntype: predict vertex type

    paper vertex attribute
    -----
    * feat: tensor(4025, 1903) bag-of-words representation of keywords
    * label: tensor(4025)
    * train_mask, val_mask, test_mask: tensor(4025)

    author vertex attribute
    -----
    * feat: tensor(17351, 1903) average of associated paper features

    field vertex attribute
    -----
    * feat: tensor(72, 72) one-hot encoding
    """

    def __init__(self):
        # Get DGL online url for download.
        super().__init__('ACM', _get_dgl_url('dataset/ACM.mat'))

    def download(self):
        """Download the raw ``ACM.mat`` file unless it is already in ``raw_dir``."""
        file_path = os.path.join(self.raw_dir, 'ACM.mat')
        if not os.path.exists(file_path):
            download(self.url, path=file_path)

    def save(self):
        """Store the processed graph in ``save_path``."""
        save_graphs(os.path.join(self.save_path, self.name + '_dgl_graph.bin'), [self.g])

    def load(self):
        """Load the processed graph from ``save_path`` and restore boolean masks."""
        graphs, _ = load_graphs(os.path.join(self.save_path, self.name + '_dgl_graph.bin'))
        self.g = graphs[0]
        # Cast the split masks back to bool after loading from disk.
        for k in ('train_mask', 'val_mask', 'test_mask'):
            self.g.nodes['paper'].data[k] = self.g.nodes['paper'].data[k].bool()

    def process(self):
        """Build the heterogeneous graph, node features, labels and split masks."""
        data = sio.loadmat(os.path.join(self.raw_dir, 'ACM.mat'))
        p_vs_l = data['PvsL']  # paper-field
        p_vs_a = data['PvsA']  # paper-author
        p_vs_t = data['PvsT']  # paper-term, bag of words
        p_vs_c = data['PvsC']  # paper-conference, labels come from that

        # We assign
        # (1) KDD papers as class 0 (data mining),
        # (2) SIGMOD and VLDB papers as class 1 (database),
        # (3) SIGCOMM and MobiCOMM papers as class 2 (communication)
        conf_ids = [0, 1, 9, 10, 13]
        label_ids = [0, 1, 2, 2, 1]

        p_vs_c_filter = p_vs_c[:, conf_ids]
        # Keep only the papers published in at least one of the selected conferences.
        p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
        p_vs_l = p_vs_l[p_selected]
        p_vs_a = p_vs_a[p_selected]
        p_vs_t = p_vs_t[p_selected]
        p_vs_c = p_vs_c[p_selected]

        # Build the graph; every relation is stored in both directions.
        self.g = dgl.heterograph({
            ('paper', 'pa', 'author'): p_vs_a.nonzero(),
            ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
            ('paper', 'pf', 'field'): p_vs_l.nonzero(),
            ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
        })
        # The features of a paper are the bag of words associated with it.
        paper_features = torch.FloatTensor(p_vs_t.toarray())  # (4025, 1903)

        # pc_p[i] / pc_c[i] are the paper id and conference id of the i-th
        # non-zero entry of the paper-conference matrix.
        pc_p, pc_c = p_vs_c.nonzero()
        paper_labels = np.zeros(len(p_selected), dtype=np.int64)
        for conf_id, label_id in zip(conf_ids, label_ids):
            paper_labels[pc_p[pc_c == conf_id]] = label_id
        paper_labels = torch.from_numpy(paper_labels)

        # Per-conference random split: every paper of a conference receives a
        # distinct value in [0, 1]; thresholds then give ~20% / 10% / 70%.
        # float_mask is indexed by PAPER id (via pc_p), because the indices
        # derived from it below are used as paper node ids. Indexing it by the
        # position of the entry in (pc_p, pc_c) is only equivalent when the
        # entries happen to be listed in paper order.
        # NOTE: the permutation uses the global NumPy RNG, so the split depends
        # on the RNG state at the time the dataset is processed.
        float_mask = np.zeros(len(p_selected))
        for conf_id in conf_ids:
            conf_papers = pc_p[pc_c == conf_id]
            float_mask[conf_papers] = np.random.permutation(
                np.linspace(0, 1, len(conf_papers))
            )
        train_idx = np.where(float_mask <= 0.2)[0]
        val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
        test_idx = np.where(float_mask > 0.3)[0]

        num_paper_nodes = self.g.num_nodes('paper')
        train_mask = generate_mask_tensor(idx2mask(train_idx, num_paper_nodes))
        val_mask = generate_mask_tensor(idx2mask(val_idx, num_paper_nodes))
        test_mask = generate_mask_tensor(idx2mask(test_idx, num_paper_nodes))

        self.g.nodes['paper'].data['feat'] = paper_features
        self.g.nodes['paper'].data['label'] = paper_labels
        self.g.nodes['paper'].data['train_mask'] = train_mask
        self.g.nodes['paper'].data['val_mask'] = val_mask
        self.g.nodes['paper'].data['test_mask'] = test_mask
        # The feature of an author vertex is the average of the features of its associated paper vertices
        self.g.multi_update_all({'pa': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))}, 'sum')
        self.g.nodes['field'].data['feat'] = torch.eye(self.g.num_nodes('field'))

    def has_cache(self):
        """Return True when the processed graph file already exists on disk."""
        return os.path.exists(os.path.join(self.save_path, self.name + '_dgl_graph.bin'))

    def __getitem__(self, idx):
        if idx != 0:
            raise IndexError('This dataset has only one graph')
        return self.g

    def __len__(self):
        return 1

    @property
    def num_classes(self):
        return 3

    @property
    def metapaths(self):
        return [['pa', 'ap'], ['pf', 'fp']]

    @property
    def predict_ntype(self):
        return 'paper'


#### IMDBDataset

In [25]:
import pandas as pd

# Exploratory preview of the raw IMDb metadata: rows lacking a first actor or
# a director are dropped and the index is renumbered.
df = (
    pd.read_csv(
        'https://raw.githubusercontent.com/Jhy1993/HAN/master/data/imdb/movie_metadata.csv',
        encoding='utf8',
    )
    .dropna(axis=0, subset=['actor_1_name', 'director_name'])
    .reset_index(drop=True)
)

print(df)

print(df.describe())

      color      director_name  num_critic_for_reviews  duration  \
0     Color      James Cameron                   723.0     178.0   
1     Color     Gore Verbinski                   302.0     169.0   
2     Color         Sam Mendes                   602.0     148.0   
3     Color  Christopher Nolan                   813.0     164.0   
4       NaN        Doug Walker                     NaN       NaN   
...     ...                ...                     ...       ...   
4927  Color       Edward Burns                    14.0      95.0   
4928  Color        Scott Smith                     1.0      87.0   
4929  Color   Benjamin Roberds                    13.0      76.0   
4930  Color        Daniel Hsia                    14.0     100.0   
4931  Color           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes        actor_2_name  \
0                         0.0                   855.0    Joel David Moore   
1                       563.0

In [26]:
class IMDbDataset(DGLDataset):
    """IMDb movie dataset, only one heterogeneous graph

    Statistical data
    -----
    * Nodes: 4278 movies, 5257 actors, 2081 directors
    * Edges: 12828 movie-actor, 4278 movie-director
    * Number of categories: 3
    * Movie vertex division: 400 train, 400 valid, 3478 test

    Attributes
    -----
    * num_classes: number of classes
    * metapaths: metapaths to use
    * predict_ntype: predict vertex type

    movie vertex attribute
    -----
    * feat: tensor(4278, 1299) bag-of-words representation of plot keywords
    * label: tensor(4278) 0: Action, 1: Comedy, 2: Drama
    * train_mask, val_mask, test_mask: tensor(4278)

    actor vertex attributes
    -----
    * feat: tensor(5257, 1299) average of associated movie features

    director vertex attribute
    -----
    * feat: tensor(2081, 1299) average of associated movie features
    """
    _url = 'https://raw.githubusercontent.com/Jhy1993/HAN/master/data/imdb/movie_metadata.csv'
    # Seed used for the train/val/test split so that it is reproducible.
    _seed = 42

    def __init__(self):
        super().__init__('imdb', self._url)

    def download(self):
        """Download the raw CSV as ``imdb.csv`` unless it is already in ``raw_dir``."""
        file_path = os.path.join(self.raw_dir, 'imdb.csv')
        if not os.path.exists(file_path):
            download(self.url, path=file_path)

    def save(self):
        """Store the processed graph in ``save_path``."""
        save_graphs(os.path.join(self.save_path, self.name + '_dgl_graph.bin'), [self.g])

    def load(self):
        """Load the processed graph from ``save_path`` and restore boolean masks."""
        graphs, _ = load_graphs(os.path.join(self.save_path, self.name + '_dgl_graph.bin'))
        self.g = graphs[0]
        # Cast the split masks back to bool after loading from disk.
        for k in ('train_mask', 'val_mask', 'test_mask'):
            self.g.nodes['movie'].data[k] = self.g.nodes['movie'].data[k].bool()

    def process(self):
        """Read the CSV, label the movies, build the graph and attach node data.

        :return: the filtered DataFrame (also kept as ``self.data``)
        """
        self.data = pd.read_csv(os.path.join(self.raw_dir, 'imdb.csv'), encoding='utf8') \
            .dropna(axis=0, subset=['actor_1_name', 'director_name']).reset_index(drop=True)
        # _extract_labels also removes unlabeled movies from self.data, so it
        # must run before the name lists below are derived.
        self.labels = self._extract_labels()
        self.movies = list(sorted(m.strip() for m in self.data['movie_title']))
        self.directors = list(sorted(set(self.data['director_name'])))
        self.actors = list(sorted(set(itertools.chain.from_iterable(
            self.data[c].dropna().to_list()
            for c in ('actor_1_name', 'actor_2_name', 'actor_3_name')
        ))))
        self.g = self._build_graph()
        self._add_ndata()
        return self.data

    def _extract_labels(self):
        """Label each movie by the first Action/Comedy/Drama genre it lists.

        Movies with none of these genres are dropped from ``self.data``.

        :return: ndarray of labels aligned with the filtered ``self.data``
        """
        genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}
        labels = np.full(len(self.data), -1)
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent that exists in every pandas version.
        for i, genres in self.data['genres'].items():
            for genre in genres.split('|'):
                if genre in genre_to_label:
                    labels[i] = genre_to_label[genre]
                    break
        other_idx = np.where(labels == -1)[0]
        self.data = self.data.drop(other_idx).reset_index(drop=True)
        return np.delete(labels, other_idx, axis=0)

    def _build_graph(self):
        """Create the movie-actor / movie-director heterograph (both directions)."""
        # Name -> node id lookups; ids are positions in the sorted, de-duplicated
        # name lists, exactly what list.index() would return, but in O(1).
        director_id = {name: i for i, name in enumerate(self.directors)}
        actor_id = {name: i for i, name in enumerate(self.actors)}

        ma, md = set(), set()
        for m, row in self.data.iterrows():
            md.add((m, director_id[row['director_name']]))
            for c in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
                # Missing actor slots (NaN) are not in the lookup and are skipped.
                a = actor_id.get(row[c])
                if a is not None:
                    ma.add((m, a))
        ma, md = list(ma), list(md)
        ma_m, ma_a = [e[0] for e in ma], [e[1] for e in ma]
        md_m, md_d = [e[0] for e in md], [e[1] for e in md]
        return dgl.heterograph({
            ('movie', 'ma', 'actor'): (ma_m, ma_a),
            ('actor', 'am', 'movie'): (ma_a, ma_m),
            ('movie', 'md', 'director'): (md_m, md_d),
            ('director', 'dm', 'movie'): (md_d, md_m)
        })

    def _add_ndata(self):
        """Attach features, labels and split masks to the graph nodes."""
        vectorizer = CountVectorizer(min_df=5)
        features = vectorizer.fit_transform(self.data['plot_keywords'].fillna('').values)
        self.g.nodes['movie'].data['feat'] = torch.from_numpy(features.toarray()).float()
        self.g.nodes['movie'].data['label'] = torch.from_numpy(self.labels).long()

        # Actor and director vertex features are the average of their associated movie vertex features
        self.g.multi_update_all({
            'ma': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat')),
            'md': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))
        }, 'sum')

        n_movies = len(self.movies)
        # Integer sizes: 400 training and 400 validation movies, the rest is test.
        train_idx, val_idx, test_idx = split_idx(np.arange(n_movies), 400, 400, self._seed)
        self.g.nodes['movie'].data['train_mask'] = generate_mask_tensor(idx2mask(train_idx, n_movies))
        self.g.nodes['movie'].data['val_mask'] = generate_mask_tensor(idx2mask(val_idx, n_movies))
        self.g.nodes['movie'].data['test_mask'] = generate_mask_tensor(idx2mask(test_idx, n_movies))

    def has_cache(self):
        """Return True when the processed graph file already exists on disk."""
        return os.path.exists(os.path.join(self.save_path, self.name + '_dgl_graph.bin'))

    def __getitem__(self, idx):
        if idx != 0:
            raise IndexError('This dataset has only one graph')
        return self.g

    def __len__(self):
        return 1

    @property
    def num_classes(self):
        return 3

    @property
    def metapaths(self):
        return [['ma', 'am'], ['md', 'dm']]

    @property
    def predict_ntype(self):
        return 'movie'

In [27]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


"""Heterogeneous Graph Transformer (HGT)
Paper link: https://arxiv.org/pdf/2003.01332
"""
import math

import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import HeteroGraphConv
from dgl.ops import edge_softmax
from dgl.utils import expand_as_pair


class HGTAttention(nn.Module):
    """Multi-head HGT attention for a single relation.

    All projection modules and parameters are created by the caller and passed
    in, so they can be shared between several attention modules.

    :param out_dim: int output feature dimension
    :param num_heads: int number of attention heads K
    :param k_linear: nn.Linear(d_in, d_out) key projection of source features
    :param q_linear: nn.Linear(d_in, d_out) query projection of destination features
    :param v_linear: nn.Linear(d_in, d_out) value projection of source features
    :param w_att: tensor(K, d_out/K, d_out/K) per-head transform applied to the keys
    :param w_msg: tensor(K, d_out/K, d_out/K) per-head transform applied to the values
    :param mu: per-head scaling factor multiplied into the attention scores
    """

    def __init__(self, out_dim, num_heads, k_linear, q_linear, v_linear, w_att, w_msg, mu):
        super().__init__()
        self.out_dim, self.num_heads = out_dim, num_heads
        self.d_k = out_dim // num_heads
        self.k_linear, self.q_linear, self.v_linear = k_linear, q_linear, v_linear
        self.w_att, self.w_msg = w_att, w_msg
        self.mu = mu

    def forward(self, g, feat):
        """
        :param g: DGLGraph bipartite graph (contains only one relation)
        :param feat: tensor(N_src, d_in) or (tensor(N_src, d_in), tensor(N_dst, d_in)) input feature
        :return: tensor(N_dst, d_out) representation of the destination vertices under this relation
        """
        # local_scope: the temporary k/v/q/t/h fields vanish when the block exits.
        with g.local_scope():
            feat_src, feat_dst = expand_as_pair(feat, g)
            # (N, d_in) -> (N, d_out) -> (N, K, d_out/K)
            per_head = (-1, self.num_heads, self.d_k)
            keys = self.k_linear(feat_src).view(*per_head)
            values = self.v_linear(feat_src).view(*per_head)
            queries = self.q_linear(feat_dst).view(*per_head)

            # Per-head matrix product: x[n, h, j] = sum_i x[n, h, i] * w[h, i, j]
            keys = torch.einsum('nhi,hij->nhj', keys, self.w_att)
            values = torch.einsum('nhi,hij->nhj', values, self.w_msg)

            g.srcdata.update({'k': keys, 'v': values})
            g.dstdata['q'] = queries
            # Dot product of destination query and source key on every edge: (E, K, 1)
            g.apply_edges(fn.v_dot_u('q', 'k', 't'))
            scores = g.edata.pop('t').squeeze(dim=-1) * self.mu / math.sqrt(self.d_k)
            # Normalise over the incoming edges of each destination vertex: (E, K) -> (E, K, 1)
            g.edata['t'] = edge_softmax(g, scores).unsqueeze(dim=-1)

            # Attention-weighted sum of the source values.
            g.update_all(fn.u_mul_e('v', 't', 'm'), fn.sum('m', 'h'))
            return g.dstdata['h'].view(-1, self.out_dim)  # (N_dst, d_out)


class HGTLayer(nn.Module):

    def __init__(self, in_dim, out_dim, num_heads, ntypes, etypes, dropout=0.2, use_norm=True):
        """HGT layer

        :param in_dim: int input feature dimension
        :param out_dim: int output feature dimension
        :param num_heads: int Number of attention heads K
        :param ntypes: List[str] list of vertex types
        :param etypes: List[(str, str, str)] list of canonical edge types
        :param dropout: float, optional Dropout probability, default is 0.2
        :param use_norm: bool, optional whether to use layer normalization, the default is True
        """
        super().__init__()
        d_k = out_dim // num_heads
        # One K/Q/V projection per vertex type and one (w_att, w_msg, mu) triple
        # per relation. They are kept in plain dicts here and handed to the
        # HGTAttention modules below, which hold them as attributes.
        k_linear = {ntype: nn.Linear(in_dim, out_dim) for ntype in ntypes}
        q_linear = {ntype: nn.Linear(in_dim, out_dim) for ntype in ntypes}
        v_linear = {ntype: nn.Linear(in_dim, out_dim) for ntype in ntypes}
        w_att = {r[1]: nn.Parameter(torch.Tensor(num_heads, d_k, d_k)) for r in etypes}
        w_msg = {r[1]: nn.Parameter(torch.Tensor(num_heads, d_k, d_k)) for r in etypes}
        mu = {r[1]: nn.Parameter(torch.ones(num_heads)) for r in etypes}
        self.reset_parameters(w_att, w_msg)
        self.conv = HeteroGraphConv({
            etype: HGTAttention(
                out_dim, num_heads, k_linear[stype], q_linear[dtype], v_linear[stype],
                w_att[etype], w_msg[etype], mu[etype]
            ) for stype, etype, dtype in etypes
        }, 'mean')

        self.a_linear = nn.ModuleDict({ntype: nn.Linear(out_dim, out_dim) for ntype in ntypes})
        self.skip = nn.ParameterDict({ntype: nn.Parameter(torch.ones(1)) for ntype in ntypes})
        self.drop = nn.Dropout(dropout)

        self.use_norm = use_norm
        if use_norm:
            self.norms = nn.ModuleDict({ntype: nn.LayerNorm(out_dim) for ntype in ntypes})

    def reset_parameters(self, w_att, w_msg):
        """Xavier-initialise the per-relation attention and message matrices.

        :param w_att: Dict[str, nn.Parameter] attention matrices keyed by edge type
        :param w_msg: Dict[str, nn.Parameter] message matrices keyed by edge type
        """
        for etype in w_att:
            nn.init.xavier_uniform_(w_att[etype])
            nn.init.xavier_uniform_(w_msg[etype])

    def forward(self, g, feats):
        """
        :param g: DGLGraph heterogeneous graph
        :param feats: Dict[str, tensor(N_i, d_in)] mapping of vertex types to input vertex features
        :return: Dict[str, tensor(N_i, d_out)] mapping of vertex types to output features
        """
        if g.is_block:
            # In a block the destination vertices are the first rows of the
            # source features.
            feats_dst = {ntype: feats[ntype][:g.num_dst_nodes(ntype)] for ntype in feats}
        else:
            feats_dst = feats
        with g.local_scope():
            # STEP 1 --> Heterogeneous Mutual Attention + Heterogeneous Messaging + Goal-Related Aggregation
            # Destination features must be the sliced ones: passing a tuple
            # disables HeteroGraphConv's own slicing, so (feats, feats) would
            # feed source-sized query features to a block. For a full graph
            # feats_dst is feats, so nothing changes there.
            hs = self.conv(g, (feats, feats_dst))  # {ntype: tensor(N_i, d_out)}

            # Residual connections (the sum below needs in_dim == out_dim)
            out_feats = {}
            for ntype in g.dsttypes:
                if g.num_dst_nodes(ntype) == 0:
                    continue
                # Learnable gate in (0, 1) between the transformed message and the input.
                alpha = torch.sigmoid(self.skip[ntype])
                trans_out = self.drop(self.a_linear[ntype](hs[ntype]))
                out = alpha * trans_out + (1 - alpha) * feats_dst[ntype]
                out_feats[ntype] = self.norms[ntype](out) if self.use_norm else out
            return out_feats


class HGT(nn.Module):

    def __init__(
            self, in_dims, hidden_dim, out_dim, num_heads, ntypes, etypes,
            predict_ntype, num_layers, dropout=0.2, use_norm=True):
        """HGT model: per-type input projection, stacked HGT layers, linear classifier.

        :param in_dims: Dict[str, int] mapping of vertex types to input feature dimensions
        :param hidden_dim: int hidden feature dimension
        :param out_dim: int output feature dimension
        :param num_heads: int number of attention heads K
        :param ntypes: List[str] list of vertex types
        :param etypes: List[(str, str, str)] list of canonical edge types
        :param predict_ntype: str the type of vertex to be predicted
        :param num_layers: int number of HGT layers
        :param dropout: float, optional dropout probability, default is 0.2
        :param use_norm: bool, optional whether to use layer normalization, default is True
        """
        super().__init__()
        self.predict_ntype = predict_ntype

        # Bring every vertex type's features to the shared hidden dimension.
        input_projections = {}
        for ntype, in_dim in in_dims.items():
            input_projections[ntype] = nn.Linear(in_dim, hidden_dim)
        self.adapt_fcs = nn.ModuleDict(input_projections)

        # Stack of HGT layers, all operating in the hidden dimension.
        stacked_layers = []
        for _ in range(num_layers):
            stacked_layers.append(
                HGTLayer(hidden_dim, hidden_dim, num_heads, ntypes, etypes, dropout, use_norm)
            )
        self.layers = nn.ModuleList(stacked_layers)

        self.predict = nn.Linear(hidden_dim, out_dim)

    def forward(self, g, feats):
        """
        :param g: DGLGraph heterogeneous graph
        :param feats: Dict[str, tensor(N_i, d_in)] mapping of vertex types to input vertex features
        :return: tensor(N_i, d_out) output of the classifier for the vertex type to be predicted
        """
        hs = {}
        for ntype in feats:
            hs[ntype] = F.gelu(self.adapt_fcs[ntype](feats[ntype]))
        for layer in self.layers:
            hs = layer(g, hs)  # {ntype: tensor(N_i, d_hid)}
        return self.predict(hs[self.predict_ntype])  # tensor(N_i, d_out)


# In[ ]:

In [29]:
import argparse
import warnings
import import_ipynb
import torch
import torch.nn.functional as F
import torch.optim as optim
 
# Registry of the available datasets, keyed by short name. Both datasets are
# instantiated as soon as this cell runs.
DATASET = dict(
    acm=ACMDataset(),
    imdb=IMDbDataset(),
)


def train():
    """Train an HGT node classifier on the ACM graph and print its F1 scores.

    Runs 20 full-graph epochs, printing the loss and the train/val/test
    Micro-/Macro-F1 after each one, then prints the final test scores and the
    predicted classes of the test vertices.
    """
    set_random_seed(1)
    data = DATASET['acm']
    g = data[0]
    # node type to predict
    predict_ntype = data.predict_ntype
    # dictionary mapping each node type to its feature tensor
    features = {ntype: g.nodes[ntype].data['feat'] for ntype in g.ntypes}
    # labels and train/val/test masks of the predicted node type
    labels = g.nodes[predict_ntype].data['label']
    train_mask = g.nodes[predict_ntype].data['train_mask']
    val_mask = g.nodes[predict_ntype].data['val_mask']
    test_mask = g.nodes[predict_ntype].data['test_mask']

    # HGT model: hidden size 256, 8 heads, 2 layers, dropout 0.5
    model = HGT(
        {ntype: g.nodes[ntype].data['feat'].shape[1] for ntype in g.ntypes},
        256, data.num_classes, 8, g.ntypes, g.canonical_etypes,
        predict_ntype, 2, 0.5
    )
    optimizer = optim.AdamW(model.parameters())
    # NOTE(review): the schedule is sized for 30 steps but the loop below only
    # takes 20, so the one-cycle schedule is never completed — confirm intended.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, 1e-3, total_steps=30)
    metrics = 'Epoch {:d} | Train Loss {:.4f} | Train Micro-F1 {:.4f} | Train Macro-F1 {:.4f}' \
              ' | Val Micro-F1 {:.4f} | Val Macro-F1 {:.4f}' \
              ' | Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'
    warnings.filterwarnings('ignore', 'Setting attributes on ParameterDict is not supported')

    for epoch in range(20):
        model.train()
        # forward propagation using all nodes
        logits = model(g, features)
        # loss on the training vertices only
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])
        # backward propagation
        optimizer.zero_grad()
        loss.backward()
        # clip the total gradient norm (all parameters viewed as one vector) to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_scores = micro_macro_f1_score(logits[train_mask], labels[train_mask])
        val_scores = evaluate(model, g, features, labels, val_mask, micro_macro_f1_score)
        test_scores = evaluate(model, g, features, labels, test_mask, micro_macro_f1_score)
        print(metrics.format(epoch, loss.item(), *train_scores, *val_scores, *test_scores))
    test_scores = evaluate(model, g, features, labels, test_mask, micro_macro_f1_score)
    print('Test Micro-F1 {:.4f} | Test Macro-F1 {:.4f}'.format(*test_scores))

    # Recompute the logits with the final weights in eval mode. The `logits`
    # left over from the loop were produced with dropout active and before the
    # last optimizer step, so they do not match the scores reported above.
    model.eval()
    with torch.no_grad():
        final_logits = model(g, features)
    pred = final_logits[test_mask].argmax(1)
    print('la predizione è :')
    print(pred)
   
    


@torch.no_grad()
def evaluate(model, g, features, labels, mask, score):
    """Score the model on the masked vertices without tracking gradients.

    Switches the model to eval mode (it is not switched back afterwards).

    :param model: module called as ``model(g, features)``
    :param g: graph passed through to the model
    :param features: input features passed through to the model
    :param labels: tensor of ground-truth labels
    :param mask: index/mask selecting the vertices to score
    :param score: callable ``score(logits, labels)``; its result is returned
    :return: whatever ``score`` returns for the selected vertices
    """
    model.eval()
    masked_logits = model(g, features)[mask]
    masked_labels = labels[mask]
    return score(masked_logits, masked_labels)


def main():
    """Entry point: run the training routine."""
    train()


# Start training when this code is executed as the main program.
if __name__ == '__main__':
    main()

Epoch 0 | Train Loss 1.2534 | Train Micro-F1 0.2748 | Train Macro-F1 0.2160 | Val Micro-F1 0.3092 | Val Macro-F1 0.2302 | Test Micro-F1 0.2887 | Test Macro-F1 0.2123
Epoch 1 | Train Loss 1.1769 | Train Micro-F1 0.3403 | Train Macro-F1 0.2744 | Val Micro-F1 0.4439 | Val Macro-F1 0.3341 | Test Micro-F1 0.4059 | Test Macro-F1 0.2992
Epoch 2 | Train Loss 1.1105 | Train Micro-F1 0.4022 | Train Macro-F1 0.3232 | Val Micro-F1 0.5037 | Val Macro-F1 0.2656 | Test Micro-F1 0.4979 | Test Macro-F1 0.2525
Epoch 3 | Train Loss 1.0201 | Train Micro-F1 0.4963 | Train Macro-F1 0.3581 | Val Micro-F1 0.5112 | Val Macro-F1 0.2776 | Test Micro-F1 0.5107 | Test Macro-F1 0.2647
Epoch 4 | Train Loss 0.9565 | Train Micro-F1 0.5644 | Train Macro-F1 0.4157 | Val Micro-F1 0.5686 | Val Macro-F1 0.3904 | Test Micro-F1 0.5710 | Test Macro-F1 0.3885
Epoch 5 | Train Loss 0.8944 | Train Micro-F1 0.5804 | Train Macro-F1 0.4441 | Val Micro-F1 0.6384 | Val Macro-F1 0.5366 | Test Micro-F1 0.6584 | Test Macro-F1 0.5621
Epoc