### What:

Trying to get node classification with deepwalk+Label Propagation to work

### TODO:

- improve documentation/comments
- add train and test split to actually test performance for classiying
    - follow the authors code for the split. See main.py in the 'classifier' folder
        - same splits and hyperparameters
    - add code for LabelPropagation() to perform the task
- compare Perozzis embeddings with karateclub
    - make their shapes compatible

In [1]:
import numpy as np
import networkx as nx
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
import embed_utils
from sklearn.model_selection import train_test_split
from copy import deepcopy
from collections import Counter

In [2]:
import random
import networkx as nx
from typing import Dict
from karateclub.estimator import Estimator


class MyLabelPropagation(Estimator):
    r"""An implementation of `"Label Propagation Clustering" <https://arxiv.org/abs/0709.2938>`_
    from the Physical Review '07 paper "Near Linear Time Algorithm to Detect Community Structures
    in Large-Scale Networks". The tool executes a series of label propagations with unique labels.
    The final labels are used as cluster memberships.
    Args:
        seed (int): Random seed. Default is 42.
        iterations (int): Propagation iterations. Default is 100.
    """

    def __init__(self, seed: int = 42, iterations: int = 7):
        self.seed = seed
        self.iterations = iterations

    def _make_a_pick(self, neighbors):
        """
        Choosing a neighbor from a propagation source node.
        Arg types:
            * **neigbours** *(list)* - Neighbouring nodes.
        """
        scores = {}
        for neighbor in neighbors:
            neighbor_label = self._labels[neighbor]

            if neighbor_label in scores.keys():
                scores[neighbor_label] = scores[neighbor_label] + 1
            else:
                scores[neighbor_label] = 1

        top = [key for key, val in scores.items() if val == max(scores.values())]
        return random.sample(top, 1)[0]

    def _do_a_propagation(self):
        """
        Doing a propagation round.
        """
        random.shuffle(self._nodes)
        new_labels = {}
        for node in self._nodes:
            neighbors = [neb for neb in nx.neighbors(self._graph, node)]
            pick = self._make_a_pick(neighbors)
            new_labels[node] = pick
        self._labels = new_labels

    def fit(self, graph: nx.classes.graph.Graph):
        """
        Fitting a Label Propagation clustering model.
        Arg types:
            * **graph** *(NetworkX graph)* - The graph to be clustered.
        """
        self._set_seed()
        graph = self._check_graph(graph)
        self._graph = graph
        self._nodes = [node for node in self._graph.nodes()]
        self._labels = nx.get_node_attributes(self._graph, "class") # Hier
        
        random.seed(self.seed)
        for _ in range(self.iterations):
            self._do_a_propagation()

    def get_memberships(self) -> Dict[int, int]:
        r"""Getting the cluster membership of nodes.
        Return types:
            * **memberships** *(dict)* - Node cluster memberships.
        """
        memberships = self._labels
        return memberships

In [3]:
def make_k_nearest_graph(embed, original_graph, k=10):
    """"
    Construct a graph from an embedding using the k-nearest graph algorithm
    embed: embedding
    original_graph: needed for preserving node attributes
    k: nearest k neighbors to a point in the embedding space are used fix edges in the returned graph
    """
    # create new graph based on original nodes and attributes
    k_nearest_graph = nx.Graph()
    k_nearest_graph.add_nodes_from(original_graph)
    nx.set_node_attributes(k_nearest_graph, nx.get_node_attributes(original_graph, "class"),  "class")
    # find the k nearest neighbors for each node and add between the node and these neighbors
    # edges in the new graph 
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(embed)
    _, indices = nbrs.kneighbors(embed)
    k_nearest_links = [[i[0], j] for i in indices for j in i[1:]]
    k_nearest_graph.add_edges_from(k_nearest_links)
    return k_nearest_graph

In [17]:
def get_metrics(t, y, c0_nodes, c1_nodes):
    # check_metrics_params(t, y, c0_nodes, c1_nodes)

    acc = sum(t == y) / len(t)
    acc_c0 = sum(t[c0_nodes] == y[c0_nodes]) / len(c0_nodes)
    acc_c1 = sum(t[c1_nodes] == y[c1_nodes]) / len(c1_nodes)

    disparity = np.var([acc_c0, acc_c1])

    metrics = {
        "accuracy":acc,
        "disparity":disparity,
        "accuracy_c0":acc_c0,
        "accuracy_c1":acc_c1,
    }

    return metrics


def check_classification_params(nodes, labels, embeddings):
    assert nodes == list(range(len(nodes)))
    assert len(labels) == len(nodes) == len(embeddings), f"{len(labels)}, {len(nodes)}, {len(embeddings)}"


def make_classification_model(nodes, labels, embeddings):
    """
    Makes the classification model
    :param nodes: should be the nodes as list of consecutive integers
    :param labels: should be a list of labels where a value of -1
        indicates a missing label
    :param embeddings: is the embeddings of all nodes obtained using a
        (modified) random walk

    :returns: a model with a predict() function that predicts the label
        from embeddings
    """
    check_classification_params(nodes, labels, embeddings)

    g = np.mean(pairwise_distances(embeddings))
    clf = LabelPropagation(gamma = g).fit(embeddings, labels)

    return clf


def run_classification(dataset, method):

    # get graph from data
    graph = embed_utils.data2graph(dataset)

    # Get labels of test nodes to other classes
    label_dict = deepcopy(nx.get_node_attributes(graph, "class"))

    # get embedding from graph
    embeddings = embed_utils.graph2embed(graph, 
                                        "default", 
                                        method
                                        )
    
    # Split in to equal sized train and test nodes
    nodes = list(graph.nodes())
    shuffled_nodes = list(graph.nodes())
    np.random.shuffle(shuffled_nodes)
    split_idx = len(shuffled_nodes) // 2
    train_nodes, test_nodes = shuffled_nodes[:split_idx], shuffled_nodes[split_idx:]

    # Get semi-supervised labels
    semi_supervised_y = [label_dict[node] if node in train_nodes else -1 for node in nodes]

    # Train the classifier
    clf = make_classification_model(nodes, semi_supervised_y, embeddings)

    # Get test node embbeddings, labels and find accuracy on test nodes
    test_node_embeddings = [emb for i, emb in enumerate(embeddings) if i in test_nodes]
    test_node_labels = [label_dict[node] for node in test_nodes]
    pred = clf.predict(test_node_embeddings)

    # Calculate accuracy
    accuracy = sum(test_node_labels == pred) / len(test_node_labels)
    
    print(f"Total accuracy: {accuracy}")
    print(f"Counter real lables: {Counter(test_node_labels).most_common(3)}")
    print(f"Counter prediction: {Counter(pred).most_common(3)}")

    return accuracy

n = 20
accs = []
for i in range(n):
    acc = run_classification(dataset="rice", method="deepwalk")
    accs.append(acc)

np.mean(accs)

Total accuracy: 0.7668161434977578
Counter real lables: [(1, 182), (0, 41)]
Counter prediction: [(1, 208), (0, 15)]
Total accuracy: 0.4484304932735426
Counter real lables: [(1, 173), (0, 50)]
Counter prediction: [(0, 131), (1, 92)]
Total accuracy: 0.36771300448430494
Counter real lables: [(1, 179), (0, 44)]
Counter prediction: [(0, 165), (1, 58)]
Total accuracy: 0.4170403587443946
Counter real lables: [(1, 174), (0, 49)]
Counter prediction: [(0, 151), (1, 72)]
Total accuracy: 0.7847533632286996
Counter real lables: [(1, 179), (0, 44)]
Counter prediction: [(1, 211), (0, 12)]
Total accuracy: 0.6367713004484304
Counter real lables: [(1, 172), (0, 51)]
Counter prediction: [(1, 165), (0, 58)]
Total accuracy: 0.7219730941704036
Counter real lables: [(1, 169), (0, 54)]
Counter prediction: [(1, 209), (0, 14)]


  self.label_distributions_ /= normalizer


Total accuracy: 0.20179372197309417
Counter real lables: [(1, 178), (0, 45)]
Counter prediction: [(0, 223)]
Total accuracy: 0.5874439461883408
Counter real lables: [(1, 172), (0, 51)]
Counter prediction: [(1, 158), (0, 65)]
Total accuracy: 0.6412556053811659
Counter real lables: [(1, 168), (0, 55)]
Counter prediction: [(1, 188), (0, 35)]
Total accuracy: 0.6457399103139013
Counter real lables: [(1, 165), (0, 58)]
Counter prediction: [(1, 186), (0, 37)]
Total accuracy: 0.7488789237668162
Counter real lables: [(1, 173), (0, 50)]
Counter prediction: [(1, 209), (0, 14)]
Total accuracy: 0.7309417040358744
Counter real lables: [(1, 172), (0, 51)]
Counter prediction: [(1, 212), (0, 11)]


  self.label_distributions_ /= normalizer


Total accuracy: 0.24663677130044842
Counter real lables: [(1, 168), (0, 55)]
Counter prediction: [(0, 223)]
Total accuracy: 0.6771300448430493
Counter real lables: [(1, 162), (0, 61)]
Counter prediction: [(1, 198), (0, 25)]
Total accuracy: 0.34977578475336324
Counter real lables: [(1, 168), (0, 55)]
Counter prediction: [(0, 176), (1, 47)]
Total accuracy: 0.6412556053811659
Counter real lables: [(1, 170), (0, 53)]
Counter prediction: [(1, 178), (0, 45)]
Total accuracy: 0.7623318385650224
Counter real lables: [(1, 173), (0, 50)]
Counter prediction: [(1, 214), (0, 9)]
Total accuracy: 0.4304932735426009
Counter real lables: [(1, 171), (0, 52)]
Counter prediction: [(0, 159), (1, 64)]
Total accuracy: 0.7219730941704036
Counter real lables: [(1, 168), (0, 55)]
Counter prediction: [(1, 210), (0, 13)]


0.5764573991031391

In [5]:
def get_accuracy(target_nodes, predicted_attributes, target_attributes):
    correct = 0
    total = len(target_nodes)
    for node in target_nodes:
        t = predicted_attributes[node]
        y = target_attributes[node]

        if y == t and t is not None:
            correct += 1

    return correct/total

nodes_c0 = [node for node in test_nodes if original_attributes[node] == 0]
acc_c0 = get_accuracy(nodes_c0, lp.get_memberships(), original_attributes)

nodes_c1 = [node for node in test_nodes if original_attributes[node] == 1]
acc_c1 = get_accuracy(nodes_c1, lp.get_memberships(), original_attributes)

print(f"Test nodes in c=0: {len(nodes_c0)}, Test nodes in c=1: {len(nodes_c1)}")
print("Number of nodes predicted in c0 and c1:", len([n for n in lp.get_memberships()if n == 0]), len([n for n in lp.get_memberships() if n == 1]))
print("Most common labels in predictions:", Counter(list(lp.get_memberships().values())).most_common(3))
print(acc_c0*100, acc_c1*100)
np.var([acc_c0*100, acc_c1*100])

NameError: name 'test_nodes' is not defined