### What:

Trying to get node classification with deepwalk+Label Propagation to work

### TODO:

- improve documentation/comments
- add train and test split to actually test performance for classiying
    - follow the authors code for the split. See main.py in the 'classifier' folder
        - same splits and hyperparameters
    - add code for LabelPropagation() to perform the task
- compare Perozzis embeddings with karateclub
    - make their shapes compatible

In [1]:
import numpy as np
import networkx as nx
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
import embed_utils
from sklearn.model_selection import train_test_split
from copy import deepcopy
from collections import Counter

In [7]:
def get_metrics(classifier, test_nodes, embeddings, label_dict):

    test_node_embeddings = [emb for i, emb in enumerate(embeddings) if i in test_nodes]
    test_node_labels = [label_dict[node] for node in test_nodes]
    pred = classifier.predict(test_node_embeddings)
    accuracy = sum(test_node_labels == pred) / len(test_node_labels)
    
    return accuracy


def check_classification_params(nodes, labels, embeddings):
    assert nodes == list(range(len(nodes)))
    assert len(labels) == len(nodes) == len(embeddings), f"{len(labels)}, {len(nodes)}, {len(embeddings)}"


def make_classification_model(nodes, labels, embeddings):
    """
    Makes the classification model
    :param nodes: should be the nodes as list of consecutive integers
    :param labels: should be a list of labels where a value of -1
        indicates a missing label
    :param embeddings: is the embeddings of all nodes obtained using a
        (modified) random walk

    :returns: a model with a predict() function that predicts the label
        from embeddings
    """
    check_classification_params(nodes, labels, embeddings)

    g = np.mean(pairwise_distances(embeddings))
    clf = LabelPropagation(gamma = g).fit(embeddings, labels)

    return clf


def run_classification(dataset, method):

    # get graph from data
    graph = embed_utils.data2graph(dataset)

    # Get labels and attributes of test nodes to other classes
    label_dict = deepcopy(nx.get_node_attributes(graph, embed_utils.CLASS_NAME))
    attr_dict = deepcopy(nx.get_node_attributes(graph, embed_utils.SENSATTR))

    has_attr = True
    if label_dict == {}:
        label_dict = attr_dict
        has_attr = False

    # get embedding from graph
    embeddings = embed_utils.graph2embed(
                                        graph, 
                                        "crosswalk", 
                                        method
                                        )
    
    # Split in to equal sized train and test nodes
    nodes = list(graph.nodes())
    shuffled_nodes = list(graph.nodes())
    np.random.shuffle(shuffled_nodes)
    split_idx = len(shuffled_nodes) // 2
    train_nodes, test_nodes = shuffled_nodes[:split_idx], shuffled_nodes[split_idx:]

    # Get semi-supervised labels
    print(nodes)
    semi_supervised_y = [label_dict[node] if node in train_nodes else -1 for node in nodes]

    # Train the classifier
    clf = make_classification_model(nodes, semi_supervised_y, embeddings)

    # Get test node embeddings, labels and find accuracy on test nodes
    if has_attr: 
        c0_nodes = [node for node in test_nodes if attr_dict[node] == 0]
        acc_c0 = get_metrics(clf, c0_nodes, embeddings, label_dict) * 100

        c1_nodes = [node for node in test_nodes if attr_dict[node] == 1]
        acc_c1 = get_metrics(clf, c1_nodes, embeddings, label_dict) * 100

    accuracy = get_metrics(clf, test_nodes, embeddings, label_dict) * 100
    
    if has_attr:
        print(f"Accuracy c0: {acc_c0}")
        print(f"Accuracy c1: {acc_c1}")
        print(f"Disparity: {np.var([acc_c0, acc_c1])}") 
        print()

    print(f"Total accuracy: {accuracy}")
    print()
    print(f"Counter training lables: {Counter([label_dict[node] for node in train_nodes]).most_common(3)}")
    print(f"Counter real lables: {Counter([label_dict[node] for node in test_nodes]).most_common(3)}")
    print(
        f"Counter prediction: {Counter(clf.predict([emb for i, emb in enumerate(embeddings) if i in test_nodes])).most_common(3)}"
        )
    print()

    return accuracy

n = 1
accs = []
for i in range(n):
    acc = run_classification(dataset="rice", method="node2vec")
    accs.append(acc)

np.mean(accs)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

KeyError: 0