### What:

Trying to get node classification with deepwalk+Label Propagation to work

### TODO:

- improve documentation/comments
- add train and test split to actually test performance for classiying
    - follow the authors code for the split. See main.py in the 'classifier' folder
        - same splits and hyperparameters
    - add code for LabelPropagation() to perform the task
- compare Perozzis embeddings with karateclub
    - make their shapes compatible

In [1]:
import numpy as np
import networkx as nx
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
import embed_utils
from sklearn.model_selection import train_test_split
from copy import deepcopy
from collections import Counter

In [40]:
def get_metrics(classifier, test_nodes, embeddings, label_dict):

    test_node_embeddings = [emb for i, emb in enumerate(embeddings) if i in test_nodes]
    test_node_labels = [label_dict[node] for node in test_nodes]
    pred = classifier.predict(test_node_embeddings)
    accuracy = sum(test_node_labels == pred) / len(test_node_labels)
    
    return accuracy


def check_classification_params(nodes, labels, embeddings):
    assert nodes == list(range(len(nodes)))
    assert len(labels) == len(nodes) == len(embeddings), f"{len(labels)}, {len(nodes)}, {len(embeddings)}"


def make_classification_model(nodes, labels, embeddings):
    """
    Makes the classification model
    :param nodes: should be the nodes as list of consecutive integers
    :param labels: should be a list of labels where a value of -1
        indicates a missing label
    :param embeddings: is the embeddings of all nodes obtained using a
        (modified) random walk

    :returns: a model with a predict() function that predicts the label
        from embeddings
    """
    check_classification_params(nodes, labels, embeddings)

    g = np.mean(pairwise_distances(embeddings))
    clf = LabelPropagation(gamma = g).fit(embeddings, labels)

    return clf


def run_classification(dataset, method):

    # get graph from data
    graph = embed_utils.data2graph(dataset)

    # Get labels and attributes of test nodes to other classes
    label_dict = deepcopy(nx.get_node_attributes(graph, embed_utils.CLASS_NAME))
    attr_dict = deepcopy(nx.get_node_attributes(graph, embed_utils.SENSATTR))

    # get embedding from graph
    embeddings = embed_utils.graph2embed(
                                        graph, 
                                        "crosswalk", 
                                        method
                                        )
    
    # Split in to equal sized train and test nodes
    nodes = list(graph.nodes())
    shuffled_nodes = list(graph.nodes())
    np.random.shuffle(shuffled_nodes)
    split_idx = len(shuffled_nodes) // 2
    train_nodes, test_nodes = shuffled_nodes[:split_idx], shuffled_nodes[split_idx:]

    # Get semi-supervised labels
    semi_supervised_y = [label_dict[node] if node in train_nodes else -1 for node in nodes]

    # Train the classifier
    clf = make_classification_model(nodes, semi_supervised_y, embeddings)

    # Get test node embeddings, labels and find accuracy on test nodes
    c0_nodes = [node for node in test_nodes if attr_dict[node] == 0]
    acc_c0 = get_metrics(clf, c0_nodes, embeddings, label_dict)

    c1_nodes = [node for node in test_nodes if attr_dict[node] == 1]
    acc_c1 = get_metrics(clf, c1_nodes, embeddings, label_dict)

    accuracy = get_metrics(clf, test_nodes, embeddings, label_dict)
    
    print(f"Accuracy c0: {acc_c0}")
    print(f"Accuracy c1: {acc_c1}")
    print(f"Total accuracy: {accuracy}")
    print(f"Disparity: {np.var([acc_c0, acc_c1])}")
    print()
    print(f"Counter training lables: {Counter([label_dict[node] for node in train_nodes]).most_common(3)}")
    print(f"Counter real lables: {Counter([label_dict[node] for node in test_nodes]).most_common(3)}")
    print(
        f"Counter prediction: {Counter(clf.predict([emb for i, emb in enumerate(embeddings) if i in test_nodes])).most_common(3)}"
        )
    print()

    return accuracy

n = 1
accs = []
for i in range(n):
    acc = run_classification(dataset="stanford", method="node2vec")
    accs.append(acc)

np.mean(accs)

Accuracy c0: 0.509090909090909
Accuracy c1: 0.5573770491803278
Total accuracy: 0.49122807017543857
Disparity: 0.0005828878311837439

Counter training lables: [(1, 91), (0, 79)]
Counter real lables: [(1, 94), (0, 77)]
Counter prediction: [(1, 95), (0, 76)]



0.49122807017543857

In [None]:
def get_accuracy(target_nodes, predicted_attributes, target_attributes):
    correct = 0
    total = len(target_nodes)
    for node in target_nodes:
        t = predicted_attributes[node]
        y = target_attributes[node]

        if y == t and t is not None:
            correct += 1

    return correct/total

nodes_c0 = [node for node in test_nodes if original_attributes[node] == 0]
acc_c0 = get_accuracy(nodes_c0, lp.get_memberships(), original_attributes)

nodes_c1 = [node for node in test_nodes if original_attributes[node] == 1]
acc_c1 = get_accuracy(nodes_c1, lp.get_memberships(), original_attributes)

print(f"Test nodes in c=0: {len(nodes_c0)}, Test nodes in c=1: {len(nodes_c1)}")
print("Number of nodes predicted in c0 and c1:", len([n for n in lp.get_memberships()if n == 0]), len([n for n in lp.get_memberships() if n == 1]))
print("Most common labels in predictions:", Counter(list(lp.get_memberships().values())).most_common(3))
print(acc_c0*100, acc_c1*100)
np.var([acc_c0*100, acc_c1*100])

NameError: name 'test_nodes' is not defined