## Co-author Network Label Propagation

This experiment tests how different data integration setups can affect community label propagation in a co-author network.

We use the `FeatureSetup` and `NetworkIntegrator` classes from `nidmod.datahandling.dataintegration`.

In [1]:
from nidmod.datahandling.dataintegration import FeatureSetup, NetworkIntegrator

We have an adjacency matrix, an authors list, community labels for the nodes/authors, and noisy author names.

In [2]:
import pandas as pd
import networkx as nx
import numpy as np
from networkx.algorithms import node_classification

# Load the adjacency matrix of the co-author network and build the graph.
adj = np.loadtxt('adjacency.txt')
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is the
# supported constructor for a plain ndarray such as the one loadtxt returns.
G = nx.from_numpy_array(adj)

# Author names, read without a header row (the single column is labelled 0).
author_names = pd.read_csv('authors.txt', header=None)

# Space-separated community labels; the first line is read as the header.
author_comms = pd.read_csv('commLabels.txt', sep=' ')

# Noisy variants of the author names; the first line is read as the header.
adjusted_names = pd.read_csv('authors_noisy.csv')

We create dataframes and dictionaries with all the correct, non-noisy information.

In [3]:
num_authors = len(author_names)

# Positional row label -> 'network0_<i>' node identifier, shared by the
# name table and the community-label table.
network0_ids = {i: f'network0_{i}' for i in range(num_authors)}

network0_names = author_names.rename(columns={0: 'name'}, index=network0_ids)
author_comms = author_comms.rename(index=network0_ids)

# Nested dict of the form {column: {node identifier: value}}.
author_comms_dict = author_comms.to_dict()

We create a function that splits the true network randomly in two and then randomly relabels the nodes in the second network with the noisy author names.

In [4]:
def create_split_networks():
    """Split the global graph ``G`` into two networks and name the second one.

    Both networks keep every node of ``G``; each edge of ``G`` is placed in
    exactly one of them, chosen at random. Nodes are then relabelled to
    ``'network0_<node>'`` and ``'network1_<node>'`` respectively.

    Returns
    -------
    G_0 : networkx.Graph
        First network, nodes labelled ``'network0_<node>'``.
    G_1 : networkx.Graph
        Second network, nodes labelled ``'network1_<node>'``.
    network1_names : pandas.DataFrame
        Indexed by ``'network1_<i>'`` with a single ``'name'`` column; each
        row holds, at random, either the entry from ``adjusted_names`` or
        the entry from ``author_names`` for that position.
    """
    node_ids = list(G.nodes())

    first_net = nx.Graph()
    first_net.add_nodes_from(node_ids)
    second_net = nx.Graph()
    second_net.add_nodes_from(node_ids)

    # One random draw per edge decides which network receives it.
    for edge in G.edges():
        target = first_net if np.random.randint(0, 2) == 0 else second_net
        target.add_edge(*edge)

    first_net = nx.relabel_nodes(
        first_net, {node: f'network0_{node}' for node in first_net.nodes()}
    )
    second_net = nx.relabel_nodes(
        second_net, {node: f'network1_{node}' for node in second_net.nodes()}
    )

    network1_names = pd.DataFrame(
        index=[f'network1_{i}' for i in range(num_authors)]
    )

    # One random draw per author: 0 takes the noisy name, 1 keeps the original.
    network1_names['name'] = [
        (adjusted_names if np.random.randint(0, 2) == 0 else author_names)
        .iloc[i].values[0]
        for i in range(num_authors)
    ]

    return first_net, second_net, network1_names

We create a function that splits the networks and integrates them before running the label propagation. The string similarity thresholds to test and the number of simulations to run are passed as arguments.

In [5]:
def integrate_and_diffuse(thresholds, splits):
    """Split, re-integrate and label-propagate the network for each threshold.

    For each of ``splits`` simulations, ``create_split_networks`` produces two
    networks. For every value in ``thresholds`` the two networks are
    integrated using matches from a ``FeatureSetup`` configured with a
    ``'levenshtein'`` comparison on the ``'name'`` columns, labels are
    propagated over the integrated graph with a harmonic function, and
    summary statistics are recorded.

    Parameters
    ----------
    thresholds : sequence of float
        Values passed as the last element of the ``'String'`` comparison
        setup; one integrated graph is built per value.
    splits : int
        Number of simulations (random splits) to run.

    Returns
    -------
    all_comm_prop : pandas.DataFrame
        One row per (split, threshold) with columns ``comm_1``, ``comm_2``,
        ``nodes`` and ``threshold``. The index restarts at 0 for every split.
    all_graph_prop : pandas.DataFrame
        One row per (split, threshold) with columns ``graph``,
        ``close_cent``, ``bet_cent``, ``har_cent``, ``deg_cent`` and
        ``threshold``. The index restarts at 0 for every split.
        Both frames are empty when ``splits`` is 0.
    """
    comm_columns = ['comm_1', 'comm_2', 'nodes']
    # 'deg_cent' is listed last to keep the column order the results had
    # when it was only introduced by the first appended row.
    graph_columns = ['graph', 'close_cent', 'bet_cent', 'har_cent', 'deg_cent']

    comm_frames = []
    graph_frames = []

    for _ in range(splits):

        # we create the two networks with noisy author names
        G_0, G_1, network1_names = create_split_networks()

        index_setup = {'Full': [[]]}
        integrated_graphs = []

        # for each string threshold, we perform integration
        for string_threshold in thresholds:

            compare_setup = {'String': [['name', 'name', 'levenshtein', string_threshold]]}

            feature_setup = FeatureSetup(index_setup, compare_setup, network0_names, network1_names)
            features = feature_setup.calculate_features()

            # candidate pairs with a positive value in feature column 0 are
            # treated as predicted matches
            pred_matches = features.loc[features[0] > 0].index

            network_integrator = NetworkIntegrator([G_0, G_1], pred_matches)
            integrated_graphs.append(network_integrator.integrate_network('multigraph_walktrap_integration'))

        # Rows are collected in plain lists and turned into frames once;
        # DataFrame.append was removed in pandas 2.0.
        comm_rows = []
        graph_rows = []

        # for each graph, we perform the label propagation
        for graph in integrated_graphs:

            # the labels are available for the nodes network_0
            nx.set_node_attributes(graph, author_comms_dict['SCORE'], "label")

            # we propagate with a harmonic function and check the number of
            # nodes and proportion in each community
            node_class = node_classification.harmonic_function(graph)
            num_nodes = len(graph.nodes)
            comm_1_share = sum(x == 1 for x in node_class) / num_nodes
            comm_rows.append({'comm_1': comm_1_share,
                              'comm_2': 1 - comm_1_share,
                              'nodes': num_nodes})

            # we calculate some graph properties (mean centrality over nodes)
            graph_rows.append({
                'graph': graph,
                'deg_cent': np.mean(list(nx.degree_centrality(graph).values())),
                'close_cent': np.mean(list(nx.closeness_centrality(graph).values())),
                'bet_cent': np.mean(list(nx.betweenness_centrality(graph).values())),
                'har_cent': np.mean(list(nx.harmonic_centrality(graph).values())),
            })

        community_prop = pd.DataFrame(comm_rows, columns=comm_columns)
        graph_properties = pd.DataFrame(graph_rows, columns=graph_columns)

        # one row was recorded per threshold, in threshold order
        community_prop['threshold'] = thresholds
        graph_properties['threshold'] = thresholds

        comm_frames.append(community_prop)
        graph_frames.append(graph_properties)

    # Concatenate once at the end; pd.concat rejects an empty list, so the
    # zero-split case returns empty frames explicitly.
    all_comm_prop = pd.concat(comm_frames) if comm_frames else pd.DataFrame()
    all_graph_prop = pd.concat(graph_frames) if graph_frames else pd.DataFrame()

    return all_comm_prop, all_graph_prop

We can execute the experiments with a number of thresholds and simulations.

In [None]:
# Thresholds start at 0.5 and advance in steps of 0.02; 0.92 is the stop
# value handed to np.arange.
string_thresholds = np.arange(0.5, 0.92, 0.02)
num_simulations = 100

label_prop_results, graph_properties = integrate_and_diffuse(string_thresholds, num_simulations)