## Dark Network Information Transfer

This experiment tests how different data integration setups affect the ability of a partially synthetic dark network to transfer information.

We use the combination builder, parameter sweeper, multiple network diffusion, and custom diffusion model

In [1]:
import pandas as pd
import numpy as np
import networkx as nx

from nidmod.parameter_sweeper import CombinationBuilder, ParameterSweeper

no display found. Using non-interactive Agg backend
no display found. Using non-interactive Agg backend


We have a dataframe (df) of FEBRL records containing duplicate entries, imported through the recordlinkage toolkit. We also have an associated graph (a randomly generated BA graph) and the set of true matches.

In [2]:
# Person records, keyed by their record id (the 'rec_id' column becomes the index).
febrl_df = pd.read_csv('febrl_df.csv').set_index('rec_id')

# True matches, stored on disk as an array of tuples and exposed as a MultiIndex.
# NOTE: allow_pickle=True unpickles arbitrary Python objects, so only load this
# file from a trusted source.
all_matches = pd.MultiIndex.from_tuples(
    np.load('febrl_matches.npy', allow_pickle = True)
)

# Graph associated with the records.
febrl_graph = nx.read_gml('febrl_graph.gml')

In [3]:
# Display the first few records as a quick sanity check of the loaded frame.
febrl_df.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209.0,6988048
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219.0,7364009
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210.0,2635962
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612.0,9004242
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024.0,8099933


We add the attributes from the df to the network

In [4]:
# Work on a copy so the graph loaded from disk stays free of attributes.
febrl_graph_attr = febrl_graph.copy()

# {column name -> {index label -> value}}: one mapping per dataframe column.
febrl_dict = febrl_df.to_dict()

# Attach every column to the graph as a node attribute of the same name.
for attribute, values_by_node in febrl_dict.items():
    nx.set_node_attributes(febrl_graph_attr, values = values_by_node, name = attribute)

We define the different integration setups. We define two different blocking setups and two attribute comparison setups.

In [7]:
# CombinationBuilder is also imported in the first cell; re-imported here as in
# the original cell.
from nidmod.parameter_sweeper import CombinationBuilder

# Indexing (blocking) setups: each one blocks on a single field.
index_setup_0, index_setup_1 = (
    {'Block': [[field, field]]} for field in ('given_name', 'surname')
)
all_index_setups = [index_setup_0, index_setup_1]

# Comparison setups: fields compared exactly vs. with a string metric.
# The trailing 0.85 is presumably a similarity threshold -- confirm against nidmod.
compare_setup_0 = {
    'Exact': [[field, field]
              for field in ('given_name', 'date_of_birth', 'suburb', 'state')],
    'String': [
        ['surname', 'surname', 'jarowinkler', 0.85],
        ['address_1', 'address_1', 'levenshtein', 0.85],
    ],
}

compare_setup_1 = {
    'Exact': [[field, field] for field in ('surname', 'suburb', 'state')],
    'String': [
        ['given_name', 'given_name', 'jarowinkler', 0.85],
        ['address_1', 'address_1', 'levenshtein', 0.85],
    ],
}

all_compare_setups = [compare_setup_0, compare_setup_1]

# Classifier setups (a single classifier in this experiment).
classifier_name_0 = 'NaiveBayesClassifier'
all_classifier_names = [classifier_name_0]

# Clustering algorithms (a single algorithm in this experiment).
clustering_algs = ['walktrap_integration']

# Build every combination of the indexing, comparison, classifier and
# clustering choices defined above.
combination_builder = CombinationBuilder(
    all_index_setups,
    all_compare_setups,
    all_classifier_names,
    clustering_algs,
)
integration_setups = combination_builder.get_all_combinations()

# Integrating according to the different integration setups.
from nidmod.parameter_sweeper import ParameterSweeper

graphs = [febrl_graph_attr]

# Share of the true matches handed to ParameterSweeper as training matches.
TRAINING_FRACTION = 0.25
# Fixed seed so that Restart & Run All draws the same training sample each time.
TRAINING_SAMPLE_SEED = 0

# Sample positions (without replacement) from a dedicated, seeded generator and
# take them straight from the MultiIndex. This leaves the global NumPy RNG
# untouched and avoids converting the index to an array of tuples and back.
training_rng = np.random.default_rng(TRAINING_SAMPLE_SEED)
training_positions = training_rng.choice(
    len(all_matches),
    size = int(TRAINING_FRACTION * len(all_matches)),
    replace = False,
)
training_matches = all_matches.take(training_positions)

parameter_sweeper = ParameterSweeper(integration_setups, graphs, training_matches)
different_integrated_networks = parameter_sweeper.get_integrated_networks()

We define the algorithm used to test information transfer.

In [None]:
from cdlib import algorithms
def packet_transfer_sims(networks, q, num_packets = 1000):
    """Simulate community-guided packet routing on each network.

    For every network, ``num_packets`` packets are sent from a random source
    node to a different random target node. At each step the packet looks at
    the neighbours of its current node and applies these rules in order:

    1. If the target is a neighbour, it is delivered with probability ``1 - q``.
    2. Otherwise (or if rule 1 was not taken), if any neighbour belongs to the
       target's walktrap community, the packet moves to a random such
       neighbour with probability ``1 - q``.
    3. Failing both, it moves to a uniformly random neighbour.

    A packet stops when it is delivered, when its current node has no
    neighbours (recorded as time ``T``), or after ``T`` steps, where ``T`` is
    the number of nodes. A packet counts as delivered when its time is
    strictly below ``T``, so a delivery on exactly step ``T`` is recorded as
    undelivered.

    Parameters
    ----------
    networks : iterable
        Graphs exposing ``nodes()`` and ``neighbors(node)`` and accepted by
        ``cdlib.algorithms.walktrap``.
    q : float
        Probability that a guided move (rule 1 or rule 2) is *not* taken.
    num_packets : int, default 1000
        Number of packets simulated per network.

    Returns
    -------
    pandas.DataFrame
        One row per network with columns ``times`` (list of steps per packet),
        ``delivered`` (list of bools), ``T`` and ``num_packets``.

    Notes
    -----
    Randomness comes from the global ``np.random`` state; seed it before
    calling for reproducible results.
    """
    records = []
    for network in networks:
        all_nodes = list(network.nodes())
        T = len(all_nodes)

        # The communities depend only on the network, so detect them once per
        # network rather than once per packet.
        community_of = {}
        for community in algorithms.walktrap(network).communities:
            members = set(community)
            for node in community:
                # Keep the first community that lists the node, as a
                # first-match scan over the community list would.
                community_of.setdefault(node, members)

        times_taken = []
        delivered = []
        for _ in range(num_packets):
            candidates = list(all_nodes)
            source_node = np.random.choice(candidates)
            # The target must differ from the source.
            candidates.remove(source_node)
            target_node = np.random.choice(candidates)
            target_node_comm = community_of[target_node]

            current_node = source_node
            t = 0
            while t < T:
                node_neighborhood = list(network.neighbors(current_node))
                if not node_neighborhood:
                    # Dead end: record the packet as timed out.
                    t = T
                    break
                if target_node in node_neighborhood and np.random.uniform() < (1-q):
                    current_node = target_node
                    t += 1
                    break
                node_overlaps = [x for x in node_neighborhood if x in target_node_comm]
                if node_overlaps and np.random.uniform() < (1-q):
                    current_node = np.random.choice(node_overlaps)
                else:
                    current_node = np.random.choice(node_neighborhood)
                t += 1
            times_taken.append(t)
            delivered.append(t < T)

        records.append({'times': times_taken, 'delivered': delivered,
                        'T': T, 'num_packets': num_packets})

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns = ['times', 'delivered', 'T', 'num_packets'])

We execute the algorithm on each of the integrated networks and calculate average measures.

In [None]:
# Probability that a guided move is skipped in the routing simulation.
q = 0.2
# Packets simulated per integrated network.
NUM_PACKETS = 10000
# packet_transfer_sims draws from the global NumPy RNG; seed it so the results
# below are reproducible when the cell is re-run.
SIMULATION_SEED = 0
np.random.seed(SIMULATION_SEED)

times_taken_df = packet_transfer_sims(different_integrated_networks, q, num_packets = NUM_PACKETS)

# One label per integrated network, derived from the result so the cell keeps
# working when the number of integration setups changes.
times_taken_df.index = ['integration_setup_{}'.format(i) for i in range(len(times_taken_df))]

# Fraction of packets that reached their target within the time limit.
times_taken_df['proportion_delivered'] = times_taken_df.delivered.apply(np.mean)

# Sum of 1/time over delivered packets, divided by the number of packets sent
# (undelivered packets contribute 0). Rows are iterated directly instead of
# indexing the string-labelled Series by integer position.
times_taken_df['inv_ave_delivery_time'] = [
    (1.0 / np.asarray(times, dtype = float)[np.asarray(delivered, dtype = bool)]).sum() / num_packets
    for times, delivered, num_packets in zip(times_taken_df['times'],
                                             times_taken_df['delivered'],
                                             times_taken_df['num_packets'])
]