## Online Social Network Information Spread

This experiment tests how different data integration setups can affect the spread of information in online social networks

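We use four components of nidmod: the feature setup (`FeatureSetup`), the network integrator (`NetworkIntegrator`), the custom diffusion model (`CustomDiffusionModel`), and multi-network diffusion (`MultiNetworkDiffusion`).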

In [1]:
import os
import sys

# Location of the local nidmod checkout that the imports below are resolved from.
# Set the NIDMOD_PATH environment variable to point at your own checkout; when it is
# unset, the path this notebook was originally written against is used unchanged.
NIDMOD_PATH = os.environ.get("NIDMOD_PATH", "C:/Users/jnevin/Documents/GitHub/nidmod")

# Only append once, so re-running this cell in the same kernel does not pile up
# duplicate sys.path entries.
if NIDMOD_PATH not in sys.path:
    sys.path.append(NIDMOD_PATH)

from nidmod.datahandling.dataintegration import FeatureSetup, MatchClassifier, NetworkIntegrator
from nidmod.diffusionmodel.diffusionmodel import CustomDiffusionModel
from nidmod.parameter_sweeper import MultiNetworkDiffusion

no display found. Using non-interactive Agg backend
no display found. Using non-interactive Agg backend


We have nodes and edges for the myspace and last.fm graphs, plus matches between the two

In [2]:
import pandas as pd
import networkx as nx
import numpy as np

# Node tables: tab-separated, no header row. Later cells use column 0 as the
# node id and column 1 as the username.
myspace_nodes = pd.read_csv('myspace.nodes', sep = '\t', header = None)
# Edge lists: space-separated, no header row. Later cells use columns 0 and 1
# as the two endpoints of each edge.
myspace_edges = pd.read_csv('myspace.edges', sep = ' ', header = None)

lastfm_nodes = pd.read_csv('lastfm.nodes', sep = '\t', header = None)
lastfm_edges = pd.read_csv('lastfm.edges', sep = ' ', header = None)

# Known matches between the two networks. Downstream, column 0 is compared with
# last.fm usernames and column 1 with MySpace usernames.
# NOTE(review): all five paths are relative, so the files must sit in the
# notebook's working directory.
true_mapping = pd.read_csv('lastfm-myspace.map.raw', sep = ' ', header = None)

We relabel the nodes and edges so that it is clear which network each one belongs to, and then construct the graphs from the edge lists

In [3]:
# Tag every node id with the name of its network, then index each node table by
# the tagged id and give the two columns readable names.
myspace_nodes[0] = myspace_nodes[0].map(lambda node_id: 'myspace_' + str(node_id))
myspace_nodes = (
    myspace_nodes
    .rename(columns={0: 'myspace_index', 1: 'username'})
    .set_index('myspace_index')
)

lastfm_nodes[0] = lastfm_nodes[0].map(lambda node_id: 'lastfm_' + str(node_id))
lastfm_nodes = (
    lastfm_nodes
    .rename(columns={0: 'lastfm_index', 1: 'username'})
    .set_index('lastfm_index')
)

# Apply the same tags to both endpoint columns of each edge list so the edges
# refer to the tagged node ids.
for endpoint_column in (0, 1):
    myspace_edges[endpoint_column] = myspace_edges[endpoint_column].map(
        lambda node_id: 'myspace_' + str(node_id))
    lastfm_edges[endpoint_column] = lastfm_edges[endpoint_column].map(
        lambda node_id: 'lastfm_' + str(node_id))

# Build one undirected graph per network from the (column 0, column 1) endpoint pairs.
myspace_graph = nx.Graph()
myspace_graph.add_edges_from(zip(myspace_edges[0], myspace_edges[1]))

lastfm_graph = nx.Graph()
lastfm_graph.add_edges_from(zip(lastfm_edges[0], lastfm_edges[1]))

We work only with the largest connected component of each graph (almost the entire graph). A matched pair of accounts is only considered as a candidate seed node if both of its accounts lie in the largest connected component of their respective network

In [4]:
# Restrict each graph to its largest connected component (largest by node count).
largest_cc = max(nx.connected_components(myspace_graph), key=len)
myspace_graph = myspace_graph.subgraph(largest_cc)

# `largest_cc` is reused here, so after this cell it refers to the last.fm component.
largest_cc = max(nx.connected_components(lastfm_graph), key=len)
lastfm_graph = lastfm_graph.subgraph(largest_cc)

# Keep only the node rows whose index label is a node of the reduced graph.
myspace_nodes = myspace_nodes.loc[list(myspace_graph.nodes())]
lastfm_nodes = lastfm_nodes.loc[list(lastfm_graph.nodes())]

# A mapping row is usable only when both of its usernames survived the filtering:
# column 1 must be a remaining MySpace username and column 0 a remaining last.fm username.
useable_mappings = true_mapping.loc[(true_mapping[1].isin(myspace_nodes.username)) & (true_mapping[0].isin(lastfm_nodes.username))]

We define neighbour exploration to create the subsampled graphs from the seed nodes

In [5]:
def neighbour_explore(graph, seed_nodes, steps, size):
    """Grow a node sample outwards from ``seed_nodes`` by random neighbour sampling.

    The exploration runs for ``steps`` rounds. In every round each node of the
    current frontier is expanded: up to ``size`` of its neighbours are drawn
    without replacement with ``np.random.choice`` (all of them when it has no
    more than ``size``). The nodes drawn in a round are added to the result, and
    those that have not been expanded yet form the frontier of the next round.

    Parameters
    ----------
    graph : object exposing ``neighbors(node)``
        Graph to explore.
    seed_nodes : iterable
        Nodes the exploration starts from; they are always part of the result.
    steps : int
        Number of expansion rounds.
    size : int
        Maximum number of neighbours drawn per expanded node.

    Returns
    -------
    set
        The seed nodes together with every node drawn during the exploration.

    Notes
    -----
    Sampling uses NumPy's global random state, so results vary between runs
    unless that state is seeded by the caller.
    """
    reached = set(seed_nodes)
    expanded = set()
    frontier = seed_nodes

    for _ in range(steps):
        drawn_this_round = set()
        for current in frontier:
            expanded = expanded | {current}
            candidates = list(graph.neighbors(current))
            draw_count = min(size, len(candidates))
            picks = np.random.choice(candidates, replace=False, size=draw_count)
            drawn_this_round = drawn_this_round | set(picks)
        reached = reached | drawn_this_round
        # Nodes that were already expanded are not expanded a second time.
        frontier = drawn_this_round - expanded

    return reached

We define the diffusion model and simulation parameters

In [6]:
# The three node states of the diffusion model.
statuses = ['Susceptible', 'Infected', 'Removed']
# Two 'NodeStochastic' compartments, keyed 'c1' and 'c2'; the keys are what the
# transition rules below refer to.
# NOTE(review): presumably the number is a per-step rate and the optional second
# entry ('Infected') is the neighbour status that triggers it — confirm against
# CustomDiffusionModel.
compartments = {'NodeStochastic': {'c1': [0.02, 'Infected'], 'c2': [0.01]}}
# Each rule names two statuses and a compartment key.
# NOTE(review): presumably [from_status, to_status, compartment] — confirm.
transition_rules = [["Susceptible", "Infected", "c1"], ["Infected", "Removed", "c2"]]
# NOTE(review): presumably 10% of the nodes start out infected — confirm.
model_parameters = [['fraction_infected', 0.1]]
# Passed positionally to MultiNetworkDiffusion.run_diffusion_model; the meaning of
# the four entries is defined there and is not visible in this notebook.
# TODO: document what each position stands for.
simulation_parameters = [25, 600, None, 5]
# Not referenced by any other cell visible in this notebook.
model_name = 'sir'

custom_diffusion_model = CustomDiffusionModel(statuses, compartments,
                                             transition_rules, model_parameters)

We test different string thresholds with a Levenshtein similarity function

In [None]:
# One dict (threshold -> value) is collected per seed sample and the DataFrames are
# built once after the loop. DataFrame.append, which the loop used before, was
# removed in pandas 2.0, and growing a frame row by row copies it on every iteration.
# If the loop is interrupted, the partial results are still available in these lists.
levenshtein_graph_records = []
levenshtein_diff_records = []

# we test 25 different initial seed mappings
for seed_node_num in range(25):
    # we choose 200 random seed mappings (drawn without replacement, so at least
    # 200 usable mappings are required)

    seed_mappings = useable_mappings.loc[np.random.choice(list(useable_mappings.index), size = 200, replace = False)]

    # and then find the subsampled networks based on these seed nodes

    myspace_seed_nodes = myspace_nodes.loc[myspace_nodes.username.isin(seed_mappings[1])].index
    lastfm_seed_nodes = lastfm_nodes.loc[lastfm_nodes.username.isin(seed_mappings[0])].index

    # two exploration rounds, drawing at most five neighbours per expanded node
    myspace_neighbour_nodes = neighbour_explore(myspace_graph, myspace_seed_nodes, 2, 5)
    lastfm_neighbour_nodes = neighbour_explore(lastfm_graph, lastfm_seed_nodes, 2, 5)

    myspace_neighbour_graph = myspace_graph.subgraph(myspace_neighbour_nodes)
    lastfm_neighbour_graph = lastfm_graph.subgraph(lastfm_neighbour_nodes)

    myspace_nodes_reduced = myspace_nodes.loc[list(myspace_neighbour_graph.nodes())]
    lastfm_nodes_reduced = lastfm_nodes.loc[list(lastfm_neighbour_graph.nodes())]

    index_setup = {'Full': [[]]}
    integrated_graphs = []

    # we test a number of different string thresholds
    tested_thresholds = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]

    for string_threshold in tested_thresholds:
        compare_setup = {'String': [['username', 'username', 'levenshtein', string_threshold]]}

        feature_setup = FeatureSetup(index_setup, compare_setup, myspace_nodes_reduced, lastfm_nodes_reduced)
        features = feature_setup.calculate_features()

        # candidate pairs with a positive value in feature column 0 are taken as predicted matches
        pred_matches = features.loc[features[0] > 0]
        pred_matches = pred_matches.index

        network_integrator = NetworkIntegrator([myspace_neighbour_graph, lastfm_neighbour_graph], pred_matches)
        integrated_graphs.append(network_integrator.integrate_network('multigraph_walktrap_integration'))

    # one integrated graph per threshold, all simulated with the same diffusion model
    multi_network_diffusion = MultiNetworkDiffusion(integrated_graphs, custom_diffusion_model)
    graph_assc_results_analysers = multi_network_diffusion.run_diffusion_model(simulation_parameters)
    final_stats = graph_assc_results_analysers.get_average_stat_comparison()

    # we track the size of the graphs and the proportion of the population adopting the information
    levenshtein_graph_records.append(dict(zip(tested_thresholds,
                                              [len(graph.nodes()) for graph in integrated_graphs])))

    levenshtein_diff_records.append(dict(zip(tested_thresholds,
                                             final_stats['Removed_final'].values)))

# one row per seed sample, one column per tested threshold
levenshtein_graph_df = pd.DataFrame(levenshtein_graph_records)
levenshtein_diff_df = pd.DataFrame(levenshtein_diff_records)

In [7]:
# Reduced version of the threshold sweep: a single seed sample and only two thresholds.
#
# One dict (threshold -> value) is collected per seed sample and the DataFrames are
# built once after the loop. DataFrame.append, which the loop used before, was
# removed in pandas 2.0, and growing a frame row by row copies it on every iteration.
levenshtein_graph_records = []
levenshtein_diff_records = []

# we test a single initial seed mapping here
for seed_node_num in range(1):
    # we choose 200 random seed mappings (drawn without replacement, so at least
    # 200 usable mappings are required)

    seed_mappings = useable_mappings.loc[np.random.choice(list(useable_mappings.index), size = 200, replace = False)]

    # and then find the subsampled networks based on these seed nodes

    myspace_seed_nodes = myspace_nodes.loc[myspace_nodes.username.isin(seed_mappings[1])].index
    lastfm_seed_nodes = lastfm_nodes.loc[lastfm_nodes.username.isin(seed_mappings[0])].index

    # two exploration rounds, drawing at most five neighbours per expanded node
    myspace_neighbour_nodes = neighbour_explore(myspace_graph, myspace_seed_nodes, 2, 5)
    lastfm_neighbour_nodes = neighbour_explore(lastfm_graph, lastfm_seed_nodes, 2, 5)

    myspace_neighbour_graph = myspace_graph.subgraph(myspace_neighbour_nodes)
    lastfm_neighbour_graph = lastfm_graph.subgraph(lastfm_neighbour_nodes)

    myspace_nodes_reduced = myspace_nodes.loc[list(myspace_neighbour_graph.nodes())]
    lastfm_nodes_reduced = lastfm_nodes.loc[list(lastfm_neighbour_graph.nodes())]

    index_setup = {'Full': [[]]}
    integrated_graphs = []

    # we test a number of different string thresholds
    tested_thresholds = [0.6, 0.65]

    for string_threshold in tested_thresholds:
        compare_setup = {'String': [['username', 'username', 'levenshtein', string_threshold]]}

        feature_setup = FeatureSetup(index_setup, compare_setup, myspace_nodes_reduced, lastfm_nodes_reduced)
        features = feature_setup.calculate_features()

        # candidate pairs with a positive value in feature column 0 are taken as predicted matches
        pred_matches = features.loc[features[0] > 0]
        pred_matches = pred_matches.index

        network_integrator = NetworkIntegrator([myspace_neighbour_graph, lastfm_neighbour_graph], pred_matches)
        integrated_graphs.append(network_integrator.integrate_network('multigraph_walktrap_integration'))

    # one integrated graph per threshold, all simulated with the same diffusion model
    multi_network_diffusion = MultiNetworkDiffusion(integrated_graphs, custom_diffusion_model)
    graph_assc_results_analysers = multi_network_diffusion.run_diffusion_model(simulation_parameters)
    final_stats = graph_assc_results_analysers.get_average_stat_comparison()

    # we track the size of the graphs and the proportion of the population adopting the information
    levenshtein_graph_records.append(dict(zip(tested_thresholds,
                                              [len(graph.nodes()) for graph in integrated_graphs])))

    levenshtein_diff_records.append(dict(zip(tested_thresholds,
                                             final_stats['Removed_final'].values)))

# one row per seed sample, one column per tested threshold
levenshtein_graph_df = pd.DataFrame(levenshtein_graph_records)
levenshtein_diff_df = pd.DataFrame(levenshtein_diff_records)