In [1]:
import sys, warnings
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from scipy.stats import binom

In [2]:
# Path of the single network matrix file explored interactively in this notebook.
filename = "../data/[UNC]ADNI-network/AD-Data/AD-Data/S90388_fdt_network_matrix"
# Field separator handed to pandas.read_csv: a regular expression matching runs of whitespace.
# Written as a raw string because "\s" is not a valid escape sequence in a normal string literal.
sep = r'\s+'

In [54]:
# Parse the headerless matrix file and melt it into a long edge list:
# one row per (row position, column position, cell value).
mat = pd.read_csv(filename, header = None, sep = sep)
table_origin = mat.stack().reset_index()
table_origin.columns = ["src", "trg", "weight"]

# Total weight of all rows sharing the same `trg` (i.e. the same matrix column).
gp = table_origin.groupby("trg", as_index = False)["weight"].sum()
gp = gp.rename(columns = {"weight": "sum_weight"})

# Attach the per-`trg` total to every edge and divide to get the normalized weight.
table_norm = table_origin.merge(gp, how = "inner", on = "trg")
table_norm["normalized_weight"] = table_norm["weight"].div(table_norm["sum_weight"])

# The helper total is no longer needed once the ratio has been computed.
table = table_norm.drop(columns = "sum_weight")
table

Unnamed: 0,src,trg,weight,normalized_weight
0,0,0,0,0.000000
1,1,0,784,0.000385
2,2,0,807,0.000396
3,3,0,1411,0.000693
4,4,0,304321,0.149387
...,...,...,...,...
21899,143,147,0,0.000000
21900,144,147,38,0.000075
21901,145,147,3,0.000006
21902,146,147,81,0.000161


In [3]:

# source: http://www.michelecoscia.com/?page_id=287

def read(filename, column_of_interest, triangular_input = False, consider_self_loops = True, undirected = False, drop_zeroes = True, sep = "\t"):
    """Read a headerless weight-matrix file into the backboning edge-table format.

    The file is parsed with ``pandas.read_csv(..., header=None)`` and stacked into a long
    edge list: the row position becomes ``src``, the column position becomes ``trg`` and the
    cell value becomes ``weight``. A second column, ``normalized_weight``, is computed as
    ``weight`` divided by the sum of ``weight`` over all rows sharing the same ``trg``
    (0/0 yields NaN, which is removed when ``drop_zeroes`` is True).

    Args:
    filename (str): The path to the matrix file.
    column_of_interest (str): Which computed column to use as the edge weight, either
        "weight" or "normalized_weight". It is returned under the name "nij".

    KWArgs:
    triangular_input (bool): If True, every edge is mirrored (src and trg swapped) and duplicated (src, trg) pairs are dropped. default: False
    consider_self_loops (bool): If False, rows with src == trg are removed. default: True
    undirected (bool): If True, the returned edge count is halved. default: False
    drop_zeroes (bool): If True, only rows with a strictly positive weight are kept. default: True
    sep (str): The field separator passed to pandas.read_csv. default: tab

    Returns:
    A tuple (table, number of nodes, number of edges), where table is a DataFrame with the
    columns "src", "trg" and "nij". The edge count is a float when undirected is True.
    """
    mat = pd.read_csv(filename, sep = sep, header = None)
    edges = mat.stack().reset_index()
    edges.columns = ["src", "trg", "weight"]

    # Per-target totals, used as the denominator of the normalized weight.
    trg_totals = edges.groupby(["trg"], as_index = False)["weight"].sum()
    trg_totals.columns = ["trg", "sum_weight"]
    edges = pd.merge(edges, trg_totals, on = ["trg"], how = "inner")
    edges["normalized_weight"] = edges["weight"] / edges["sum_weight"]

    # rename() returns a new frame, so no in-place edit is made on a column slice.
    table = edges[["src", "trg", column_of_interest]].rename(columns = {column_of_interest: "nij"})
    if drop_zeroes:
        table = table[table["nij"] > 0]
    if not consider_self_loops:
        table = table[table["src"] != table["trg"]]
    if triangular_input:
        # Mirror every edge by swapping the two endpoint labels; concat aligns columns by name.
        mirrored = table.rename(columns = {"src": "trg", "trg": "src"})
        table = pd.concat([table, mirrored], axis = 0)
        table = table.drop_duplicates(subset = ["src", "trg"])
    original_nodes = len(set(table["src"]) | set(table["trg"]))
    original_edges = table.shape[0]
    if undirected:
        return table, original_nodes, original_edges / 2
    else:
        return table, original_nodes, original_edges
    
def disparity_filter(table, undirected = False, return_self_loops = False):
    """Attach a disparity-filter score and variance to every edge of an edge table.

    For each row, with ``nij_sum`` the total ``nij`` of all rows sharing its ``src`` and
    ``k`` the number of rows sharing its ``src``:

        score    = 1 - (1 - nij / nij_sum) ** (k - 1)
        variance = k**2 * ((20 + 4k) / ((k + 1)(k + 2)(k + 3)) - 4 / (k + 1)**2)

    Args:
    table (pandas.DataFrame): Edge table with the columns "src", "trg" and "nij". It is not modified.

    KWArgs:
    undirected (bool): If True, rows are grouped by their unordered (src, trg) pair; one row
        per pair is kept (the first one), carrying the maximum score and the minimum variance
        found in the pair. default: False
    return_self_loops (bool): If False, rows with src == trg are removed from the result
        (they still contribute to nij_sum and k). default: False

    Returns:
    A DataFrame with the columns "src", "trg", "nij", "score" and "variance".
    """
    sys.stderr.write("Calculating DF score...\n")
    table = table.copy()
    # Per-source total weight and per-source row count; only the needed column is aggregated.
    src_strength = table.groupby("src")["nij"].sum().reset_index()
    src_degree = table.groupby("src")["trg"].count().reset_index()
    # Left merges keep every edge; the suffixes produce the "nij_sum" and "trg_count" columns.
    table = table.merge(src_strength, on = "src", how = "left", suffixes = ("", "_sum"))
    table = table.merge(src_degree, on = "src", how = "left", suffixes = ("", "_count"))
    k = table["trg_count"]
    table["score"] = 1.0 - ((1.0 - (table["nij"] / table["nij_sum"])) ** (k - 1))
    table["variance"] = (k ** 2) * (((20 + (4.0 * k)) / ((k + 1.0) * (k + 2) * (k + 3))) - ((4.0) / ((k + 1.0) ** 2)))
    if not return_self_loops:
        table = table[table["src"] != table["trg"]]
    if undirected:
        # Direction-independent key per edge. Built with zip (not a row-wise apply) so it also
        # works on an empty table; assign() returns a new frame instead of writing into a slice.
        edge_keys = ["%s-%s" % (min(s, t), max(s, t)) for s, t in zip(table["src"], table["trg"])]
        table = table.assign(edge = edge_keys)
        best_score = table.groupby(by = "edge")["score"].max().reset_index()
        least_variance = table.groupby(by = "edge")["variance"].min().reset_index()
        # The per-row values get the suffixes and the per-pair aggregates keep the plain names.
        table = table.merge(best_score, on = "edge", suffixes = ("_min", ""))
        table = table.merge(least_variance, on = "edge", suffixes = ("_max", ""))
        table = table.drop_duplicates(subset = ["edge"])
        # Keyword form of drop(): the positional axis argument is rejected by pandas >= 2.0.
        table = table.drop(columns = ["edge", "score_min", "variance_max"])
    return table[["src", "trg", "nij", "score", "variance"]]


def thresholding(table, threshold):
    """Keep only the edges of a scored edge table that pass a significance threshold.

    When the table has a "sdev_cij" column, an edge passes if
    score - threshold * sdev_cij > 0; otherwise it passes if score > threshold.

    Args:
    table (pandas.DataFrame): The scored edge table (columns "src", "trg", "nij", "score").
    threshold (float): The significance threshold.

    Returns:
    The passing rows, restricted to the columns "src", "trg", "nij" and "score".
    """
    backbone_columns = ["src", "trg", "nij", "score"]
    scored = table.copy()
    if "sdev_cij" not in scored:
        passes = scored["score"] > threshold
    else:
        passes = (scored["score"] - (threshold * scored["sdev_cij"])) > 0
    return scored.loc[passes, backbone_columns]

    
def test_densities(table, start, end, step):
    """Yield backbone size statistics for an evenly spaced sweep of thresholds.

    This is a generator: argument validation and all computation happen when iteration starts.

    Args:
    table (pandas.DataFrame): Scored edge table accepted by thresholding().
    start (float): First threshold of the sweep.
    end (float): Last threshold of the sweep (included, up to floating point tolerance).
    step (float): Positive distance between consecutive thresholds.

    Yields:
    Tuples (threshold, nodes, node percentage, edges, edge percentage, average degree,
    average degree relative to the unfiltered table). The average degree is reported as 0.0
    when a threshold removes every edge.

    Raises:
    ValueError: If start > end, if step is not positive, or if table has no edges.
    """
    if start > end:
        raise ValueError("start must be lower than end")
    if step <= 0:
        # A non-positive step would never reach `end`.
        raise ValueError("step must be positive")
    onodes = len(set(table["src"]) | set(table["trg"]))
    oedges = table.shape[0]
    if onodes == 0:
        # Every ratio below is relative to the unfiltered table, so it must not be empty.
        raise ValueError("table must contain at least one edge")
    oavgdeg = (2.0 * oedges) / onodes
    # Derive each threshold from its index rather than by repeated addition, so accumulated
    # rounding error cannot drop the end point (0.1 + 0.1 + 0.1 > 0.3 in floating point).
    n_steps = int((end - start) / step + 1e-9) + 1
    for i in range(n_steps):
        s = start + i * step
        edge_table = thresholding(table, s)
        nodes = len(set(edge_table["src"]) | set(edge_table["trg"]))
        edges = edge_table.shape[0]
        # An empty backbone has no nodes; report degree 0 instead of dividing by zero.
        avgdeg = (2.0 * edges) / nodes if nodes else 0.0
        yield (s, nodes, (100.0 * nodes) / onodes, edges, (100.0 * edges) / oedges, avgdeg, avgdeg / oavgdeg)
        
def write(table, network, method, folder):
    """Save an edge table as a tab separated file named <folder>/<network>_<method>.csv.

    Nothing is written, and a RuntimeWarning is issued instead, when the table is empty or
    has no "src" column.

    Args:
    table (pandas.DataFrame): The edge table to save.
    network (str): First part of the output file name.
    method (str): Second part of the output file name.
    folder (str): Directory that receives the file.
    """
    if table.empty or "src" not in table:
        warnings.warn("Incorrect/empty output. Nothing written on disk", RuntimeWarning)
        return
    destination = "%s/%s_%s.csv" % (folder, network, method)
    table.to_csv(destination, sep = "\t", index = False)

In [92]:
# Build the edge table from the raw "weight" column. The file is parsed once only: a read of
# "normalized_weight" assigned to the same name would be overwritten straight away.
# Indexing the returned tuple avoids assigning to `_`, IPython's last-output variable.
table = read(filename, "weight", undirected = False, drop_zeroes = True, sep = sep)[0]

In [93]:
table

Unnamed: 0,src,trg,nij
1,1,0,784
2,2,0,807
3,3,0,1411
4,4,0,304321
5,5,0,110290
...,...,...,...
21895,139,147,82
21896,140,147,8
21900,144,147,38
21901,145,147,3


In [94]:
# Score every edge of `table` with disparity_filter, using its defaults
# (undirected = False, return_self_loops = False), then display the scored table.
df_table = disparity_filter(table)
df_table

Calculating DF score...


Unnamed: 0,src,trg,nij,score,variance
0,1,0,784,0.045539,0.030256
1,2,0,807,0.061220,0.026114
2,3,0,1411,0.071507,0.025945
3,4,0,304321,1.000000,0.027363
4,5,0,110290,0.982459,0.026114
...,...,...,...,...,...
20577,139,147,82,0.002303,0.025778
20578,140,147,8,0.000198,0.026459
20579,144,147,38,0.001935,0.026459
20580,145,147,3,0.000129,0.025778


In [110]:
# Threshold sweep from 0.01 to 0.05 in steps of 0.01.
# The loop variables avoid the name `np`, which would rebind the module-level numpy alias.
for (s, n, n_pct, e, e_pct, ad, ad_ratio) in test_densities(df_table, 0.01, 0.05, 0.01):
    fstr = "alpha : %.2f, nodes : %4d, node ratio : %05.2f, edges : %6d, edge ratio : %06.3f, deg %4d, deg ratio : %06.3f"%(s, n, n_pct, e, e_pct, ad, ad_ratio)
    print(fstr)

alpha : 0.01, nodes :  148, node ratio : 100.00, edges :  10135, edge ratio : 49.242, deg  136, deg ratio : 00.492
alpha : 0.02, nodes :  148, node ratio : 100.00, edges :   8346, edge ratio : 40.550, deg  112, deg ratio : 00.405
alpha : 0.03, nodes :  148, node ratio : 100.00, edges :   7346, edge ratio : 35.691, deg   99, deg ratio : 00.357
alpha : 0.04, nodes :  148, node ratio : 100.00, edges :   6705, edge ratio : 32.577, deg   90, deg ratio : 00.326
alpha : 0.05, nodes :  148, node ratio : 100.00, edges :   6224, edge ratio : 30.240, deg   84, deg ratio : 00.302


In [109]:
# Threshold sweep from 0.05 to 0.99 in steps of 0.05.
# The loop variables avoid the name `np`, which would rebind the module-level numpy alias.
for (s, n, n_pct, e, e_pct, ad, ad_ratio) in test_densities(df_table, 0.05, 0.99, 0.05):
    fstr = "alpha : %.2f, nodes : %4d, node ratio : %05.2f, edges : %6d, edge ratio : %06.3f, deg %4d, deg ratio : %06.3f"%(s, n, n_pct, e, e_pct, ad, ad_ratio)
    print(fstr)

alpha : 0.05, nodes :  148, node ratio : 100.00, edges :   6224, edge ratio : 30.240, deg   84, deg ratio : 00.302
alpha : 0.10, nodes :  148, node ratio : 100.00, edges :   4883, edge ratio : 23.725, deg   65, deg ratio : 00.237
alpha : 0.15, nodes :  148, node ratio : 100.00, edges :   4197, edge ratio : 20.392, deg   56, deg ratio : 00.204
alpha : 0.20, nodes :  148, node ratio : 100.00, edges :   3737, edge ratio : 18.157, deg   50, deg ratio : 00.182
alpha : 0.25, nodes :  148, node ratio : 100.00, edges :   3426, edge ratio : 16.646, deg   46, deg ratio : 00.166
alpha : 0.30, nodes :  148, node ratio : 100.00, edges :   3140, edge ratio : 15.256, deg   42, deg ratio : 00.153
alpha : 0.35, nodes :  148, node ratio : 100.00, edges :   2913, edge ratio : 14.153, deg   39, deg ratio : 00.142
alpha : 0.40, nodes :  148, node ratio : 100.00, edges :   2718, edge ratio : 13.206, deg   36, deg ratio : 00.132
alpha : 0.45, nodes :  148, node ratio : 100.00, edges :   2523, edge ratio : 12

In [112]:
# Threshold sweep from 0.95 to 0.99 in steps of 0.01.
# The loop variables avoid the name `np`, which would rebind the module-level numpy alias.
for (s, n, n_pct, e, e_pct, ad, ad_ratio) in test_densities(df_table, 0.95, 0.99, 0.01):
    fstr = "alpha : %.2f, nodes : %4d, node ratio : %05.2f, edges : %6d, edge ratio : %06.3f, deg %4d, deg ratio : %06.3f"%(s, n, n_pct, e, e_pct, ad, ad_ratio)
    print(fstr)

alpha : 0.95, nodes :  148, node ratio : 100.00, edges :   1224, edge ratio : 05.947, deg   16, deg ratio : 00.059
alpha : 0.96, nodes :  148, node ratio : 100.00, edges :   1180, edge ratio : 05.733, deg   15, deg ratio : 00.057
alpha : 0.97, nodes :  148, node ratio : 100.00, edges :   1132, edge ratio : 05.500, deg   15, deg ratio : 00.055
alpha : 0.98, nodes :  148, node ratio : 100.00, edges :   1069, edge ratio : 05.194, deg   14, deg ratio : 00.052
alpha : 0.99, nodes :  148, node ratio : 100.00, edges :    983, edge ratio : 04.776, deg   13, deg ratio : 00.048


In [113]:
thresholding(df_table, 0.99)

Unnamed: 0,src,trg,nij,score
3,4,0,304321,1.000000
22,23,0,562533,1.000000
29,30,0,182301,0.999998
30,31,0,33464,0.998643
51,52,0,87986,0.991295
...,...,...,...,...
20482,145,146,175430,0.999558
20557,106,147,136914,1.000000
20558,107,147,107834,0.995630
20560,109,147,140039,0.999992


In [22]:
# Directory whose files are read as input network matrices by the batch loop.
filepath_r = "../data/[UNC]ADNI-network/AD-Data/AD-Data"
# Directory that receives the thresholded edge lists produced by write().
filepath_w = "./data_50"


In [23]:
from os import walk
import os.path
# Keep only the names of the files directly inside filepath_r (third element of the first
# tuple produced by walk). They are sorted because the order in which directory entries are
# listed is arbitrary, which would make `filenames[0]` and the processing order vary by machine.
# Indexing the tuple avoids assigning to `_`, IPython's last-output variable.
filenames = sorted(next(walk(filepath_r))[2])

In [24]:
filenames[0]

'S100790_fdt_network_matrix'

In [25]:
os.path.join(filepath_r,filenames[0])

'../data/[UNC]ADNI-network/AD-Data/AD-Data\\S100790_fdt_network_matrix'

In [26]:
# Threshold passed to thresholding() for every file (alternatives previously listed here: 0.95, 0.05).
alpha = 0.50
# Create the output folder if needed, so that writing into it cannot fail on a missing directory.
os.makedirs(filepath_w, exist_ok = True)
# The loop variable is not called `filename`, so the single-file path defined near the top of
# the notebook keeps its value after this cell has run.
for matrix_name in filenames:
    table = read(os.path.join(filepath_r, matrix_name), "weight", undirected = False, drop_zeroes = True, sep = sep)[0]
    df_table = disparity_filter(table)
    th_table = thresholding(df_table, alpha)
    write(th_table, matrix_name, "df", filepath_w)

Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF s

Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF score...
Calculating DF s

In [2]:
!python sparsification.py

alpha : 0.05, nodes :  148, node ratio : 100.00, edges :   6224, edge ratio : 30.240, deg   84, deg ratio : 00.302

Calculating DF score...



alpha : 0.10, nodes :  148, node ratio : 100.00, edges :   4883, edge ratio : 23.725, deg   65, deg ratio : 00.237
alpha : 0.15, nodes :  148, node ratio : 100.00, edges :   4197, edge ratio : 20.392, deg   56, deg ratio : 00.204
alpha : 0.20, nodes :  148, node ratio : 100.00, edges :   3737, edge ratio : 18.157, deg   50, deg ratio : 00.182
alpha : 0.25, nodes :  148, node ratio : 100.00, edges :   3426, edge ratio : 16.646, deg   46, deg ratio : 00.166
alpha : 0.30, nodes :  148, node ratio : 100.00, edges :   3140, edge ratio : 15.256, deg   42, deg ratio : 00.153
alpha : 0.35, nodes :  148, node ratio : 100.00, edges :   2913, edge ratio : 14.153, deg   39, deg ratio : 00.142
alpha : 0.40, nodes :  148, node ratio : 100.00, edges :   2718, edge ratio : 13.206, deg   36, deg ratio : 00.132
alpha : 0.45, nodes :  148, node ratio : 100.00, edges :   2523, edge ratio : 12.258, deg   34, deg ratio : 00.123
alpha : 0.50, nodes :  148, node ratio : 100.00, edges :   2360, edge ratio : 1