In [1]:
import sys
import os
import pandas as pd
import numpy as np
import networkx as nx
from Swing.util.Evaluator import Evaluator
from Swing.util.lag_identification import get_experiment_list, xcorr_experiments, calc_edge_lag
from nxpd import draw
from nxpd import nxpdParams
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.stats import fisher_exact, linregress, ttest_rel

%matplotlib inline

## Functions!

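#### Note: Several of the functions rely on notebook-level variables instead of receiving them as arguments (for example, `calc_scores` uses the global `evaluator`). The code will need to be cleaned up before it is shared.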

In [2]:
def is_square(n):
    """
    Determine if a number is a perfect square
    :param n: int or float
        The number to check
    :return: Boolean
        Return True if the number is a perfect square. Negative numbers are never perfect squares.
    """
    # Guard negatives explicitly: np.sqrt of a negative number yields nan and emits a RuntimeWarning.
    if n < 0:
        return False
    # Cast to a builtin float so is_integer() is available whatever numpy scalar type np.sqrt returns.
    return float(np.sqrt(n)).is_integer()


def get_factors(n):
    """
    Calculate the factors of a number
    :param n: int
        The number to be factored
    :return: list
        A sorted list of the unique factors from smallest to largest
    """
    # Every divisor i up to sqrt(n) pairs with n // i. Collecting the pairs in a set keeps the
    # root of a perfect square from appearing twice (e.g. 4 for n = 16).
    factors = set()
    for i in range(1, int(n ** 0.5) + 1):
        if n % i == 0:
            factors.update((int(i), int(n // i)))
    return sorted(factors)


def calc_subplot_dimensions(x):
    """
    Work out a grid layout for a matplotlib subplot figure.
    :param x: int
        Number of plots that need to be made
    :return: rows, columns
        The number of rows and columns that should be in the subplot
    """
    # Up to three plots are simply stacked in a single column
    if x <= 3:
        return x, 1

    # Increase the plot count until it is a perfect square or has more than two factors
    factor_list = get_factors(x)
    while not (len(factor_list) > 2 or is_square(x)):
        x += 1
        factor_list = get_factors(x)

    # A perfect square gives a square grid
    if is_square(x):
        side = int(np.sqrt(x))
        return side, side

    # Otherwise use the two middle entries of the sorted factor list
    middle = int(len(factor_list) / 2)
    return factor_list[middle - 1], factor_list[middle]


def get_true_edges(gold_filename):
    """
    Load the gold standard network and list its edges.
    :param gold_filename: str
        Path handed to Evaluator together with a tab separator
    :return: edges, evaluator
        The evaluator's ``gs_flat`` converted to a list, and the Evaluator instance itself
    """
    gold_evaluator = Evaluator(gold_filename, '\t')
    return gold_evaluator.gs_flat.tolist(), gold_evaluator


def get_edge_lags(data_filename):
    """
    Compute edge lags from a tab-separated data file.
    :param data_filename: str
        Path to the tab-separated data file
    :return: lags, df
        The result of calc_edge_lag and the data frame read from the file
    """
    timeseries_df = pd.read_csv(data_filename, sep="\t")
    # The first column is skipped when collecting gene names (presumably the time column -- TODO confirm)
    genes = timeseries_df.columns.values[1:].tolist()
    # NOTE(review): 21 and 10 are hard-coded arguments of get_experiment_list, and 0.1 / 0.5 of
    # calc_edge_lag; their meaning is defined in Swing.util.lag_identification -- confirm before reuse
    cross_correlations = xcorr_experiments(get_experiment_list(data_filename, 21, 10))
    edge_lags = calc_edge_lag(cross_correlations, genes, 0.1, 0.5, timestep=1)
    return edge_lags, timeseries_df


def get_network_changes(pickle_filename, edge_str='regulator-target', base_str='rank_importance_RF-td_21',
                        shortener_str='rank_importance_', replace=''):
    """
    Load pickled results and express every column relative to the baseline column.
    :param pickle_filename: str
        Path to a pickled DataFrame of results (only load pickles from a trusted source)
    :param edge_str: str
        Name of the column that identifies each edge; it becomes the index of the returned frames
    :param base_str: str
        Name of the column holding the baseline values
    :param shortener_str: str
        Substring that is replaced in the names of the remaining columns
    :param replace: str
        Replacement for shortener_str; also the suffix of the baseline column name ('Base_<replace>')
    :return: diff_df, rank_df, parameters
        diff_df: every column of rank_df minus the baseline column
        rank_df: the baseline column followed by the renamed remaining columns, indexed by edge
        parameters: set of the non-baseline column names of rank_df
    """
    results_df = pd.read_pickle(pickle_filename)

    rank_df = pd.DataFrame()
    rank_df[edge_str] = results_df[edge_str].values
    rank_df['Base_%s' % replace] = results_df[base_str].values
    for column in results_df.columns:
        if column != edge_str and column != base_str:
            rank_df[column.replace(shortener_str, replace)] = results_df[column].values

    # Index by the caller-supplied edge column; a hard-coded 'regulator-target' here raised a
    # KeyError whenever edge_str was anything other than the default.
    rank_df = rank_df.set_index([edge_str])
    # The baseline is the first remaining column, so subtract it from every column
    diff_df = (rank_df.T - rank_df.T.iloc[0]).T
    parameters = set(rank_df.columns[1:].values)
    return diff_df, rank_df, parameters


def get_network_data(goldstandard, timeseries, ignore_self=True):
    """
    Gather the true network and the lag of every edge for one data set.
    :param goldstandard: str
        Path to the gold standard file, passed to get_true_edges
    :param timeseries: str
        Path to the time-series file, passed to get_edge_lags
    :param ignore_self: bool
        When True, rows whose 'Parent' equals their 'Child' are removed from the lag table
    :return: true_edges, edge_df, data, dg, evaluator
        true_edges: list of edges from get_true_edges
        edge_df: DataFrame with a single 'Lag' column indexed by the 'Edge' values
        data: the data frame returned by get_edge_lags
        dg: networkx DiGraph built from true_edges
        evaluator: the evaluator returned by get_true_edges
    """
    # Build the directed graph of the true network
    true_edges, evaluator = get_true_edges(goldstandard)
    dg = nx.DiGraph()
    dg.add_edges_from(true_edges)

    # (Degree and edge-betweenness centrality statistics used to be computed here; deprecated.)

    # Lag of each edge, optionally without self-edges
    edge_lags, data = get_edge_lags(timeseries)
    if ignore_self:
        is_not_self_edge = edge_lags['Parent'] != edge_lags['Child']
        edge_lags = edge_lags[is_not_self_edge]
    lag_values = edge_lags['Lag'].values
    edge_labels = edge_lags['Edge'].values
    edge_df = pd.DataFrame(lag_values, index=edge_labels, columns=['Lag'])

    return true_edges, edge_df, data, dg, evaluator

def get_signed_edges(signed):
    """
    Read a headerless, tab-separated edge file into a frame indexed by edge.
    :param signed: str
        Path to the file; its first two columns identify the edge
    :return: pandas.DataFrame
        Indexed by (column 0, column 1) tuples under the name 'regulator-target', with the
        remaining column named 'sign'
    """
    table = pd.read_csv(signed, sep='\t', header=None)
    table['regulator-target'] = list(zip(table[0], table[1]))
    # Move the edge tuples into the index and discard the two columns they were built from
    signed_df = table.set_index(['regulator-target']).drop([0, 1], axis=1)
    signed_df.columns = ['sign']
    return signed_df

def calc_scores(ranking_df, evaluator=None):
    """
    Score every column of a ranking frame with an evaluator.
    :param ranking_df: pandas.DataFrame
        Indexed by edge with one column per parameter set; it is not modified
    :param evaluator: object, optional
        Object providing calc_roc and calc_pr. When omitted, the notebook-level variable named
        ``evaluator`` is used, which is what this function originally relied on implicitly.
    :return: roc, aupr
        Two lists with one entry per column of ranking_df, in column order. Each entry is the last
        value of the third item returned by calc_roc / calc_pr respectively.
    :raises NameError: if no evaluator is passed and no global ``evaluator`` exists
    """
    if evaluator is None:
        # Backward-compatible fallback to the notebook global
        try:
            evaluator = globals()['evaluator']
        except KeyError:
            raise NameError("name 'evaluator' is not defined; pass an evaluator to calc_scores")

    # Move the edge index into the first column so each evaluation receives [edge, rank] pairs
    filtered_ranks = ranking_df.reset_index(level=0)
    roc = []
    aupr = []
    for c in filtered_ranks.columns[1:]:
        # Columns that were already scored are dropped below, so the current one is at position 1
        filtered_ranks = filtered_ranks.sort_values(filtered_ranks.columns[1])
        roc.append(evaluator.calc_roc(filtered_ranks.iloc[:, :2])[2].values[-1])
        aupr.append(evaluator.calc_pr(filtered_ranks.iloc[:, :2])[2].values[-1])
        filtered_ranks = filtered_ranks.drop(c, axis=1)
    return roc, aupr

def calc_promotion(change_df, columns):
    """
    Count positive, negative and zero changes per column, for all rows and for lagged rows.
    :param change_df: pandas.DataFrame
        Must contain a 'Lag' column plus the columns named in ``columns``
    :param columns: list-like
        Columns of change_df to summarize
    :return: pandas.DataFrame
        One row per entry of ``columns`` with the counts 'true+', 'true-', 'true=' (all rows) and
        'lag+', 'lag-', 'lag=' (rows where 'Lag' > 0)
    """
    def _tally(frame):
        # Number of values above, below and equal to zero in each selected column
        values = frame.loc[:, columns].values
        return [np.sum(values > 0, axis=0), np.sum(values < 0, axis=0), np.sum(values == 0, axis=0)]

    lagged_only = change_df[change_df['Lag'] > 0]
    counts = _tally(change_df) + _tally(lagged_only)
    labels = ['true+', 'true-', 'true=', 'lag+', 'lag-', 'lag=']
    return pd.DataFrame(counts, index=labels, columns=columns).T

def get_net_stats(dg):
    """
    Compute summary statistics of a directed network.
    :param dg: networkx.DiGraph
        The network to describe
    :return: list
        [assortativity, average clustering, transitivity, radius, diameter]. Assortativity is taken
        from the directed graph and replaced by 0 when it is NaN; the other values come from the
        undirected version of the graph. Radius and diameter are 0 when networkx raises
        NetworkXError (presumably for graphs that are not connected -- confirm).
    """
    undirected = dg.to_undirected()

    assort = nx.degree_pearson_correlation_coefficient(dg)
    assort = 0 if np.isnan(assort) else assort

    clust = nx.average_clustering(undirected)
    trans = nx.transitivity(undirected)

    # Radius first, then diameter; each falls back to 0 independently
    distance_stats = []
    for measure in (nx.radius, nx.diameter):
        try:
            distance_stats.append(measure(undirected))
        except nx.NetworkXError:
            distance_stats.append(0)

    return [assort, clust, trans] + distance_stats

def get_c_table(summary_df):
    """
    Build one 2x2 contingency table per row of a promotion summary.

    Each table is laid out as
        [[lagged_promoted,     lagged_not_promoted],
         [not_lagged_promoted, not_lagged_not_promoted]]

    :param summary_df: pandas.DataFrame
        Must provide the columns 'true+', 'true-', 'true=', 'lag+', 'lag-' and 'lag='
    :return: numpy.ndarray
        The tables stacked along the first axis
    """
    lagged_promoted = summary_df['lag+']
    lagged_other = summary_df['lag-'] + summary_df['lag=']
    unlagged_promoted = summary_df['true+'] - summary_df['lag+']
    unlagged_other = summary_df['true-'] + summary_df['true='] - summary_df['lag-'] - summary_df['lag=']

    cells = np.array([[lagged_promoted, lagged_other],
                      [unlagged_promoted, unlagged_other]])
    # cells holds the summary rows along its last axis; regroup so each entry is one 2x2 table
    return np.array([cells[:, :, idx] for idx in range(cells.shape[2])])

### Compile results

In [3]:
lag_range = {'ml_0': [0, 1], 'ml_1': [0, 2], 'ml_2': [0, 3], 'ml_3': [1, 2], 'ml_4': [1, 4], 'ml_5': [2, 3]}
num_nets = 20
methods = ['Dionesus', 'RF']
replace_dict = {'Dionesus':'D', 'RF':'RF'}
models = ['Ecoli', 'Yeast']
result_types = ['te_change', 'te_rank', 'roc', 'pr']
# Directory holding the gold standard and time-series files, relative to this notebook
network_data_dir = "../data/gnw_insilico/network_data"
overall = {method:{model:{result:pd.DataFrame() for result in result_types} for model in models}
                for method in methods}
# network_stats[method][model][net] collects every per-network result computed below
network_stats = {method:{model:{net:{} for net in range(1, num_nets+1)} for model in models} for method in methods}
for method in methods:
    for model in models:
        for net in range(1, num_nets+1):
            short = 'rank_importance_%s' % method
            pickle_file = "%s_net%i_%s_promotion.pkl" % (model, net, method.lower())
            base_str = 'rank_importance_%s-td_21' % method

            # Get the network information
            gold_file = "%s/%s/%s-%i_goldstandard.tsv" % (network_data_dir, model, model, net)
            signed_file = gold_file.replace('.tsv', '_signed.tsv')
            data_file = "%s/%s/%s-%i_timeseries.tsv" % (network_data_dir, model, model, net)
            true_edges, edge_df, data, dg, evaluator = get_network_data(gold_file, data_file)
            signed_edges = get_signed_edges(signed_file)

            change, ranks, params = get_network_changes(pickle_file, base_str=base_str,
                                                        shortener_str=short, replace=replace_dict[method])

            # Sort the columns by name. reindex(columns=...) replaces DataFrame.reindex_axis,
            # which has been removed from pandas.
            change_df = change.reindex(columns=sorted(change.columns))
            ranks_df = ranks.reindex(columns=sorted(ranks.columns))
            conditions = change_df.columns.values
            network_stats[method][model][net]['rank_change'] = change_df

            # Calculate the auroc and aupr for each parameter set of the network.
            # calc_scores is called without an evaluator, so it uses the `evaluator` assigned above.
            net_name = model + str(net)
            roc_scores, pr_scores = calc_scores(ranks_df)
            roc_df = pd.DataFrame({net_name: roc_scores}, index=conditions)
            pr_df = pd.DataFrame({net_name: pr_scores}, index=conditions)
            network_stats[method][model][net]['auroc'] = roc_df.T
            network_stats[method][model][net]['aupr'] = pr_df.T

            # Compile results: attach the lag of each edge, then keep only the true edges together
            # with their sign
            full_change = pd.concat([edge_df, change_df], axis=1, join='inner')
            full_rank = pd.concat([edge_df, ranks_df], axis=1, join='inner')
            te_rank = pd.concat([signed_edges, full_rank[full_rank.index.isin(true_edges)]],
                                axis=1, join='inner')
            te_change = pd.concat([signed_edges, full_change[full_change.index.isin(true_edges)]],
                                  axis=1, join='inner')
            promotions = calc_promotion(te_change, conditions)
            stats = get_net_stats(dg)
            network_stats[method][model][net]['conditions'] = conditions
            network_stats[method][model][net]['change'] = full_change
            network_stats[method][model][net]['rank'] = full_rank
            network_stats[method][model][net]['te_change'] = te_change
            network_stats[method][model][net]['te_rank'] = te_rank
            network_stats[method][model][net]['promotions'] = promotions
            network_stats[method][model][net]['contingency'] = get_c_table(promotions)
            network_stats[method][model][net]['stats'] = pd.DataFrame(stats, index=['assort', 'clust',
                                                                                    'trans', 'rad', 'diam'])


# stats_df = pd.DataFrame(stats, index = roc_df.columns.values, columns = ['assort', 'clust', 'trans', 'rad', 'diam'])

## Summarize Results at each level

In [26]:
# How each result table is combined: the score tables keep only the columns shared by every
# frame ('inner'), the true-edge tables keep all columns ('outer').
summary_joins = {"aupr": "inner", "auroc": "inner", "te_change": "outer", "te_rank": "outer"}

s_dict = {}
for meth, mod in network_stats.items():
    s_dict[meth] = {key: pd.DataFrame() for key in summary_joins}
    for m in mod.keys():
        # Calculate it for each model organism.
        # All networks are concatenated in one call: seeding an inner join with an empty
        # DataFrame would leave no columns in common, and repeated concat is quadratic.
        nets = network_stats[meth][m]
        s_dict[meth][m] = {key: pd.concat([nets[k][key] for k in nets.keys()], join=how)
                           for key, how in summary_joins.items()}
        s_dict[meth][m]['promotion'] = calc_promotion(s_dict[meth][m]['te_change'], s_dict[meth][m]['aupr'].columns)
        s_dict[meth][m]['contingency'] = get_c_table(s_dict[meth][m]['promotion'])

    # Summarize it for each method by stacking the per-model tables
    for key, how in summary_joins.items():
        s_dict[meth][key] = pd.concat([s_dict[meth][m][key] for m in mod.keys()], join=how)
    s_dict[meth]['promotion'] = calc_promotion(s_dict[meth]['te_change'], s_dict[meth]['aupr'].columns)
    s_dict[meth]['contingency'] = get_c_table(s_dict[meth]['promotion'])

(nan, 1.0)
(0.70588235294117652, 0.21291987101812371)
(0.48518918918918919, 0.010862258220910654)
(0.37209302325581395, 0.00059678604791438272)
(0.4626736111111111, 0.006611078391902515)
(0.44788975021533162, 0.0059820461212942407)
(0.35999999999999999, 0.00033992183432555187)
(0.37309489941068891, 0.00073622441705142405)
(0.39983494945327008, 0.0014315216493716852)
(0.9464285714285714, 0.89297761029133271)


In [None]:
# Calculate true edges promoted
# NOTE(review): te_change here is whatever the last iteration of the compile loop left behind,
# and the first four columns are skipped -- confirm both are intended.
conditions = te_change.columns.values[4:]
summary_df = calc_promotion(te_change, conditions)
# c_table was never assigned at notebook level; build it from the summary
c_table = get_c_table(summary_df)

# get_c_table stacks one 2x2 table per condition along the first axis; print each p-value
for condition, table in zip(conditions, c_table):
    print(condition, fisher_exact(table)[1])

# for ii, run in enumerate(conditions):
#     if 'ml' in run:
#         key = run.split('-')[1]
#         min_lag = lag_range[key][0]
#         max_lag = lag_range[key][1]
#         in_range = true_edge_change[(true_edge_change['Lag'] >= min_lag) & (true_edge_change['Lag'] <= max_lag)][[run]]
#         print(run, np.sum(in_range.values > 0)/len(in_range))

In [None]:
c = ['b', 'r', 'g']
# NOTE(review): 345 is a hard-coded divisor (presumably a total edge count) -- confirm its origin
bar_scale = 345
bar_positions = range(len(summary_df))
plt.figure()
# Stack columns 3, 4 and 5 of summary_df on top of each other, one colour per column
for ii, bar_colour in zip(range(3, 6), c):
    bar_heights = summary_df.iloc[:, ii] / bar_scale
    if ii == 3:
        plt.bar(bar_positions, bar_heights, color=bar_colour)
    else:
        # Later columns start where the previously drawn ones end
        bar_bottom = np.sum(summary_df.iloc[:, 3:ii] / bar_scale, axis=1)
        plt.bar(bar_positions, bar_heights, color=bar_colour, bottom=bar_bottom)
# summary_df.columns[3:]

In [None]:
f, axarr = plt.subplots(2, 2, figsize=(20,10))
# axarr[0, 0].plot([0.3, 0.9], [0.3, 0.9], '-', c='0.5', lw=5)
# axarr[1, 1].plot([0.3, 0.9], [0.3, 0.9], '-', c='0.5', lw=5)
for ii in range(2,len(roc_df)):
    m = roc_df.index[ii]
    # Rows 0 and 1 are the reference rows (presumably the two baseline runs -- confirm).
    # Labels containing 'D' are compared with row 0 and drawn on the top row of axes, all
    # others with row 1 on the bottom row. The PR comparison previously used row 0 in both
    # cases, which was inconsistent with the ROC comparison.
    base = 0 if 'D' in m else 1
    roc_pval = ttest_rel(roc_df.iloc[base], roc_df.iloc[ii])[1]
    pr_pval = ttest_rel(pr_df.iloc[base], pr_df.iloc[ii])[1]
    # Only runs that differ significantly from their reference row are plotted
    if roc_pval < 0.05:
        axarr[base, 0].plot(roc_df.iloc[base], roc_df.iloc[ii], '.', label=m, ms=10)
    if pr_pval < 0.05:
        axarr[base, 1].plot(pr_df.iloc[base], pr_df.iloc[ii], '.', label=m, ms=10)


# axarr[0].set_xlabel('Baseline')
# axarr[0].set_ylabel('SWING score')
# axarr[0].legend(loc='best')
# axarr[1].set_xlabel('Baseline')
# axarr[1].set_ylabel('SWING score')
# axarr[1].legend(loc='best')

In [None]:
# NOTE(review): stats_df is only assigned in a commented-out line of the visible notebook;
# it must be defined before this cell can run.
for row in range(len(roc_df)):
    for col in stats_df.columns.values:
        # Print the correlation coefficient between each network statistic and each row of scores.
        # No figure is opened here: nothing is drawn while the plot call below stays disabled.
        print(roc_df.index.values[row], col, linregress(stats_df[col], roc_df.iloc[row]).rvalue)
#         plt.figure(); plt.plot(stats_df[col], roc_df.iloc[1], '.')