In [15]:
from graphdriver.load import labels
from graphdriver.commons import data as cm_data, config
from graphdriver.main import transformers
from graphdriver.utils import cons
import matplotlib.pyplot as plt
import pandas as pd
from tabulate import tabulate

In [12]:
# ipynb.debug()

# Default configuration for the cancer type analysed in this notebook.
cancer = "brca"
conf = config.default(cancer, ["genes", "ppi"], 0)

# Overrides applied on top of the defaults.
conf.type = "train"
conf.weighted = True
conf.use_tumor = True

# Further overrides, currently disabled:
# conf.gcnk = 1
# conf.dropouts = 0.3
# conf.epochs = 30


## Average connection statistics per driver type for tumors

In [None]:
def avg_connections(data, gcnk: int):
    """Average share of edge targets per class, grouped by the source gene's class.

    Edges are taken from ``data.edge_index`` and translated to gene names via
    the inverse of ``data.genes_dict``. Source genes are assigned to the first
    class (driver-specific, driver, candidate, passenger — in that order) whose
    entry in ``data.drivers`` contains them; a class without an entry in
    ``data.drivers`` receives all edges not claimed by an earlier class.

    NOTE(review): other cells in this notebook read ``gene_edge_index``,
    ``symbol_index_dict`` and ``labels`` from the loaded data object — confirm
    that the object passed here still exposes ``edge_index``, ``genes_dict``
    and ``drivers``.

    Args:
        data: Dataset object providing ``edge_index`` (with a ``.numpy()``
            method), ``genes_dict`` and ``drivers``. Presumably ``drivers`` maps
            a class name to a frame indexed by gene name — verify against the
            loader.
        gcnk: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame whose rows are source classes and whose columns are target
        classes, holding the mean of the 0/1 target indicators, plus a ``Σ``
        row (column sums) and a ``Σ`` column (row sums).
    """
    # Use the dataset handed in by the caller. The previous version replaced it
    # with `dataset.Dataset(conf).data`, but `dataset` is not imported anywhere
    # in this notebook, so the call failed on a fresh kernel.
    edges = pd.DataFrame(data.edge_index.numpy().T, columns=[cons.SYM_SOURCE, cons.SYM_TARGET])
    index_to_gene = {v: k for k, v in data.genes_dict.items()}
    edges[cons.SYM_SOURCE] = edges[cons.SYM_SOURCE].map(index_to_gene).to_numpy()
    edges[cons.SYM_TARGET] = edges[cons.SYM_TARGET].map(index_to_gene).to_numpy()

    types = [cons.DRIVER_SPEC, cons.DRIVER, cons.CANDIDATE, cons.PASSENGER]
    driver_types = {}
    for t in types:
        if t in data.drivers:
            # Edges whose source belongs to class t; they are removed from the
            # pool so that later classes only see the remaining edges.
            is_source_t = edges[cons.SYM_SOURCE].isin(data.drivers[t].index)
            d = edges[is_source_t].copy()
            edges = edges[~is_source_t]
        else:
            # Class without an explicit gene list (passengers, handled last):
            # everything that is still left over.
            d = edges.copy()

        # Flag the class of each edge's target with one 0/1 column per class.
        # Working on explicit copies avoids pandas' SettingWithCopyWarning.
        for tt in types:
            d[tt] = 0
            if tt in data.drivers:
                d.loc[d[cons.SYM_TARGET].isin(data.drivers[tt].index), tt] = 1
            else:
                # No gene list for tt: a target counts as tt when none of the
                # three listed classes matched. Relies on those columns having
                # been filled earlier in this loop.
                d.loc[d[cons.CANDIDATE] + d[cons.DRIVER] + d[cons.DRIVER_SPEC] == 0, tt] = 1
        driver_types[t] = d

    # Columns 0 and 1 are source/target names; the rest are the class flags.
    means = [driver_types[t].iloc[:, 2:].mean() for t in types]
    m = pd.DataFrame(means)
    # Rows were appended in `types` order, which is also the column order.
    m.index = m.columns
    m.loc['Σ'] = m.sum()
    m['Σ'] = m.sum(axis=1)
    return m

In [None]:
# Build the transformers from the active configuration and load the dataset
# for the configured cancer type (`cancer` from the configuration cell, rather
# than repeating the 'brca' literal here).
tfs = transformers.from_conf(conf)
data = cm_data.Dataset(cancer, tfs).get_data()

In [100]:
import itertools

# Edge list of the gene graph, with node indices translated to gene symbols.
df = pd.DataFrame(data.gene_edge_index.numpy().T, columns=[cons.SYM_SOURCE, cons.SYM_TARGET])
genes_dict_inv = {v: k for k, v in data.symbol_index_dict.items()}
df[cons.SYM_SOURCE] = df[cons.SYM_SOURCE].map(genes_dict_inv).to_numpy()
df[cons.SYM_TARGET] = df[cons.SYM_TARGET].map(genes_dict_inv).to_numpy()

# NOTE(review): the next four names are not read anywhere in this cell — drop
# them if no other cell needs them.
types = [cons.DRIVERS_CANCER, cons.DRIVERS_OTHER, cons.CANDIDATES, cons.PASSENGERS]
driver_types = {}
before = df.shape[0]
genes_all = list(data.symbol_index_dict.keys())

# Node indices of every label class.
drivers_cancer = data.labels.drivers_cancer.numpy()
drivers_others = data.labels.drivers_others.numpy()
candidates = data.labels.candidates.numpy()
passengers = data.labels.passengers.numpy()

lbls = [(drivers_cancer, cons.DRIVERS_CANCER), (drivers_others, cons.DRIVERS_OTHER), (candidates, cons.CANDIDATES), (passengers, cons.PASSENGERS)]

# Membership masks are computed once per class (4 + 4 `isin` calls) instead of
# being rebuilt for each of the 16 source/target combinations.
source_masks = {}
target_masks = {}
for class_indices, class_name in lbls:
    class_symbols = [genes_dict_inv[g] for g in class_indices]
    source_masks[class_name] = df[cons.SYM_SOURCE].isin(class_symbols)
    target_masks[class_name] = df[cons.SYM_TARGET].isin(class_symbols)

# One 0/1 column per (source class, target class) pair, named "<source>_<target>".
col_names = []
for (_, source_name), (_, target_name) in itertools.product(lbls, repeat=2):
    col_name = "{}_{}".format(source_name, target_name)
    col_names.append(col_name)
    df[col_name] = (source_masks[source_name] & target_masks[target_name]).astype("int64")
df.head()

Unnamed: 0,source_symbol,target_symbol,drivers_cancer_drivers_cancer,drivers_cancer_drivers_other,drivers_cancer_candidates,drivers_cancer_passengers,drivers_other_drivers_cancer,drivers_other_drivers_other,drivers_other_candidates,drivers_other_passengers,candidates_drivers_cancer,candidates_drivers_other,candidates_candidates,candidates_passengers,passengers_drivers_cancer,passengers_drivers_other,passengers_candidates,passengers_passengers
0,A1BG,MZT2B,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,A1BG,ZNF446,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,A1BG,CHMP2A,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,A1BG,ALKBH7,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,A1BG,SCAND1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [108]:
# Number of edges in every (source class, target class) combination.
df.loc[:, col_names].sum(axis="index").to_frame()

Unnamed: 0,0
drivers_cancer_drivers_cancer,6
drivers_cancer_drivers_other,27
drivers_cancer_candidates,203
drivers_cancer_passengers,46
drivers_other_drivers_cancer,36
drivers_other_drivers_other,439
drivers_other_candidates,2817
drivers_other_passengers,502
candidates_drivers_cancer,401
candidates_drivers_other,4055


### Print results to file

In [9]:
# Each entry: [label used in the output file names, value assigned to conf.use_tumor].
tumor_normal = [["tumor", True], ["normal", False]]
ks = [3,5,7,9,11,15]

# NOTE: this loop leaves conf.use_tumor set to the value of the last entry.
for tn_label, use_tumor in tumor_normal:
    conf.use_tumor = use_tumor
    plots = {}
    report_parts = []
    for k in ks:
        # NOTE(review): k is only forwarded to avg_connections, which does not
        # use it, and nothing here writes k into conf. Presumably `conf.gcnk = k`
        # was intended before building the transformers — confirm; as written,
        # every k produces the same table.
        tfs = transformers.from_conf(conf)
        # Load the data the same way as the data loading cell above. The
        # previous code called `data.Dataset()` on the already-loaded data
        # object and then read the undefined name `ds`.
        k_data = cm_data.Dataset(cancer, tfs).get_data()

        stats = avg_connections(k_data, k)
        report_parts.append("k is " + str(k) + "\n")
        report_parts.append(tabulate(stats, headers='keys', tablefmt='psql') + "\n")
        # Keep only the 4x4 class block (without the Σ row/column) for plotting.
        plots[k] = stats.iloc[:4,:4].T

    out_stem = cancer + "_stats_" + tn_label + ".txt"
    with open(out_stem, 'w') as f:
        f.write("".join(report_parts))

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
    # The fixed 2x3 grid holds exactly one panel per k.
    assert len(ks) == axes.size, "subplot grid does not match the number of k values"
    # axes.flat walks the grid row by row, matching the order of ks.
    for ax, k in zip(axes.flat, ks):
        plots[k].plot(ax=ax, title="k is " + str(k))
    plt.tight_layout()
    plt.savefig(out_stem + ".png")
    plt.close(fig)

Unnamed: 0,driver_specific,driver,candidate,passenger
driver_specific,1.0,0.804958,-0.504231,-0.340942
driver,0.804958,1.0,-0.037887,-0.757985
candidate,-0.504231,-0.037887,1.0,-0.615131
passenger,-0.340942,-0.757985,-0.615131,1.0
