In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyro
import pyro.distributions as dist
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from pyro.infer import SVI, TraceMeanField_ELBO
from tqdm import trange

# import classes
import generate_toy_data as data
import model_signals_only as MSO
import model_signals_only_fixedBeta as MSB
import model_signals_and_refState as MSR
import model_signals_refStates_fixedBeta as MSRB

# Reproducibility: use one fixed seed for both torch and pyro.
seed = 0
torch.manual_seed(seed)
pyro.set_rng_seed(seed)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [3]:
# Size parameters of the simulated data set.
num_bins = 10000
num_references = 3
num_groups = 3
state_vary_rate = 0.01
num_signals = 3
num_states = 3

# Every output of this notebook is written below this folder; the folder name
# encodes the number of bins, references, signals and states.
output_folder = '../../experiments/strict_genData_circularState/b{}_r{}_m{}_s{}'.format(
    num_bins, num_references, num_signals, num_states
)
# `helper` and `args` are not defined anywhere in this notebook, so the folder
# is created with the standard library from the `output_folder` defined above.
# NOTE(review): assumes helper.make_dir only created the directory - confirm.
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Notation:
#   M: # regions
#   N: # bins per region
#   L: # signals (marks)
#   alpha: params of dirichlet prior over reference epigenomics
#   beta: ref --> sample state categorical distribution
#   p: state --> signal bernoulli distribution
#   r: reference state at each bin. one-hot encoding, matrix size : #bins * #ref * #states
#   theta: the mixture probabilities of reference epigenome
generator_params = {
    'num_bins': num_bins,
    'num_references': num_references,
    'num_groups': num_groups,
    'state_vary_rate': state_vary_rate,
    'num_signals': num_signals,
    'num_states': num_states,
}
generator = data.CircularStateGenerator(**generator_params, high_w=100)

# Simulated inputs used by every model below.
m = generator.get_sampled_signals()
r = generator.get_ref_state_indicators()
p = generator.params['p']

# this will save the simulated data of state assignment
# (ground truth state assignment probabilities)
# Uses `output_folder` from the configuration cell (`args` is not defined in this notebook).
generator.save_collapsed_theta(os.path.join(output_folder, 'collapsed_theta.txt.gz'))


Setting up all the models 

In [None]:
# Hyper-parameters passed to all four model constructors.
hidden = 32
dropout = 0.2

# Constructor arguments that are identical across the four models;
# the two fixedBeta variants additionally receive `p`.
_shared_args = (num_signals, num_references, num_states, hidden, dropout)
m_SigOnly = MSO.model_Signals(*_shared_args)
m_SigBeta = MSB.model_signals_only_fixedBeta(*_shared_args, p)
m_SigRef = MSR.model_signals_refStates(*_shared_args)
m_SigRefBeta = MSRB.model_signals_refStates_fixedBeta(*_shared_args, p)

_models_and_classes = (
    (m_SigOnly, MSO.model_Signals),
    (m_SigBeta, MSB.model_signals_only_fixedBeta),
    (m_SigRef, MSR.model_signals_refStates),
    (m_SigRefBeta, MSRB.model_signals_refStates_fixedBeta),
)

# Send every model to the selected device.
for _model, _model_cls in _models_and_classes:
    _model.to(device)

# Sanity check: print whether each object passes the isinstance test for its class.
for _model, _model_cls in _models_and_classes:
    print(isinstance(_model, (type, _model_cls)))

Define a function that trains a model, reconstructs the input data, and returns the ratio(s) of the data that are correctly reconstructed

In [None]:
def learn_and_reconstruct_input(state_model, m, r, p, posterior_fn,
                                batch_size=200, learning_rate=1e-3, num_epochs=1000):
    """Train `state_model` with SVI, then score and save its reconstruction.

    Parameters
    ----------
    state_model : object exposing `model`, `guide`,
        `get_percentage_correct_reconstruct` and `write_predicted_state_assignment`.
    m : signal data, sliced along its first axis into mini-batches
        (presumably a torch tensor, since `.shape` and `.size(0)` are used - confirm).
    r : reference-state data, sliced along its first axis; only used for
        models that are not one of the two signals-only classes.
    p : unused here; kept so existing callers keep working.
    posterior_fn : path handed to `write_predicted_state_assignment`.
    batch_size, learning_rate, num_epochs : training hyper-parameters
        (defaults match the previously hard-coded values).

    Returns
    -------
    (ratio_m_CR, ratio_r_CR) for models that use reference states, otherwise
    (ratio_m_CR, 0) where 0 is a placeholder for the non-applicable ratio.
    """
    pyro.clear_param_store()
    optimizer = pyro.optim.Adam({"lr": learning_rate})
    num_batches = int(math.ceil(m.shape[0] / batch_size))

    # Some models need the reference-state data, the signals-only ones do not.
    # (The previous check also listed `type`, which only matches class objects
    # and never a model instance, so it is dropped.)
    signals_only_classes = (MSO.model_Signals, MSB.model_signals_only_fixedBeta)
    withRef = not isinstance(state_model, signals_only_classes)

    svi = SVI(state_model.model, state_model.guide, optimizer, loss=TraceMeanField_ELBO())
    bar = trange(num_epochs)
    for epoch in bar:
        running_loss = 0.0
        for i in range(num_batches):
            batch_m = m[i * batch_size:(i + 1) * batch_size, :]
            if withRef:
                batch_r = r[i * batch_size:(i + 1) * batch_size, :, :]
                loss = svi.step(batch_m, batch_r)
            else:
                loss = svi.step(batch_m)
            running_loss += loss / batch_m.size(0)
        # Refresh the progress bar once per epoch. This used to sit after the
        # loop, so the loss was only shown once training had already finished.
        bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))

    if withRef:
        ratio_m_CR, ratio_r_CR = state_model.get_percentage_correct_reconstruct(m, r)
        state_model.write_predicted_state_assignment(m, r, posterior_fn)
        return ratio_m_CR, ratio_r_CR

    ratio_m_CR = state_model.get_percentage_correct_reconstruct(m)
    state_model.write_predicted_state_assignment(m, posterior_fn)
    return ratio_m_CR, 0  # 0 is a placeholder: ratio_r_CR is not applicable here


REPORT THE OUTPUT DATA

In [None]:
# One row per model: its configuration followed by the two reconstruction ratios.
result_df = pd.DataFrame(columns=[
    'model', 'num_signals', 'num_references', 'num_states',
    'hidden', 'dropout', 'ratio_m_CR', 'ratio_r_CR',
])


def get_one_line_to_report(state_model, model_name):
    """Train `state_model` and return one report row as a list.

    The row holds `model_name`, the model's `num_signals`, `num_references`,
    `num_states`, `hidden` and `dropout` attributes, and the two ratios returned
    by `learn_and_reconstruct_input`. The model's predicted state assignment is
    written to '<output_folder>/<model_name>_pos.txt.gz'.
    """
    results = [model_name, state_model.num_signals, state_model.num_references,
               state_model.num_states, state_model.hidden, state_model.dropout]
    # `args` is not defined in this notebook; use the `output_folder` variable.
    posterior_fn = os.path.join(output_folder, '{}_pos.txt.gz'.format(model_name))
    ratio_m_CR, ratio_r_CR = learn_and_reconstruct_input(state_model, m, r, p, posterior_fn)
    results += [ratio_m_CR, ratio_r_CR]
    return results


result_df.loc[0] = get_one_line_to_report(m_SigOnly, 'SigOnly')
result_df.loc[1] = get_one_line_to_report(m_SigBeta, 'SigBeta')
result_df.loc[2] = get_one_line_to_report(m_SigRef, 'SigRef')
result_df.loc[3] = get_one_line_to_report(m_SigRefBeta, 'SigRefBeta')

# Save the summary table as a tab-separated file next to the other outputs.
report_fn = os.path.join(output_folder, 'report_ratio_CR.txt')
result_df.to_csv(report_fn, header=True, index=False, sep='\t')

In [None]:
# Files holding the posterior probabilities of state assignments for each model,
# plus the ground-truth file written by the data generator.
# The posterior files are named '<model_name>_pos.txt.gz', where <model_name> is
# the label passed to get_one_line_to_report ('SigOnly', 'SigBeta', 'SigRef',
# 'SigRefBeta'), so the paths below must use exactly those labels.
# NOTE(review): the SigRef/SigRefBeta paths previously pointed at '*_c_pos.txt.gz',
# which nothing in this notebook writes - confirm no separate '_c' run is expected.
truth_fn = os.path.join(output_folder, 'collapsed_theta.txt.gz')
sigOnly_fn = os.path.join(output_folder, 'SigOnly_pos.txt.gz')
sigBeta_fn = os.path.join(output_folder, 'SigBeta_pos.txt.gz')
sigRefBeta_fn = os.path.join(output_folder, 'SigRefBeta_pos.txt.gz')
sigRef_fn = os.path.join(output_folder, 'SigRef_pos.txt.gz')

In [None]:
def read_state_df(fn, model_name):
    """Read a tab-separated state-assignment table and tag its columns.

    Parameters
    ----------
    fn : path or file-like object accepted by `pd.read_csv`; the first line
        is used as the header.
    model_name : label prefixed to every column name as '<model_name>|<column>'.

    Returns
    -------
    DataFrame with the file's columns plus 'max_prob', the row-wise maximum
    over every column except the last one (presumably the last column is the
    assigned state rather than a probability - confirm against the writer).
    """
    df = pd.read_csv(fn, header=0, index_col=None, sep='\t')
    # Vectorised row-wise maximum over all but the last column; replaces a
    # per-row Python callback.
    df['max_prob'] = df.iloc[:, :-1].max(axis=1)
    df.columns = ['{}|{}'.format(model_name, col) for col in df.columns]
    return df
# Load the ground truth and each model's table, each tagged with its own label.
truth_df, sigOnly_df, sigBeta_df, sigRefBeta_df, sigRef_df = (
    read_state_df(table_fn, table_label)
    for table_fn, table_label in (
        (truth_fn, 'truth'),
        (sigOnly_fn, 'sigOnly'),
        (sigBeta_fn, 'sigBeta'),
        (sigRefBeta_fn, 'sigRefBetaC'),
        (sigRef_fn, 'sigRefC'),
    )
)

# Put the five tables side by side (column-wise concatenation).
all_df = pd.concat(
    [truth_df, sigOnly_df, sigBeta_df, sigRefBeta_df, sigRef_df],
    axis=1,
)

In [None]:
# Preview the first 30 rows of the combined table.
all_df.iloc[:30]

Plots of the distribution of the maximum probabilities of state assignments across all genomic positions

In [None]:
# Histogram (with a KDE curve) of each model's max_prob column, one panel per model.
model_name_list = ['truth', 'sigRefBetaC', 'sigRefC', 'sigOnly', 'sigBeta']
plot_nrow, plot_ncol = 1, 3
fig, axes = plt.subplots(nrows=plot_nrow, ncols=plot_ncol, figsize=(9, 6))

# Only the first three names in model_name_list are drawn.
for ax, model_name in zip(axes.flat, model_name_list[:3]):
    max_prob_col = '{}|max_prob'.format(model_name)
    sns.histplot(all_df[max_prob_col], kde=True, ax=ax)
    ax.set_title(model_name)

fig.tight_layout()
plt.show()

Plots of the confusion matrix between the true state assignment (rows) and the predicted state assignment (columns)

In [None]:
# Row-normalised confusion matrix of the true state (rows) against each model's
# predicted state (columns), one heatmap per model.
model_name_list = ['truth', 'sigOnly', 'sigBeta', 'sigRefBetaC', 'sigRefC']

# Keep only the columns whose name contains 'max_state'.
state_df = all_df.filter(regex='max_state', axis=1)

plot_nrow = 2
plot_ncol = 2
fig, axes = plt.subplots(ncols=plot_ncol, nrows=plot_nrow, figsize=(9, 9))

# Skip the first entry ('truth'): it is the reference every model is compared to.
for model_index, model_name in enumerate(model_name_list[1:]):
    colnames = ['truth|max_state', '{}|max_state'.format(model_name)]
    # Count how often each (true state, predicted state) pair occurs.
    pair_counts = (
        state_df[colnames]
        .groupby(colnames)
        .size()
        .to_frame(name='size')
        .reset_index()
    )
    # DataFrame.pivot takes keyword-only arguments on pandas >= 2.0.
    confusion = pair_counts.pivot(index=colnames[0], columns=colnames[1], values='size')
    confusion = confusion.div(confusion.sum(axis=1), axis=0)  # row normalize
    ax = axes.flat[model_index]
    sns.heatmap(confusion, cbar=True, linewidths=2, vmax=1, vmin=0,
                square=True, cmap='Blues', ax=ax).set_title(model_name)

fig.tight_layout()
plt.show()