In [1]:
import pyro
import pyro.distributions as dist
import torch
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pyro
import pyro.distributions as dist
import torch.nn.functional as F
from pyro.infer import SVI, TraceMeanField_ELBO
from tqdm import trange
import math

## Class to generate toy data

In [24]:
'''
M: # regions
N: # bins per region
L: # signals (marks)
alpha: params of dirichlet prior over reference epigenomics
beta: ref --> sample state categorical distribution
p: state --> signal bernoulli distribution 
r: reference state at each bin. one-hot encoding, matrix size : #bins * #ref * #states
theta: the mixture probabilities of reference ethetagenome
'''

class CircularStateGenerator:
    # Within the number of references, there is a group of references that will be similar to the 
    # sample of interests in terms of state assignments
    def __init__(self,  
                 num_bins=5, 
                 num_references=10, 
                 num_groups=3,
                 state_vary_rate=0.01, 
                 # fraction of the genome where the state assignments among references of the same group are diff
                 num_signals=3,
                 num_states=5,
                 high_w=100):
        self.num_bins = num_bins
        self.num_references = num_references
        self.num_groups = num_groups
        self.state_vary_rate = state_vary_rate
        self.num_signals = num_signals
        self.num_states = num_states
        self.high_w = high_w
        self.sample = None
        self.params = self.set_params()
    
        
    # parameter of state->signal distributions
    # shape is (num_states, num_signals)
    def generate_param_p(self):
        p = torch.zeros((self.num_states, self.num_signals))
        for i in range(self.num_states):
            w = -self.high_w * torch.ones(self.num_signals)
            w[i % self.num_signals] = self.high_w
            p[i,:] = w
        return p
    
    # generate a state assignment tensor
    # shape is (num_regions, num_bins_per_region, num_references)
    def generate_ref_states(self):
        # this is code for the case where we want varied state patterns from each reference
        # and that there are actually groups of references that are similar to each other
        num_ref_per_groups = np.ceil(self.num_references/self.num_groups).astype(int)
        sample_r = torch.zeros(self.num_states, self.num_groups)
        for i in range(self.num_groups):
            sample_r[:,i] = torch.arange(self.num_states).roll(i)
            # each group has a circular permutation of states that are characteristics to that group
        sample_r = sample_r.repeat(np.ceil(self.num_bins / self.num_states).astype(int), 1)
        # now r is just a repeated sequence of sample_r
        r = torch.zeros(sample_r.shape[0], self.num_references)
        for i in range(self.num_references):
            r[:,i] = sample_r[:, i % self.num_groups]
        # now we will start to introduce some random changes to the state assignments among references from
        # the same groups
        num_change = int(self.state_vary_rate * self.num_bins)
        for i in range(self.num_states, self.num_references): 
            # for the first num_states columns, keep all the state assignments
            # if num_references < num_states, this loop will not be called
            org_r = r[:,i]
            indices_to_change = np.random.choice(self.num_bins, num_change)
            indices_to_change = torch.tensor(indices_to_change).type(torch.LongTensor)
            states_to_change = torch.tensor(np.random.choice(self.num_states, num_change)).float()
            r[indices_to_change,i] = states_to_change
        r = r[:self.num_bins,:self.num_references]
        return r.long() # num_bins, num_references --> values: state-0-based 
    
    # set parameters of the data generator
    def set_params(self):
        # parameters of the dirichlet over references
        # same one for every region
        # very high probability that generated sample looks like
        # reference 0
        # shape is (num_references,)
        alpha = torch.ones(self.num_references)
        num_ref_per_groups = np.ceil(self.num_references/self.num_groups).astype(int)
        for i in range(self.num_references):
            if i % self.num_groups == 0:
                alpha[i] = self.high_w # all refs in group 1 will be more similar to sample of interest
        
        # parameters of bernoulli distribution for each signal
        # for each state
        # shape is (num_states, num_signals)
        p = self.generate_param_p()
        
        # an indicator matrix along genome of the state for 
        # each refenrece
        # shape is (num_regions, num_bins_per_region, num_states, num_references)
        ref_states_indicator = F.one_hot(self.generate_ref_states(), self.num_states)
        params = {
            'alpha': alpha,
            'p': p,
            'ref_states_indicator': ref_states_indicator
        }
        self.params = params
        return params
        
    # collapse a prob vector over references to a prob vector over states
    # takes the cross product of prob vector theta and reference state indicator matrix r
    # shapes:
    #  theta: (None, num_references)
    #  r: (None, num_references, num_states)
    #  out: (None, num_states)
    def collapse_theta(self, theta, r=None):
        if r is None:
            assert self.params is not None
            r = self.params['ref_states_indicator']
            
        r = r.float()
        collapsed_theta = torch.zeros(theta.shape[0], r.shape[2])
        for i in range(theta.shape[0]):
            collapsed_theta[i,:] = torch.matmul(r[i,:,:].T, theta[i,:])
        return collapsed_theta
    
    def generate_sample(self):
        if self.params is None:
            self.set_params()
            
        r = self.params['ref_states_indicator']
                
        # generate reference distribution for each region
        with pyro.plate('bins', self.num_bins):
            # theta is shape (num_regions, num_references)
            theta = pyro.sample('theta', dist.Dirichlet(self.params['alpha']))
            # collapse the reference distribution for each bin to a 
            # state distribution 
            collapsed_theta = self.collapse_theta(theta, r)

            signal_params = torch.sigmoid(torch.matmul(collapsed_theta, self.params['p']))
            m = pyro.sample('m', dist.Bernoulli(signal_params).to_event(1))

        result = {
            'theta': theta,
            'm': m
        }
        self.sample = result
        return self.sample
    

    def get_sampled_collapsed_theta(self):
        if self.sample is None:
            self.generate_sample()
        theta = self.sample['theta']
        return self.collapse_theta(theta)
    
    def get_sampled_signals(self):
        if self.sample is None:
            self.generate_sample()
        return self.sample['m']
    
    def get_sampled_theta(self):
        if self.sample is None:
            self.generate_sample()
        return self.sample['theta']
    
    def get_signal_parms(self):
        collapsed_theta = self.get_sampled_collapsed_theta()
        return torch.sigmoid(torch.matmul(collapsed_theta, self.params['p']))
    
    def get_ref_state_indicators(self):
        if self.params is None:
            self.set_params()
        return self.params['ref_states_indicator']
    

In [28]:
serious_parms = {
    'num_bins': 10000,
    'num_references': 10,
    'num_groups': 3,
    'state_vary_rate': 0.003,
    'num_signals': 3,
    'num_states': 3,
    'high_w': 100
}

seed = 0
torch.manual_seed(seed)
pyro.set_rng_seed(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

generator = CircularStateGenerator(**serious_parms)

m = generator.get_sampled_signals()
r = generator.get_ref_state_indicators()
collapsed_theta = generator.get_sampled_collapsed_theta()
theta = generator.get_sampled_theta()
signal_params = generator.get_signal_parms()

In [29]:
print(serious_parms)
print('m: obs. signals at each position')
print(m.shape)
print(m)
print('r: reference epigenome state indicator at each position')
print(r.shape)
print('collapsed_theta: state assignment at each position')
print(collapsed_theta.shape)
print(collapsed_theta)
print('theta: the reference mixture at each position')
print(theta.shape)
print(theta)
print('signal_params: bernoulli dist. params generating signal at each position')
print(signal_params.shape)
print('p')
p = generator.params['p']
print (p)
print (p.shape)

{'num_bins': 10000, 'num_references': 10, 'num_groups': 3, 'state_vary_rate': 0.003, 'num_signals': 3, 'num_states': 3, 'high_w': 100}
m: obs. signals at each position
torch.Size([10000, 3])
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        ...,
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])
r: reference epigenome state indicator at each position
torch.Size([10000, 10, 3])
collapsed_theta: state assignment at each position
torch.Size([10000, 3])
tensor([[0.9912, 0.0020, 0.0068],
        [0.0109, 0.9862, 0.0029],
        [0.0024, 0.0276, 0.9700],
        ...,
        [0.0027, 0.9882, 0.0091],
        [0.0047, 0.0035, 0.9917],
        [0.9931, 0.0049, 0.0019]])
theta: the reference mixture at each position
torch.Size([10000, 10])
tensor([[2.8926e-01, 1.1351e-03, 2.8802e-04,  ..., 5.0673e-03, 7.5635e-04,
         2.5826e-01],
        [2.4719e-01, 6.4616e-03, 1.9981e-03,  ..., 2.9050e-03, 3.1032e-04,
         2.2649e-01],
        [2.3822e-01, 2.2

In [30]:
# how many positions that emit each of the mark
torch.sum(m, dim = 0)

tensor([3334., 3333., 3333.])

In [31]:
# save the data of collapsed theta (--> state probabilities at each position)
collapsed_theta = pd.DataFrame(collapsed_theta.numpy())
collapsed_theta['max_state'] = collapsed_theta.idxmax(axis = 1)

In [32]:
print(collapsed_theta.head())

          0         1         2  max_state
0  0.991196  0.002015  0.006789          0
1  0.010920  0.986210  0.002871          1
2  0.002429  0.027560  0.970011          2
3  0.984627  0.006708  0.008665          0
4  0.008432  0.983749  0.007819          1


In [33]:
class Encoder(nn.Module):
	def __init__(self, num_signals, num_states, num_references, hidden, dropout):
		super().__init__()
		self.drop = nn.Dropout(dropout)
		input_dim = num_signals + num_states * num_references
		self.fc1 = nn.Linear(num_signals, hidden)
		self.fc2 = nn.Linear(hidden, hidden)
		self.fcmu = nn.Linear(hidden, num_states)
		self.fclv = nn.Linear(hidden, num_states)

	def forward(self, m):
		inputs = m
		h = F.softplus(self.fc1(inputs))
		h = F.softplus(self.fc2(h))
		h = self.drop(h)
		logpi_loc = F.softplus(self.fcmu(h))
		logpi_logvar = self.fclv(h)
		logpi_loc = self.drop(logpi_loc)
		logpi_scale = (0.5 * logpi_logvar).exp()
		return logpi_loc, logpi_scale

class Decoder(nn.Module):
	def __init__(self, num_states, num_signals, hidden, dropout):
		super().__init__()
		self.drop = nn.Dropout(dropout)
		self.beta = nn.Linear(num_states, num_signals, bias=False)
		self.bn = nn.BatchNorm1d(num_signals, affine=True)

	def forward(self, inputs):
		# takes in the values of collapsed pi: probabilities of state 
		# assignments at each positions, and then apply a linear trans
		# to get the probabilities of observing signals at each position
		# --> vector size #signals
		# used as parameters for bernoulli dist. to get obs. signals
		inputs = self.drop(inputs)
		beta = self.beta(inputs)
		return torch.sigmoid(beta) # to transform to [0,1]
    
class model_Signals(nn.Module):
	def __init__(self, num_signals, num_references, num_states, hidden, dropout):
		super().__init__()
		self.num_signals = num_signals
		self.num_references = num_references
		self.num_states = num_states
		self.hidden = hidden
		self.dropout = dropout
		self.encoder = Encoder(num_signals, num_states, num_references, hidden, dropout)
		self.decoder = Decoder(num_states, num_signals, hidden, dropout)

	# shapes: 
	#  m: (bins x signals) signal matrix
	#  r: (bins x reference x state) indicator matrix
	def model(self, m):
		# flatten out the r indicator matrix
		pyro.module("decoder", self.decoder)
		with pyro.plate('bins', m.shape[0]):
			logCpi_loc = m.new_zeros((m.shape[0], self.num_states))
			logCpi_scale = m.new_ones((m.shape[0], self.num_states))
			logCpi = pyro.sample('log_collapsedPi', dist.Normal(logCpi_loc, logCpi_scale).to_event(1))
			Cpi = F.softmax(logCpi, -1) # softmax is right because Cpi ~ LogNormal
			signal_param = self.decoder(Cpi)          
			pyro.sample('m', dist.Bernoulli(signal_param).to_event(1), obs=m)
	            
	def guide(self, m):
		pyro.module("encoder", self.encoder)
		with pyro.plate('bins', m.shape[0]):
			logpi_loc, logpi_scale = self.encoder(m)
			logpi = pyro.sample('log_collapsedPi', dist.Normal(logpi_loc, logpi_scale).to_event(1))

	def predict_state_assignment(self, m):
		logpi_loc, logpi_scale = self.encoder(m)
		Cpi = F.softmax(logpi_loc, -1)
		return(Cpi)

	def write_predicted_state_assignment(self, m, output_fn):
		Cpi = self.predict_state_assignment(m)
		df = pd.DataFrame(Cpi.detach().numpy())
		df['max_state'] = df.idxmax(axis =1)
		df.to_csv(output_fn, header = True, index = False, sep = '\t', compression = 'gzip')
		return
		
	def beta(self):
		return self.decoder.beta.weight.cpu().detach().T

	def generate_reconstructed_data(self, m):
		'''
		m: num_bins, num_signals
		logpi_loc, logpi_scale: num_bins, num_states
		signal_param: num_bins, num_signals
		'''
		logpi_loc, logpi_scale = self.encoder(m)
		Cpi = F.softmax(logpi_loc, -1)
		signal_param = self.decoder(Cpi) 
		re_m = pyro.sample('reconstructed_m', dist.Bernoulli(signal_param).to_event(1))
		return (re_m)

	def get_percentage_correct_reconstruct(self, m):
		# m and r can be different from the m and r used in training
		re_m = self.generate_reconstructed_data(m)
		total_m_entries = re_m.shape[0] * re_m.shape[1]
		signals_CR = (re_m==m).sum() # correct reconstruct entries of signals
		ratio_m_CR = (signals_CR / total_m_entries).item()
		return ratio_m_CR


In [34]:
batch_size = 200
learning_rate = 1e-3
num_epochs = 1000
pyro.clear_param_store()
state_model = model_Signals(
    num_signals = generator.num_signals,
    num_references = generator.num_references,
    num_states = generator.num_states,
    hidden = 32,
    dropout = 0.2)
state_model.to(device)
optimizer = pyro.optim.Adam({"lr": learning_rate})
svi = SVI(state_model.model, state_model.guide, optimizer, loss=TraceMeanField_ELBO())
num_batches = int(math.ceil(m.shape[0] / batch_size))

bar = trange(num_epochs)
for epoch in bar:
    running_loss = 0.0
    for i in range(num_batches):
        batch_m = m[i * batch_size:(i+1) * batch_size, :]
        batch_r = r[i * batch_size:(i+1) * batch_size, :, :]
        loss = svi.step(batch_m)
        running_loss += loss / batch_m.size(0)
        
    bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))

100%|██████████████████| 1000/1000 [02:38<00:00,  6.30it/s, epoch_loss=9.64e+01]


In [36]:
re_m = state_model.generate_reconstructed_data(m)
print(re_m.shape)
print(m)
print(re_m)
print(((m==re_m).all(dim=1)).sum())
print((m==re_m).sum())

torch.Size([10000, 3])
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        ...,
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])
tensor([[0., 0., 0.],
        [0., 1., 0.],
        [0., 1., 1.],
        ...,
        [0., 0., 0.],
        [1., 0., 1.],
        [0., 0., 0.]])
tensor(1446)
tensor(16433)


Evaluating the state posterior, compared with generated collapsed_theta

In [37]:
Cpi = state_model.predict_state_assignment(m)
df = pd.DataFrame(Cpi.detach().numpy())
df['max_state'] = df.idxmax(axis =1)
df = df.merge(collapsed_theta, left_index = True, right_index = True)
df

Unnamed: 0,0_x,1_x,2_x,max_state_x,0_y,1_y,2_y,max_state_y
0,0.333331,0.333331,0.333339,2,0.991196,0.002015,0.006789,0
1,0.332853,0.332846,0.334301,2,0.010920,0.986210,0.002871,1
2,0.333273,0.333273,0.333454,2,0.002429,0.027560,0.970011,2
3,0.333319,0.333319,0.333363,2,0.984627,0.006708,0.008665,0
4,0.332854,0.332846,0.334300,2,0.008432,0.983749,0.007819,1
...,...,...,...,...,...,...,...,...
9995,0.333261,0.333261,0.333478,2,0.011204,0.013802,0.974994,2
9996,0.333329,0.333329,0.333341,2,0.987448,0.007399,0.005153,0
9997,0.332976,0.332979,0.334045,2,0.002654,0.988218,0.009128,1
9998,0.333271,0.333271,0.333459,2,0.004716,0.003536,0.991748,2


In [38]:
class Encoder(nn.Module):
	def __init__(self, num_signals, num_states, num_references, hidden, dropout):
		super().__init__()
		self.drop = nn.Dropout(dropout)
		input_dim = num_signals + num_states * num_references
		self.fc1 = nn.Linear(num_signals, hidden)
		self.fc2 = nn.Linear(hidden, hidden)
		self.fcmu = nn.Linear(hidden, num_states)
		self.fclv = nn.Linear(hidden, num_states)

	def forward(self, m):
		inputs = m
		h = F.softplus(self.fc1(inputs))
		h = F.softplus(self.fc2(h))
		h = self.drop(h)
		logpi_loc = F.softplus(self.fcmu(h))
		logpi_logvar = self.fclv(h)
		logpi_loc = self.drop(logpi_loc)
		logpi_scale = (0.5 * logpi_logvar).exp()
		return logpi_loc, logpi_scale

class Decoder(nn.Module):
	def __init__(self, num_states, num_signals, hidden, dropout, fixed_signalP):
		super().__init__()
		self.drop = nn.Dropout(dropout)
		self.beta = nn.Linear(num_states, num_signals, bias=False)
		self.bn = nn.BatchNorm1d(num_signals, affine=True)
		self.fixed_signalP = fixed_signalP

	def forward(self, inputs):
		# takes in the values of collapsed pi (inputs): probabilities of state 
		# assignments at each positions, and then multiply by fixed_signalP (beta)
		# to get the probabilities of observing signals at each position
		# --> vector size #signals
		# used as parameters for bernoulli dist. to get obs. signals
		# fixed_signalP: #states, #marks, from the random number generator
		signal_param = torch.matmul(inputs, self.fixed_signalP)
		return torch.sigmoid(signal_param) # to transform to [0,1]
    
class model_signals_only_fixedBeta(nn.Module):
	def __init__(self, num_signals, num_references, num_states, hidden, dropout, fixed_signalP):
		super().__init__()
		self.num_signals = num_signals
		self.num_references = num_references
		self.num_states = num_states
		self.fixed_signalP = fixed_signalP
		self.hidden = hidden
		self.dropout = dropout
		self.encoder = Encoder(num_signals, num_states, num_references, hidden, dropout)
		self.decoder = Decoder(num_states, num_signals, hidden, dropout, fixed_signalP)

	# shapes: 
	#  m: (bins x signals) signal matrix
	#  r: (bins x reference x state) indicator matrix
	def model(self, m):
		# flatten out the r indicator matrix
		pyro.module("decoder", self.decoder)
		with pyro.plate('bins', m.shape[0]):
			logCpi_loc = m.new_zeros((m.shape[0], self.num_states))
			logCpi_scale = m.new_ones((m.shape[0], self.num_states))
			logCpi = pyro.sample('log_collapsedPi', dist.Normal(logCpi_loc, logCpi_scale).to_event(1))
			Cpi = logCpi.exp()			
			Cpi = F.softmax(logCpi, -1)
			signal_param = self.decoder(Cpi)          
			pyro.sample('m', dist.Bernoulli(signal_param).to_event(1), obs=m)
	            
	def guide(self, m):
		pyro.module("encoder", self.encoder)
		with pyro.plate('bins', m.shape[0]):
			logpi_loc, logpi_scale = self.encoder(m)
			logpi = pyro.sample('log_collapsedPi', dist.Normal(logpi_loc, logpi_scale).to_event(1))

	def predict_state_assignment(self, m):
		logpi_loc, logpi_scale = self.encoder(m)
		Cpi = F.softmax(logpi_loc, -1)
		return(Cpi)

	def write_predicted_state_assignment(self, m, output_fn):
		Cpi = self.predict_state_assignment(m)
		df = pd.DataFrame(Cpi.detach().numpy())
		df['max_state'] = df.idxmax(axis =1)
		df.to_csv(output_fn, header = True, index = False, sep = '\t', compression = 'gzip')
		return

	def generate_reconstructed_data(self, m):
		'''
		m: num_bins, num_signals
		logpi_loc, logpi_scale: num_bins, num_states
		signal_param: num_bins, num_signals
		'''
		logpi_loc, logpi_scale = self.encoder(m)
		Cpi = F.softmax(logpi_loc, -1)
		signal_param = self.decoder(Cpi) 
		re_m = pyro.sample('reconstructed_m', dist.Bernoulli(signal_param).to_event(1))
		return (re_m)

	def get_percentage_correct_reconstruct(self, m):
		# m can be different from the m used in training
		re_m = self.generate_reconstructed_data(m)
		total_m_entries = re_m.shape[0] * re_m.shape[1]
		signals_CR = (re_m==m).sum() # correct reconstruct entries of signals
		ratio_m_CR = (signals_CR / total_m_entries).item()
		return ratio_m_CR

In [39]:
batch_size = 200
learning_rate = 1e-3
num_epochs = 1000
pyro.clear_param_store()
m_SigBeta = model_signals_only_fixedBeta(
    num_signals = generator.num_signals,
    num_references = generator.num_references,
    num_states = generator.num_states,
    hidden = 32,
    dropout = 0.2,
    fixed_signalP = p)
state_model.to(device)
optimizer = pyro.optim.Adam({"lr": learning_rate})
svi = SVI(m_SigBeta.model, m_SigBeta.guide, optimizer, loss=TraceMeanField_ELBO())
num_batches = int(math.ceil(m.shape[0] / batch_size))

bar = trange(num_epochs)
for epoch in bar:
    running_loss = 0.0
    for i in range(num_batches):
        batch_m = m[i * batch_size:(i+1) * batch_size, :]
        batch_r = r[i * batch_size:(i+1) * batch_size, :, :]
        loss = svi.step(batch_m) # p from the generated data
        running_loss += loss / batch_m.size(0)
        
    bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))

100%|██████████████████| 1000/1000 [04:24<00:00,  3.79it/s, epoch_loss=2.92e+02]


In [49]:
# new_m = torch.tensor([[1,0,0], [0,1,0], [0,0,1]]).float()
# logpi_loc, logpi_scale = state_model.encoder(new_m)
# print(logpi_loc.shape)
# print(logpi_loc)
# print(F.softmax(logpi_loc, 1))
# x = logpi_loc[0,:] * 1.0e5
# print(logpi_loc[0,:])
# print(x)
# print(F.softmax(x, 0))
re_m = m_SigBeta.generate_reconstructed_data(m)
print(re_m.shape)
print(m)
print(re_m)
print(((m==re_m).all(dim=1)).sum())
print((m==re_m).sum())

torch.Size([10000, 30])
tensor([[1., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 1.],
        [0., 1., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 1., 0.]])
tensor([[1., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 1.],
        [0., 1., 0.,  ..., 0., 1., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 1., 0.]])
tensor(8111)
tensor(284327)


In [50]:
Cpi = m_SigBeta.predict_state_assignment(m)
t = pd.DataFrame(Cpi.detach().numpy())
t['max_state'] = t.idxmax(axis =1)
t = t.merge(collapsed_theta, left_index = True, right_index = True)
t

Unnamed: 0,0_x,1_x,2_x,max_state_x,0_y,1_y,2_y,max_state_y
0,0.899916,0.050063,0.050021,0,0.834900,0.131686,0.033414,0
1,0.046013,0.046014,0.907973,2,0.077855,0.347474,0.574671,2
2,0.037355,0.925289,0.037355,1,0.072862,0.806724,0.120414,1
3,0.048280,0.048280,0.903441,2,0.215188,0.214566,0.570246,2
4,0.037980,0.924041,0.037980,1,0.219506,0.609871,0.170623,1
...,...,...,...,...,...,...,...,...
9995,0.879433,0.060287,0.060280,0,0.695981,0.124986,0.179033,0
9996,0.053285,0.053285,0.893430,2,0.160222,0.126768,0.713009,2
9997,0.333333,0.333333,0.333333,0,0.116979,0.704853,0.178168,1
9998,0.333334,0.333333,0.333334,0,0.425955,0.253359,0.320686,0


In [52]:
(t['max_state_x'] == t['max_state_y']).sum()

6960

In [None]:
folder='/Users/vuh6/Desktop/mount_ros/source/hector_transferChromState/simulation_model_design/experiments/strict_genData'
import glob
import pandas as pd
fn_list = glob.glob(folder+ '/*.txt')
df_list = list(map(lambda x: pd.read_csv(x, header = 0, index_col = None, sep = '\t'), fn_list))
df = pd.concat(df_list)

In [None]:
t1 = df.groupby(['model', 'num_states'])['ratio_m_CR'].mean().reset_index()
import seaborn as sns
sns.lineplot(data = t1, x = 'num_states', y = 'ratio_m_CR', hue = 'model')

In [None]:
t2 = df.groupby(['model', 'num_references'])['ratio_m_CR'].mean().reset_index()
sns.lineplot(data = t2, x = 'num_references', y = 'ratio_m_CR', hue = 'model')

In [None]:
df