In [None]:
# let's make negative binomial (NB) distributions of expression

# testing ideas
from scipy.stats import nbinom
import matplotlib.pyplot as plt
import numpy as np
import random

import anndata as ad
import pandas as pd
from scipy.sparse import coo_matrix, csc_matrix

In [None]:
# how many samples (patients) and genes to generate
patients_to_generate = 10000
genes_to_generate = 10000

# balance between batches: probability that a sample is assigned to the second batch (label 1)
# set to 0 (everything in the first batch) or 1 (everything in the second batch) if you want one batch
batch_balance = 0

# seed for every random draw in this notebook, so Restart & Run All reproduces the same data
# (NumPy's global generator is also what scipy.stats draws from by default);
# set to None to get a fresh, non-reproducible dataset on every run
random_seed = 0
np.random.seed(random_seed)
random.seed(random_seed)

# other parameters
# whether or not to draw distributions when generating simulated genes
# note that there's no limitation to this; if you're creating 10000 genes it'll draw 10000 plots so be mindful
draw_NB_data = False

# Create gene-gene and/or sample-sample edges by duplicating rows/columns (with a small amount of added variation)
copy_neighbors = False

In [None]:
# Overwrites the indicated column with a copy of a neighbouring column, then adds +1 to a random subset of its entries.
# This creates gene-gene/sample-sample edges (actual correlations occurring in the simulated data are extremely unlikely).

def copy_and_randomly_increment_neighbors(arr, col_index, samples=False):
    '''
    Create a gene/gene or sample/sample edge by overwriting one column with a copy of its
    neighbour (with a random +1 per entry to give variability).
    Edges don't naturally occur because the distributions are completely random, so we need this function.

    arr - 2-D array of expression counts (NumPy array, or anything np.asarray can turn into one).
          A NumPy array is modified IN PLACE (np.asarray returns the same object and .T is a view);
          other array-likes are copied first, so always use the returned array.
    col_index - which column is overwritten; floats are rounded to the nearest integer
    samples - if True the array is transposed first, so a row of `arr` is overwritten instead of a column

    Returns the modified array in the same orientation as `arr`.
    Raises ValueError if `arr` is not 2-D or has fewer than two candidate columns,
    and IndexError if `col_index` is out of range.
    '''
    data = np.asarray(arr)
    if data.ndim != 2:
        raise ValueError("arr must be 2-dimensional, got %d dimension(s)" % data.ndim)

    # flip array if we want to duplicate along the other axis
    if samples:
        data = data.T

    nrows, ncols = data.shape
    if ncols < 2:
        raise ValueError("need at least two columns to copy from a neighbor")

    col_index = int(np.round(col_index, 0))
    # reject negative indices explicitly: they would otherwise wrap around and pick a wrong "neighbor"
    if not 0 <= col_index < ncols:
        raise IndexError("col_index %d is out of range for %d columns" % (col_index, ncols))

    # Determine which neighbor column to copy based on the column index
    if col_index == 0:
        # first column: only a right-hand neighbor exists
        neighbor_index = 1
    elif col_index == ncols - 1:
        # last column: only a left-hand neighbor exists
        neighbor_index = col_index - 1
    else:
        # middle columns: pick left or right at random
        neighbor_index = col_index - 1 if random.choice([True, False]) else col_index + 1

    # Copy the neighbor column into the target column
    data[:, col_index] = data[:, neighbor_index]

    # Randomly decide, independently for each row, whether to increment the copied value by 1
    # (explicit bool dtype keeps the mask valid even when there are zero rows)
    bump = np.array([random.choice([True, False]) for _ in range(nrows)], dtype=bool)
    data[bump, col_index] += 1

    return data.T if samples else data

In [None]:
# In this version of the code, we create a distribution to "pull" our NB Mu parameters from
# (we discussed log-normal).
# NOTE: these two values are passed as `mean` to np.random.lognormal, i.e. they are the mean of the
# underlying normal distribution (log scale), so the pulled Mus have medians of exp(30) and exp(40).
mu_pull_dist_1 = 3 * 10
mu_pull_dist_2 = 4 * 10

# tag used in the output file names
mus_use = str(mu_pull_dist_1) + "_" + str(mu_pull_dist_2)

sigma1 = 0.6  # standard deviation of the logarithm of the batch 1 distribution
sigma2 = 0.2  # standard deviation of the logarithm of the batch 2 distribution

# Generate a pool of candidate Mus from each log-normal distribution
sample_size = patients_to_generate*100 # multiplication ensures a varied number of possible values
samples1 = np.random.lognormal(mean=mu_pull_dist_1, sigma=sigma1, size=sample_size)
samples2 = np.random.lognormal(mean=mu_pull_dist_2, sigma=sigma2, size=sample_size)

samples1_rounded = np.round(samples1, 6)
samples2_rounded = np.round(samples2, 6)

# smallest candidate Mu for batch 1 (ndarray.min avoids a slow Python-level scan of the whole pool)
print(samples1_rounded.min())

# Plotting the samples
# raw f-strings: '\m' is not a valid escape sequence in a normal string literal
plt.hist(samples1_rounded, bins=100, density=True, alpha=0.5, color='blue', label=rf'Log-normal with $\mu={mu_pull_dist_1}$')
plt.hist(samples2_rounded, bins=100, density=True, alpha=0.5, color='orange', label=rf'Log-normal with $\mu={mu_pull_dist_2}$')
plt.title('Selection of Mu between "batches"')
plt.xlabel('Mu')
plt.ylabel('Density')
#plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# for now theta is kept simple: batch 1 and batch 2 share the same value
# (this will likely become more complex in the future)
theta_pull_dist_1 = 1  # will try 1, 2, etc
theta_pull_dist_2 = theta_pull_dist_1


In [None]:
# Containers for simulated data (one entry per gene)
train_data, valid_data, gene_names = [], [], []
# per-gene parameters for the AnnData 'var' table; note gene_var1/gene_var2 hold the thetas
gene_mu1, gene_var1, gene_mu2, gene_var2 = [], [], [], []

# draw one Mu per gene from each log-normal pool up front, outside the loop;
# replace=False means the same pool entry is never handed to two genes
sampled_mus_batch1 = np.random.choice(samples1_rounded, size=genes_to_generate, replace=False)
sampled_mus_batch2 = np.random.choice(samples2_rounded, size=genes_to_generate, replace=False)


# One Bernoulli draw per sample decides its batch (0 -> batch 1, 1 -> batch 2).
# This way, each sample uses the same batch for all of its genes, not a partial amount
samples_indices = np.random.binomial(n=1, p=batch_balance, size=patients_to_generate)
in_batch1 = samples_indices == 0

for i in range(genes_to_generate):
    # NB parameters for each batch from their respective Mu/theta.
    # With var = mu + mu**2 / theta, the size parameter n = mu**2 / (var - mu) is exactly theta,
    # so theta is used directly (the round trip through the variance is 0/0 when mu rounds to 0).
    mu_batch1 = sampled_mus_batch1[i]
    n_batch1 = theta_pull_dist_1
    p_batch1 = n_batch1 / (n_batch1 + mu_batch1)

    mu_batch2 = sampled_mus_batch2[i]
    n_batch2 = theta_pull_dist_2
    p_batch2 = n_batch2 / (n_batch2 + mu_batch2)

    # we then sample M (patients) values from both NBs
    counts_batch1 = nbinom.rvs(n_batch1, p_batch1, size=patients_to_generate)
    counts_batch2 = nbinom.rvs(n_batch2, p_batch2, size=patients_to_generate)

    # and we use the Bernoulli draw to select which NB distribution each sample takes its value from
    selected_data_train = np.where(in_batch1, counts_batch1, counts_batch2)

    # append it to a list of gene expression values
    train_data.append(selected_data_train)

    # repeat with fresh draws (same parameters, same batch assignment) for validation
    counts_batch3 = nbinom.rvs(n_batch1, p_batch1, size=patients_to_generate)
    counts_batch4 = nbinom.rvs(n_batch2, p_batch2, size=patients_to_generate)
    selected_data_val = np.where(in_batch1, counts_batch3, counts_batch4)
    valid_data.append(selected_data_val)

    # Saving Mus/Thetas; to be placed in the AnnData 'var' table
    gene_mu1.append(mu_batch1)
    gene_var1.append(theta_pull_dist_1)
    gene_mu2.append(mu_batch2)
    gene_var2.append(theta_pull_dist_2)

    # Creating a unique gene name
    gene_names.append("Gene_" + str(i + 1))

    # draw the selected expression values if desired
    if draw_NB_data:
        plt.hist(selected_data_train, bins=50, color="blue")
        plt.xlabel("Expression")
        plt.ylabel("Frequency")
        plt.title("Expression for Gene " + str(i + 1))
        plt.show()
        


In [None]:
# number of samples whose batch indicator is 1
print(samples_indices.sum())

In [None]:
# So now we have to save these results as an AnnData table with batch in the obs
# training data

# stack the per-gene arrays into a single matrix, and transpose so rows are samples and columns are genes
all_counts = np.array(train_data)
counts_in_anndata_orientation = np.transpose(all_counts)

# if we want, force gene/gene | sample/sample edges by overwriting an entry with a copy of its neighbor.
# samples=True creates a sample/sample edge, otherwise gene/gene.
# The helper must be given the (samples x genes) matrix that actually goes into AnnData - not the
# `train_data` list - and its result must be kept, otherwise the edges never reach the saved data.
if copy_neighbors:
    for divisor in (10, 9, 8):
        counts_in_anndata_orientation = copy_and_randomly_increment_neighbors(
            counts_in_anndata_orientation, patients_to_generate / divisor, samples=True)
    for divisor in (7, 6, 5):
        counts_in_anndata_orientation = copy_and_randomly_increment_neighbors(
            counts_in_anndata_orientation, genes_to_generate / divisor, samples=False)

# NOTE(review): recent anndata releases deprecate the `dtype` argument; if the installed version
# warns or rejects it, cast X with .astype(np.int64) instead - confirm against the installed version.
adata = ad.AnnData(X=counts_in_anndata_orientation, var=pd.DataFrame(index=gene_names), dtype=np.int64)

# batch labels: the 0/1 indicator drawn for each sample during simulation
#batch_labels = ['1'] * size_batch1 + ['2'] * size_batch2  # Adjust according to your data structure
adata.obs['batch'] = samples_indices

# add gene names and the per-gene simulation parameters to var
adata.var['gene'] = gene_names
adata.var['mu_batch1'] = gene_mu1
adata.var['theta_batch1'] = gene_var1
adata.var['mu_batch2'] = gene_mu2
adata.var['theta_batch2'] = gene_var2


# I still need to generate the gene_gene and sample_sample adjacency matrix!
# gene/gene
#df = pd.DataFrame(adata.X, columns=adata.var_names)
#df_sqrt = df.applymap(lambda x: x**0.5 if x >= 0 else x) # shouldn't ever be negative

# we no longer create a gene/gene matrix for the simple data
#gene_correlation_matrix = df_sqrt.corr()
#gene_gene_adj = (gene_correlation_matrix.abs() > 0.6).astype(int)
#np.fill_diagonal(gene_gene_adj.values, 1) # diagonal has been 1s in our other data

# sample/sample
#df_transposed = pd.DataFrame(adata.X.T, columns=adata.obs_names, index=adata.var_names)
#df_transposed_sqrt = df_transposed.applymap(lambda x: x**0.5 if x >= 0 else x) # shouldn't ever be negative
# Compute the Pearson correlation matrix between samples
#correlation_matrix_samples = df_transposed_sqrt.corr()

#sample_sample_adj = (correlation_matrix_samples.abs() > 0.6).astype(int)

# Keep self-loops: set the diagonal to 1 (each sample is adjacent to itself)
#np.fill_diagonal(sample_sample_adj.values, 1)

# Count the number of 1s in the adjacency matrix
#num_ones_samples = np.sum(sample_sample_adj.values)

# and the adjacency tables need to be sparse (note: csc_matrix below builds CSC, not 'coo', matrices)
#sample_sample_adj_coo = csc_matrix(sample_sample_adj)
#adata.obsm['sample_sample_adj'] = sample_sample_adj_coo

# gene_gene_adj_coo = csc_matrix(gene_gene_adj)
# adata.varm['gene_gene_adj'] = gene_gene_adj_coo


# and let's save the data; the file name encodes the simulation settings
file_path = (
    f'/path/to/output/simulated_simple_{patients_to_generate}_{genes_to_generate}'
    f'_theta_{theta_pull_dist_1}_muselect{mus_use}_batch_balance_{batch_balance}'
    '.test1.tau_1.h5ad'
)

# Write the training AnnData object to disk
adata.write(file_path)


In [None]:
# same procedure for the validation data
# stack the per-gene arrays, then flip so that rows are samples and columns are genes
all_counts = np.array(valid_data)
counts_in_anndata_orientation = all_counts.T

adata_val = ad.AnnData(
    X=counts_in_anndata_orientation,
    var=pd.DataFrame(index=gene_names),
    dtype=np.int64,
)

# batch labels: the same 0/1 indicator per sample that was used for the training data
#batch_labels = ['1'] * size_batch1 + ['2'] * size_batch2  # Adjust according to your data structure
adata_val.obs['batch'] = samples_indices

# gene names and per-gene simulation parameters go into var
adata_val.var['gene'] = gene_names
adata_val.var['mu_batch1'] = gene_mu1
adata_val.var['theta_batch1'] = gene_var1
adata_val.var['mu_batch2'] = gene_mu2
adata_val.var['theta_batch2'] = gene_var2

# output location; the file name encodes the simulation settings
file_path = (
    f'/path/to/output/simulated_simple_{patients_to_generate}_{genes_to_generate}'
    f'_theta_{theta_pull_dist_1}_muselect{mus_use}_batch_balance_{batch_balance}'
    '.test2.tau_1.h5ad'
)

# Write the validation AnnData object to disk
adata_val.write(file_path)


In [None]:
# draw library size distributions (log of the total counts per sample) for the validation data
# accumulate the row sums in float64: with very large Mus the per-sample total over all genes
# can exceed the int64 range and silently wrap around
library_sizes = np.log(adata_val.X.sum(axis=1, dtype=np.float64))

# plain boolean arrays for masking (obs['batch'] is a pandas Series)
batches = adata_val.obs['batch'].to_numpy()
library_sizes_batch1 = library_sizes[batches == 0]
library_sizes_batch2 = library_sizes[batches == 1]

# Plotting the samples
#plt.hist(library_sizes, bins=50, density=True, alpha=0.5, color='blue', label=f'Library Size [all samples]')
plt.figure(figsize=(8, 6))
# skip empty batches (e.g. batch_balance of 0 or 1): a density histogram of no data only yields NaN warnings
if library_sizes_batch1.size > 0:
    plt.hist(library_sizes_batch1, bins=50, density=True, alpha=0.5, color='blue', label='Library Size [Batch 1]')
if library_sizes_batch2.size > 0:
    plt.hist(library_sizes_batch2, bins=50, density=True, alpha=0.5, color='orange', label='Library Size [Batch 2]')
plt.title('Library Size of Simulated Samples Between Each Batch')
plt.xlabel('Log Library Size (Log of Total Counts)')
plt.ylabel('Density')
# plt.yscale('log')  # Use a log scale for the y-axis
#plt.xscale('log')
plt.legend()
plt.show()

# when both batches are present you can clearly see the difference caused by the two Mus