In [225]:
import pandas as pd
import numpy as np
import uuid
from scipy import special as spsp
import itertools
import random
import pickle

# simulation settings read by the cells below
n_pcs = 2  # number of 'PC <k>' loading columns to simulate
n_t = 5  # time points

# cells
n_cells = 50  # unique cells
n_ctypes = 3  # unique cell types
n_cc = 500  # unique cell pairs
n_ct = 2  # unique cell type pairs

# proteins
n_proteins = 200  # unique proteins
n_lrtypes = 60  # unique protein types
n_lr = 3000  # unique protein pairs
n_lrt = 3  # unique protein type pairs

In [220]:
# cell-cell loadings-----------------------------------------------------------------------------------

# NOTE(review): neither `random` nor `np.random` is seeded here and the type labels come
# from uuid4, so every run of this cell produces a different simulation.


def _simulate_pair_loadings(n_members, n_pairs, n_types, n_type_pairs,
                            member_name, pair_col, type_col, mean_choices, n_components):
    """Simulate per-PC loadings for member pairs, grouped by a random type pair.

    Parameters
    ----------
    n_members : int
        Number of unique members (cells or proteins); pairs are built from range(n_members).
    n_pairs : int
        Number of unique member pairs to keep (first n_pairs combinations).
    n_types : int
        Number of unique member types; each one is labelled with a fresh uuid4 string.
    n_type_pairs : int
        Number of unique type pairs to keep (first n_type_pairs combinations).
    member_name : str
        Singular member name ('cell', 'protein'), used only in error messages.
    pair_col, type_col : str
        Names of the output columns holding the pair label and the type-pair tuple.
    mean_choices : list
        Candidate mean loadings; one is drawn per (PC, type pair). Must not contain 0,
        because the standard deviation is 20% of |mean|.
    n_components : int
        Number of 'PC <k>' columns to generate.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame, sorted by type_col with a fresh 0..n-1 index and one 'PC <k>' column
        per component, and the list of type-pair tuples rows were sampled from.

    Raises
    ------
    ValueError
        If more pairs (or type pairs) are requested than exist, or if pairs are
        requested but there is no type pair to assign to them.
    """
    if spsp.binom(n_members, 2) < n_pairs:
        raise ValueError('Must increase total number of {0}s or decrease number of unique {0} pairs'
                         .format(member_name))
    if spsp.binom(n_types, 2) < n_type_pairs:
        raise ValueError('Must increase total number of {0} types or decrease number of unique {0} type pairs'
                         .format(member_name))

    # islice keeps only the first n_pairs combinations without materialising all of them
    first_pairs = itertools.islice(itertools.combinations(range(n_members), 2), n_pairs)
    pair_labels = ['-'.join(str(member) for member in pair) for pair in first_pairs]

    type_labels = [str(uuid.uuid4()) for _ in range(n_types)]
    type_pairs = list(itertools.islice(itertools.combinations(type_labels, 2), n_type_pairs))
    if pair_labels and not type_pairs:
        raise ValueError('Need at least one {0} type pair to assign to the {0} pairs'.format(member_name))

    frame = pd.DataFrame(data = {pair_col: pair_labels})
    frame[type_col] = [random.choice(type_pairs) for _ in pair_labels]
    # sorting makes the rows of every type pair contiguous, which the block-wise fill below relies on
    frame = frame.sort_values(by = type_col).reset_index(drop = True)

    # rows per type pair, keyed in order of first appearance (same order as Series.unique());
    # counted in plain Python so no Series-vs-tuple comparison is needed
    group_sizes = dict()
    for type_pair in frame[type_col].tolist():
        group_sizes[type_pair] = group_sizes.get(type_pair, 0) + 1

    # enforce a separate mean for each type pair - PC combination
    for pc in range(n_components):
        draws = []
        for size in group_sizes.values():
            mean_val = random.choice(mean_choices)
            draws.append(np.random.normal(loc = mean_val, scale = abs(mean_val*0.2), size = size))
        # the blocks line up with the sorted rows because groups are contiguous and in dict order
        frame['PC ' + str(pc + 1)] = np.concatenate(draws) if draws else np.array([], dtype = float)

    return frame, type_pairs


# possible mean values for a type pair: the non-zero integers in [-10, 9]
# (0 is excluded so the standard deviation, 20% of |mean|, is never 0)
# NOTE(review): range(-10, 10) stops at 9, so +10 is never drawn - confirm this is intended
dist = [value for value in range(-10, 10) if value != 0]

# generate cell pairs and their simulated loadings
cc, cell_types = _simulate_pair_loadings(
    n_members = n_cells, n_pairs = n_cc, n_types = n_ctypes, n_type_pairs = n_ct,
    member_name = 'cell', pair_col = 'Cell-Cell Pairs', type_col = 'CellType-CellType Pairs',
    mean_choices = dist, n_components = n_pcs)

# LR loadings-----------------------------------------------------------------------------------

# generate protein pairs and their simulated loadings
# NOTE(review): the duplicated "Pairs" in the column name is kept as-is because it is
# part of the pickled output that other code may key on
lr, protein_types = _simulate_pair_loadings(
    n_members = n_proteins, n_pairs = n_lr, n_types = n_lrtypes, n_type_pairs = n_lrt,
    member_name = 'protein', pair_col = 'Ligand-Receptor Pairs',
    type_col = 'Ligand-Receptor Type Pairs Pairs',
    mean_choices = dist, n_components = n_pcs)

# time loadings-----------------------------------------------------------------------------------

# one linear trend per PC: odd-numbered PCs (PC 1, PC 3, ...) get a positive slope,
# even-numbered PCs a negative one; slope magnitude and intercept are drawn from dist
t = np.arange(n_t)
time = pd.DataFrame(data = {'time': t + 1})
for pc in range(n_pcs):
    direction = 1 if pc % 2 == 0 else -1
    slope = direction * abs(random.choice(dist))
    intercept = random.choice(dist)
    time['PC ' + str(pc + 1)] = slope*t + intercept

loadings = {'cell-cell pairs': cc, 'ligand-receptor pairs': lr, 'time': time}

# persist all three tables together; written to the current working directory
with open('loadings.pickle', 'wb') as f:
    pickle.dump(loadings, f)

In [228]:
# Display the simulated cell-cell loadings table stored under the 'cell-cell pairs' key.
loadings['cell-cell pairs']

Unnamed: 0,Cell-Cell Pairs,CellType-CellType Pairs,PC 1,PC 2
0,0-1,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, 7bb45bb...",-2.955534,-4.605986
1,8-41,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, 7bb45bb...",-2.786750,-5.041356
2,3-49,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, 7bb45bb...",-2.975024,-3.762600
3,4-5,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, 7bb45bb...",-3.776496,-5.127177
4,4-6,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, 7bb45bb...",-2.722015,-3.288222
...,...,...,...,...
495,4-23,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, d7e7bd0...",6.736052,-9.195078
496,4-22,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, d7e7bd0...",6.753269,-8.043520
497,4-19,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, d7e7bd0...",5.010632,-14.630219
498,4-40,"(dd7e1bfb-83f5-4153-abe3-75ba742fb6c7, d7e7bd0...",5.190422,-8.505604
