In [1]:
import pandas as pd

In [2]:
# Read the gzipped experimental-data CSV into a DataFrame.
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(
    'norris_et_al_2017_cisplatin_exp_data.csv.gz',
    low_memory=False,
)

In [3]:
# Keep only the rows whose species_type is 'protein' (assuming the model
# only has proteins); .copy() gives an independent frame rather than a view.
genes = df.loc[df['species_type'].eq('protein')].copy()


In [4]:
# Genes to keep for the model, written with the HGNC naming standard.
# You'll have to pass the right names for your own model here.
genes_to_include = ['TP53']

# Restrict the protein table to the rows belonging to those genes.
select_genes = genes.loc[genes['gene'].isin(genes_to_include)].copy()


In [5]:
# Pivot into wide format: one row per time point, one column per protein
# measurement, holding the treated/control fold change.
pivoted = pd.pivot_table(select_genes,
                         index='time',
                         columns='protein',
                         values='treated_control_fold_change')

# Remove the 'protein' label that pivot_table leaves on the column axis.
# Assigning None is used instead of `del pivoted.columns.name`, because
# Index.name is a property without a deleter in current pandas and the `del`
# form raises AttributeError. The index is already 'time' after the pivot,
# so no reset_index()/set_index('time') round trip is needed.
pivoted.columns.name = None
print(pivoted)

# Note that there are multiple measurement types:
# phosphorylation (phsilac), RNA-seq (RNA level), and SILAC / label-free
# (protein level). Decide which to use based on the model.

      TP53_S(ph)313_phsilac  TP53_S(ph)392_phsilac  TP53_rnaseq  TP53_silac
time                                                                       
01hr                    NaN                    NaN    -1.099477         NaN
06hr               4.337450                    NaN     1.289942         NaN
24hr              11.911994                7.37830     1.582521      8.7825
48hr               6.074965                4.31349          NaN         NaN


In [6]:
# Map the measured column names onto the observable names used by the PySB
# model, then replace missing measurements with 0.
#
# NOTE(review): what to fill missing values with is still an open question.
# With 0, a missing time point counts as a measured fold change of 0 in
# cost_function; left as NaN it would be skipped by pandas' .sum() instead.
observable_names = {'TP53_S(ph)313_phsilac': 'obs1'}
exp_data = pivoted.rename(columns=observable_names).fillna(0)

In [7]:
def cost_function(trajectory):
    """Score a simulated trajectory against the experimental data.

    Computes the sum of squared differences between the 'obs1' column of the
    module-level ``exp_data`` and the 'obs1' column of ``trajectory``. Any
    other distance metric could be substituted.

    Parameters
    ----------
    trajectory : object indexable by 'obs1' (e.g. a DataFrame)
        Simulated values, subtracted element-wise from ``exp_data['obs1']``.

    Returns
    -------
    tuple
        A 1-tuple ``(error,)`` containing the summed squared error.
    """
    residual = exp_data['obs1'].sub(trajectory['obs1'])
    error1 = residual.pow(2).sum()

    # With more than one observable, repeat the pattern and add the terms:
    #   error2 = ((exp_data['obs2'] - trajectory['obs2'])**2).sum()
    #   return (error1 + error2,)
    return (error1,)

In [8]:
# Demo data only: a made-up trajectory so the cost function can be exercised.
# It will be replaced with the simulation result from PySB.

x = [1, 2, 3, 4]
time = ['01hr', '06hr', '24hr', '48hr']

# Pair each time label with its value, one (time, value) tuple per row.
combines = list(zip(time, x))

# Build the frame and index it by time.
traj = pd.DataFrame(combines, columns=['time', 'obs1']).set_index('time')

print(traj)

      obs1
time      
01hr     1
06hr     2
24hr     3
48hr     4


In [9]:
# Evaluate the demo trajectory; the result is printed as a 1-tuple.
print(cost_function(trajectory=traj))

(90.1927929272021,)


In [10]:
# Write the renamed, filled experimental data (including its index) to CSV.
exp_data.to_csv(path_or_buf='experimental_data.csv')