In [1]:
using CSV, DataFrames, Serialization

### Extract Training Data from CSV
This script extracts relevant inputs from the experimental data reported in Yi et al. and serializes them for later use. For example, it extracts the timepoints to save at for the timecourse simulation. As another example, it records the ligand doses for the dose-response experiments; the doses are stored in nM (the dictionary key is `ligand_stimulation (nM)`), and no conversion to molecules is performed in this notebook.

Outputs:

outputs/000_processed_active_g_timecourse.dict: <br>

Dict{String, Any} with 4 entries: <br>
  "response"                       => [0.0, 0.35, 0.4, 0.36, 0.39, 0.33, 0.24, … <br>
  "save_at"                        => [0, 10, 30, 60, 120, 210, 300, 450, 600] <br>
  "ligand_stimulation (nM)" => 1000 <br>
  "average_error"                 => 0.015 <br>

outputs/000_processed_active_g_dose_response.dict: <br>

Dict{String, Any} with 5 entries: <br>
  "response"                       => [0.0253298, 0.145646, 0.265963, 0.497098,… <br>
  "save_at"                        => [60] <br>
  "normalize_to_response_at_dose"  => 1000.0 <br>
  "ligand_stimulation (nM)"        => [0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0] <br>
  "average_error"                  => 0.03 <br>

In [2]:
# Read the two active-G CSV files into DataFrames.
# The trailing semicolon keeps the notebook from echoing the last table.
timecourse = CSV.File("data/active_g_timecourse.csv") |> DataFrame
dose_response = CSV.File("data/active_g_dose_response.csv") |> DataFrame;

In [3]:
# Timecourse inputs: bundle the save times, the measured response, the ligand
# dose (nM) and the assumed measurement error into one dictionary, then
# serialize it to disk.
ligand_stimulation_nM = 1000
average_error = 0.015
timecourse_dict = Dict(
    "save_at" => timecourse.t,
    # NOTE: the column name starts with a space; it must match the CSV header exactly.
    "response" => timecourse[!, " percent Ga"],
    "ligand_stimulation (nM)" => ligand_stimulation_nM,
    "average_error" => average_error,
)
open("outputs/000_processed_active_g_timecourse.dict", "w") do io
    serialize(io, timecourse_dict)
end

In [4]:
# Dose-response inputs for the active-G measurements.
save_at = [60]  # seconds; kept as a vector so it can be passed to ODEProblem later
normalize_to = 1000.0  # dose used for normalization (1 uM, expressed in nM)
average_error = 0.03  # the same error is assumed for every dose-response measurement
# Ligand doses in nM; the 1 uM stimulation is not part of this list.
dose_response_nM = [0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]

# Collect everything into one dictionary and write it to disk.
dose_response_dict = Dict(
    "ligand_stimulation (nM)" => dose_response_nM,
    "response" => dose_response[!, "relative Ga"],
    "save_at" => save_at,
    "normalize_to_response_at_dose" => normalize_to,
    "average_error" => average_error,
)
open("outputs/000_processed_active_g_dose_response.dict", "w") do io
    serialize(io, dose_response_dict)
end

### Extract Relevant Outputs for Test Set

outputs/000_processed_rl_dose_response.dict: <br>

Dict{String, Any} with 5 entries: <br>
  "response"                       => [0.0253298, 0.145646, 0.265963, 0.497098,… <br>
  "save_at"                        => [60] <br>
  "normalize_to_response_at_dose"  => 1000.0 <br>
  "ligand_stimulation (nM)"        => [0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0] <br>
  "average_error"                  => 0.03 <br>

In [5]:
# RL dose-response inputs for the test set.
# Doses are stored in nM exactly as listed below; this cell performs no unit
# conversion and writes its own output file.
# NOTE: `dose_response` is rebound here and replaces the table loaded earlier.
dose_response = CSV.File("data/rl_dose_response.csv") |> DataFrame;
save_at = [60]  # seconds; kept as a vector so it can be passed to ODEProblem later
normalize_to = 1000.0  # dose used for normalization (1 uM, expressed in nM)
average_error = 0.03  # the same error is assumed for every dose-response measurement
# Ligand doses in nM; the 1 uM stimulation is not part of this list.
dose_response_nM = [0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]

# Collect everything into one dictionary and write it to disk.
dose_response_dict = Dict(
    "ligand_stimulation (nM)" => dose_response_nM,
    # NOTE: the column name starts with a space; it must match the CSV header exactly.
    "response" => dose_response[!, " relative RL"],
    "save_at" => save_at,
    "normalize_to_response_at_dose" => normalize_to,
    "average_error" => average_error,
)
open("outputs/000_processed_rl_dose_response.dict", "w") do io
    serialize(io, dose_response_dict)
end

### Extract Kd Prediction

In [7]:
# Build per-parameter regularization entries from the Kd table and serialize
# them: one dictionary based on the predicted Kd values, one on the reported ones.
data = CSV.File("data/predicted_kd.csv") |> DataFrame
parameters = String.(data[!, "parameter"])

# The Kd columns are labelled in M; scale by 10^9 and store log10 of the result.
convert_to_nM = 10^9
mean_predicted = log10.(data[!, "predicted_Kd(M)"] .* convert_to_nM)
mean_reported = log10.(data[!, "reported_Kd(M)"] .* convert_to_nM)

lambda = data[!, "AF_ranking_score"]
protein_peptide_std_dev = 0.8  # from PPI Affinity MAE of protein-peptide Kd prediction

# Each parameter name maps to its own Dict with "mean", "std_dev" and "lambda".
# The three sequences come from the same table, so they have equal length.
dictionary_definition = Dict(
    name => Dict("mean" => mu, "std_dev" => protein_peptide_std_dev, "lambda" => weight)
    for (name, mu, weight) in zip(parameters, mean_predicted, lambda)
)
dictionary_definition_reported = Dict(
    name => Dict("mean" => mu, "std_dev" => protein_peptide_std_dev, "lambda" => weight)
    for (name, mu, weight) in zip(parameters, mean_reported, lambda)
)

for (path, payload) in (
    "outputs/000_regularization_parameters_predicted.dict" => dictionary_definition,
    "outputs/000_regularization_parameters_reported.dict" => dictionary_definition_reported,
)
    serialize(path, payload)
end