In [1]:
using CSV, DataFrames, Serialization
include("define_test_train.jl");

### Extract Experimental Data from CSV
This script extracts the relevant inputs from the experimental data reported in Kholodenko et al. and serializes them to files. For example, it extracts the time points at which the time-course simulation should be saved, and it records the ligand dose (in nM) used in each experiment.

Outputs 8 dictionaries, one per experimental measurement, with the following entries:

000_processed_grb_egfr_20.dict: <br>

Dict{String, Any} with 4 entries: <br>
  "save_at"                => [0, 15, 30, 45, 60, 120] <br>
  "response"               => [0.0, 18.06, 15.79, 8.66, 6.44, 4.7] <br>
  "ligand_simulation (nM)" => 20.0 <br>
  "average_error"          => 2.5 <br>

In [2]:
# Build one dictionary per experimental CSV in data/ (time points, measured response,
# ligand dose, average error) and serialize each one to outputs/.

# Entries of data/ that are not experimental time courses and must be skipped.
excluded_files = (".DS_Store", "kholodenko1.xml", "predicted_kd.csv")
data_files = sort(filter(f -> f ∉ excluded_files, readdir("data"))) #sort to ensure consistent order
data = [DataFrame(CSV.File("data/$(file)")) for file in data_files]
ligand_stimulation = [20,20,0.2,2,20,2,20,20] #consistent with sorted order of files
my_keys = ["save_at", "response","ligand_simulation (nM)", "average_error"]
average_error = [2.5, 5.0, 1.0, 1.0, 5.0, 1.0, 1.0, 1.0] ##took average error per species, rounded to nearest 0.5

# Doses and errors are matched to files purely by position, so a missing or extra file in
# data/ would pair measurements with the wrong metadata. Fail loudly instead.
length(data_files) == length(ligand_stimulation) == length(average_error) ||
    error("expected $(length(ligand_stimulation)) experimental files in data/, " *
          "found $(length(data_files)): $(data_files)")

# Value order must match `my_keys`. The CSV header of the response column is " y"
# (with a leading space), so that exact name is used here.
my_values = [
    [df[!,"x"], df[!," y"], dose, err]
    for (df, dose, err) in zip(data, ligand_stimulation, average_error)
]
processed_dictionary = [Dict(my_keys .=> vals) for vals in my_values]

# "<name>.csv" becomes "000_processed_<name>.dict"
output_names = ["000_processed_" * replace(file, ".csv" => "") * ".dict" for file in data_files]
for (name, dict) in zip(output_names, processed_dictionary)
    serialize("outputs/$(name)", dict)
end

In [4]:
data_files

8-element Vector{String}:
 "grb_egfr_20.csv"
 "grb_shc_20.csv"
 "p_egfr_02.csv"
 "p_egfr_2.csv"
 "p_egfr_20.csv"
 "p_plcg_2.csv"
 "p_plcg_20.csv"
 "p_shc_20.csv"

### Store Training Data and Standard Deviations as Dictionaries of 1D Arrays

In [2]:
"""
    collect_training_data(file_names)

Read each processed measurement `outputs/<name>` for `name` in `file_names` and
concatenate them, in order, into a single dictionary with two `Vector{Float64}` entries of
equal length:

- `"response"`: the `"response"` vectors of all files, appended one after another.
- `"average_error"`: each file's scalar `"average_error"`, repeated once per response
  point of that file.
"""
function collect_training_data(file_names)
    data_points = Float64[]
    std_dev = Float64[]
    for name in file_names
        # Deserialize each file once and reuse it for both vectors.
        measurement = deserialize("outputs/$(name)")
        response = measurement["response"]
        append!(data_points, response)
        append!(std_dev, fill(measurement["average_error"], length(response)))
    end
    return Dict("response" => data_points, "average_error" => std_dev)
end

#pSHC as test data
serialize("outputs/000_training_data_w_pshc_test.dict",
          collect_training_data(return_training_data_names_w_pshc_test()))

#pEGFR as test data
serialize("outputs/000_training_data_w_pegfr_test.dict",
          collect_training_data(return_training_data_names_w_egfr_test()))

#pSHC and SHC:GRB2 as test data
serialize("outputs/000_training_data_w_pshc_grb2shc_test.dict",
          collect_training_data(return_training_data_names_w_pshc_grb2shc_test()))

### Extract and Store Kd Predictions & AlphaFold Ranking Score as Dictionary

In [2]:
# Build per-parameter Kd dictionaries (predicted and reported), with means in log10(nM)
# and a 0/1 weight obtained by thresholding the AlphaFold ranking score at 0.5.
data = CSV.File("data/predicted_kd.csv") |> DataFrame
parameters = String.(data[!, "parameter"])
convert_to_nM = 10^9 # the Kd columns are labelled in M
mean_predicted = log10.(convert_to_nM .* data[!, "predicted_Kd(M)"])
mean_reported = log10.(convert_to_nM .* data[!, "reported_Kd(M)"])

# Binarize in place: scores below 0.5 become 0, scores of at least 0.5 become 1.
# The second mask is evaluated after the first assignment, exactly as before.
lambda = data[!, "AF_ranking_score"]
lambda[lambda .< 0.5] .= 0
lambda[lambda .>= 0.5] .= 1

# One entry per parameter; the standard deviation is fixed at 1.3 for every parameter.
make_entry = (mu, weight) -> Dict("mean" => mu, "std_dev" => 1.3, "lambda" => weight)
dictionary_definition = Dict(
    name => make_entry(mu, weight)
    for (name, mu, weight) in zip(parameters, mean_predicted, lambda)
)
dictionary_definition_reported = Dict(
    name => make_entry(mu, weight)
    for (name, mu, weight) in zip(parameters, mean_reported, lambda)
)

serialize("outputs/000_augmentation_parameters_predicted_kd_rank_score.dict", dictionary_definition)
serialize("outputs/000_augmentation_parameters_reported_kd_rank_score.dict", dictionary_definition_reported);

### Extract and Store Kd Predictions & AlphaFold ipTM as Dictionary

In [22]:
# Build per-parameter Kd dictionaries (predicted and reported), with means in log10(nM)
# and the raw AlphaFold ipTM value stored as the weight "lambda".
data = CSV.File("data/predicted_kd.csv") |> DataFrame
parameters = String.(data[!, "parameter"])
convert_to_nM = 10^9 # the Kd columns are labelled in M
mean_predicted = log10.(convert_to_nM .* data[!, "predicted_Kd(M)"])
mean_reported = log10.(convert_to_nM .* data[!, "reported_Kd(M)"])
lambda = data[!, "AF_ipTM"]

# One entry per parameter; the standard deviation is fixed at 1.3 for every parameter.
dictionary_definition = Dict(
    name => Dict("mean" => mean_predicted[k], "std_dev" => 1.3, "lambda" => lambda[k])
    for (k, name) in enumerate(parameters)
)
dictionary_definition_reported = Dict(
    name => Dict("mean" => mean_reported[k], "std_dev" => 1.3, "lambda" => lambda[k])
    for (k, name) in enumerate(parameters)
)

serialize("outputs/000_augmentation_parameters_predicted_kd_iptm.dict", dictionary_definition)
serialize("outputs/000_augmentation_parameters_reported_kd_iptm.dict", dictionary_definition_reported);

### Extract and Store Kd Predictions with No AlphaFold Confidence

In [24]:
# Build the per-parameter predicted-Kd dictionary with no AlphaFold confidence weighting:
# means are in log10(nM) and every parameter gets the same weight, lambda = 1.
data = DataFrame(CSV.File("data/predicted_kd.csv"))
parameters = String.(data[!,"parameter"])
convert_to_nM = 10^9 # the Kd column is labelled in M
mean_predicted = log10.(data[!,"predicted_Kd(M)"].*convert_to_nM)
# Only the predicted Kd is serialized in this cell, so the reported Kd is not computed.
# The weight is stored as a float (1.0, not the Int 1) so that "lambda" sits alongside the
# float "mean" and "std_dev" without widening the inner dictionary's value type.
lambda = fill(1.0, length(parameters))
dictionary_definition = Dict(
    name => Dict("mean" => mean_predicted[i], "std_dev" => 1.3, "lambda" => lambda[i])
    for (i, name) in enumerate(parameters)
)
serialize("outputs/000_augmentation_parameters_predicted_kd_no_AF3_confidence.dict", dictionary_definition)