# Rough Epistasis Measurements for Sampling

In [1]:
import numpy as np
import pandas as pd

# Load the full ddG dataset (path is relative to the user's home directory).
data = pd.read_csv("~/rosetta-antibody-ddgs/raw_datasets/full_data.csv")

# Split rows by whether their "Source" mentions Phillips. The mask is computed
# once and reused; na=False treats a missing Source as "not Phillips" so that
# neither the negation nor the boolean indexing fails on NaN entries.
is_phillips = data["Source"].str.contains("Phillips", na=False)
new_data = data[~is_phillips]
data = data[is_phillips]  # from here on, `data` holds only the Phillips rows

# Partition the Phillips rows by their "LD" value.
# NOTE(review): LD presumably counts the point mutations in a row (the LD == 2
# rows are later split into two mutations) - confirm against the dataset docs.
mut_1 = data[data["LD"] == 1]
mut_2 = data[data["LD"] == 2]
mut_3 = data[data["LD"] == 3]
mut_4 = data[data["LD"] == 4]

# Every LD == 1 Phillips row is kept; rows with higher LD are added later
# only if they are sampled.
new_data = pd.concat([new_data, mut_1])

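Predict $\Delta \Delta G$ for each double mutant (LD == 2) as the sum of the $\Delta \Delta G$ values of its two single mutations, then rank the mutants by the absolute difference between the measured and predicted values.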

In [2]:
import re

# Single (LD == 1) and double (LD == 2) mutants per structure. The results for
# "4FQY" are printed under the label CR9114 and those for "3GBN" under CR6261.
mut_1_9114 = mut_1[mut_1["#PDB"] == "4FQY"]
mut_2_9114 = mut_2[mut_2["#PDB"] == "4FQY"]
mut_1_6261 = mut_1[mut_1["#PDB"] == "3GBN"]
mut_2_6261 = mut_2[mut_2["#PDB"] == "3GBN"]

# Labels whose first occurrence among the 4FQY double mutants is renamed with a
# trailing '*' (in both the measured and the predicted table), so that the two
# tables can be aligned on "Mutations".
duplicates_list = ["H:T57A;H:F74S"]

# Measured values for 4FQY. The relabelling uses a single positional .iloc
# assignment on an explicit copy instead of chained `df.col.iat[...] = ...`,
# which does not write through to the frame under pandas Copy-on-Write.
ddGs_actual_9114 = mut_2_9114.loc[:, ["Mutations", "ddG(kcal/mol)"]].copy()
is_duplicate = ddGs_actual_9114["Mutations"].eq("H:T57A;H:F74S").to_numpy()
if is_duplicate.any():
    # argmax of a boolean array is the position of the first True; the guard
    # above keeps an absent label from relabelling row 0 by accident.
    ddGs_actual_9114.iloc[
        is_duplicate.argmax(), ddGs_actual_9114.columns.get_loc("Mutations")
    ] = "H:T57A;H:F74S*"
ddGs_actual_9114 = ddGs_actual_9114.set_index("Mutations")

# Measured values for 3GBN (no relabelling is applied here).
ddGs_actual_6261 = mut_2_6261.loc[:, ["Mutations", "ddG(kcal/mol)"]].set_index("Mutations")

# Additive prediction for 4FQY: sum of the two single-mutant ddG values.
# Per-mutant frames are collected in a list and concatenated once instead of
# growing a DataFrame with pd.concat on every iteration.
predicted_frames_9114 = []
for mutation in mut_2_9114["Mutations"]:
    mut1, mut2 = mutation.split(";")
    if mutation in duplicates_list:
        # Mirror the relabelling applied to the measured table above.
        duplicates_list.remove(mutation)
        mutation = mutation + '*'
    ddG = pd.Series(
        mut_1_9114.loc[mut_1_9114["Mutations"] == mut1, "ddG(kcal/mol)"].values
        + mut_1_9114.loc[mut_1_9114["Mutations"] == mut2, "ddG(kcal/mol)"].values
    )
    predicted_frames_9114.append(pd.DataFrame({
        "ddG(kcal/mol)": ddG,
        "Mutations": mutation
    }))
ddGs_predicted_9114 = pd.concat(predicted_frames_9114).set_index("Mutations")

# Additive prediction for 3GBN.
predicted_frames_6261 = []
for mutation in mut_2_6261["Mutations"]:
    mut1, mut2 = mutation.split(";")
    ddG = pd.Series(
        mut_1_6261.loc[mut_1_6261["Mutations"] == mut1, "ddG(kcal/mol)"].values
        + mut_1_6261.loc[mut_1_6261["Mutations"] == mut2, "ddG(kcal/mol)"].values
    )
    predicted_frames_6261.append(pd.DataFrame({
        "ddG(kcal/mol)": ddG,
        "Mutations": mutation
    }))
ddGs_predicted_6261 = pd.concat(predicted_frames_6261).set_index("Mutations")

# Absolute deviation between measured and additive ddG, sorted ascending so
# that .tail(n) returns the n largest deviations.
error_9114 = abs(ddGs_actual_9114 - ddGs_predicted_9114).sort_values(by=["ddG(kcal/mol)"])
error_6261 = abs(ddGs_actual_6261 - ddGs_predicted_6261).sort_values(by=["ddG(kcal/mol)"])

print("CR6261")
print(error_6261.describe(), '\n')
print(error_6261.tail(30), '\n')
error_2_6261 = error_6261.tail(30) # need to sample more to get enough at the interface
print("CR9114")
print(error_9114.describe(), '\n')
print(error_9114.tail(30))
error_2_9114 = error_9114.tail(30)


CR6261
       ddG(kcal/mol)
count      53.000000
mean        0.084232
std         0.087351
min         0.005425
25%         0.027084
50%         0.055526
75%         0.102046
max         0.397227 

                ddG(kcal/mol)
Mutations                    
H:K58N;H:F74S        0.050894
H:T57A;H:F74S        0.052886
H:K58N;H:G76S        0.053836
H:R30S;H:D73E        0.055526
H:P28T;H:V100L       0.058796
H:F74S;H:G76S        0.062391
H:T57A;H:A75T        0.066203
H:P28T;H:G76S        0.067015
H:K58N;H:V78A        0.067038
H:R30S;H:K58N        0.075624
H:K58N;H:V100L       0.078187
H:V78A;H:V100L       0.080122
H:P61Q;H:V78A        0.083940
H:P28T;H:P61Q        0.085929
H:D73E;H:V100L       0.091735
H:T57A;H:K58N        0.091868
H:F74S;H:V100L       0.102046
H:P28T;H:K58N        0.105818
H:P28T;H:T57A        0.108511
H:F74S;H:V78A        0.108803
H:R30S;H:F74S        0.119661
H:P28T;H:F74S        0.155669
H:P28T;H:A75T        0.158265
H:P61Q;H:A75T        0.165800
H:D73E;H:A75T        0

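Predict $\Delta \Delta G$ for each triple mutant (LD == 3) as the sum of the $\Delta \Delta G$ values of its three single mutations, then rank the mutants by the absolute difference between the measured and predicted values.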

In [3]:
# Triple mutants (LD == 3) per structure; "4FQY" is reported as CR9114 and
# "3GBN" as CR6261 in the printed output below.
mut_3_9114 = mut_3[mut_3["#PDB"] == "4FQY"]
mut_3_6261 = mut_3[mut_3["#PDB"] == "3GBN"]

# Measured ddG values, indexed by the mutation label.
ddGs_actual_9114 = mut_3_9114.loc[:, ["Mutations", "ddG(kcal/mol)"]].set_index("Mutations")
ddGs_actual_6261 = mut_3_6261.loc[:, ["Mutations", "ddG(kcal/mol)"]].set_index("Mutations")

# Additive prediction: for every multi-mutant label, sum the ddG of each of its
# ';'-separated point mutations as measured in the single-mutant table of the
# same structure. Per-mutant frames are collected and concatenated once rather
# than growing a DataFrame with pd.concat on every iteration.
predicted = {}
for antibody, singles, multi_mutants in (("9114", mut_1_9114, mut_3_9114),
                                         ("6261", mut_1_6261, mut_3_6261)):
    frames = []
    for mutation in multi_mutants["Mutations"]:
        point_ddGs = [
            singles.loc[singles["Mutations"] == point, "ddG(kcal/mol)"].values
            for point in mutation.split(";")
        ]
        frames.append(pd.DataFrame({
            # sum() adds the arrays left to right, starting from 0
            "ddG(kcal/mol)": pd.Series(sum(point_ddGs)),
            "Mutations": mutation
        }))
    predicted[antibody] = pd.concat(frames).set_index("Mutations")

ddGs_predicted_9114 = predicted["9114"]
ddGs_predicted_6261 = predicted["6261"]

# Absolute deviation between measured and additive ddG, sorted ascending so
# that .tail(n) returns the n largest deviations.
error_9114 = abs(ddGs_actual_9114 - ddGs_predicted_9114).sort_values(by=["ddG(kcal/mol)"])
error_6261 = abs(ddGs_actual_6261 - ddGs_predicted_6261).sort_values(by=["ddG(kcal/mol)"])

print("CR6261")
print(error_6261.describe(), '\n')
print(error_6261.tail(20), '\n')
error_3_6261 = error_6261.tail(20)
print("CR9114")
print(error_9114.describe(), '\n')
print(error_9114.tail(20))
error_3_9114 = error_9114.tail(20)

CR6261
       ddG(kcal/mol)
count     162.000000
mean        0.178702
std         0.336480
min         0.001084
25%         0.043250
50%         0.081823
75%         0.237253
max         3.631996 

                       ddG(kcal/mol)
Mutations                           
H:R30S;H:T57A;H:A75T        0.310191
H:P28T;H:R30S;H:K58N        0.319117
H:P28T;H:F74S;H:A75T        0.327431
H:P28T;H:R30S;H:V100L       0.329712
H:K58N;H:A75T;H:G76S        0.331871
H:P28T;H:D73E;H:G76S        0.348432
H:R30S;H:A75T;H:V78A        0.360495
H:P28T;H:F74S;H:V78A        0.386479
H:T57A;H:P61Q;H:A75T        0.407358
H:P28T;H:D73E;H:V78A        0.408030
H:R30S;H:A75T;H:V100L       0.408236
H:K58N;H:A75T;H:V78A        0.427543
H:P28T;H:T57A;H:A75T        0.431177
H:R30S;H:K58N;H:A75T        0.635854
H:R30S;H:F74S;H:G76S        0.786300
H:P28T;H:F74S;H:G76S        0.852066
H:P28T;H:R30S;H:D73E        1.111621
H:P28T;H:D73E;H:F74S        1.173660
H:P28T;H:R30S;H:G76S        1.174607
H:P28T;H:R30S;H:F74S     

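Predict $\Delta \Delta G$ for each quadruple mutant (LD == 4) as the sum of the $\Delta \Delta G$ values of its four single mutations, then rank the mutants by the absolute difference between the measured and predicted values.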

In [4]:
# Quadruple mutants (LD == 4) per structure; "4FQY" is reported as CR9114 and
# "3GBN" as CR6261 in the printed output below.
mut_4_9114 = mut_4[mut_4["#PDB"] == "4FQY"]
mut_4_6261 = mut_4[mut_4["#PDB"] == "3GBN"]

# Measured ddG values, indexed by the mutation label.
ddGs_actual_9114 = mut_4_9114.loc[:, ["Mutations", "ddG(kcal/mol)"]].set_index("Mutations")
ddGs_actual_6261 = mut_4_6261.loc[:, ["Mutations", "ddG(kcal/mol)"]].set_index("Mutations")

# Additive prediction: for every multi-mutant label, sum the ddG of each of its
# ';'-separated point mutations as measured in the single-mutant table of the
# same structure. Per-mutant frames are collected and concatenated once rather
# than growing a DataFrame with pd.concat on every iteration.
predicted = {}
for antibody, singles, multi_mutants in (("9114", mut_1_9114, mut_4_9114),
                                         ("6261", mut_1_6261, mut_4_6261)):
    frames = []
    for mutation in multi_mutants["Mutations"]:
        point_ddGs = [
            singles.loc[singles["Mutations"] == point, "ddG(kcal/mol)"].values
            for point in mutation.split(";")
        ]
        frames.append(pd.DataFrame({
            # sum() adds the arrays left to right, starting from 0
            "ddG(kcal/mol)": pd.Series(sum(point_ddGs)),
            "Mutations": mutation
        }))
    predicted[antibody] = pd.concat(frames).set_index("Mutations")

ddGs_predicted_9114 = predicted["9114"]
ddGs_predicted_6261 = predicted["6261"]

# Absolute deviation between measured and additive ddG, sorted ascending so
# that .tail(n) returns the n largest deviations.
error_9114 = abs(ddGs_actual_9114 - ddGs_predicted_9114).sort_values(by=["ddG(kcal/mol)"])
error_6261 = abs(ddGs_actual_6261 - ddGs_predicted_6261).sort_values(by=["ddG(kcal/mol)"])

print("CR6261")
print(error_6261.describe(), '\n')
print(error_6261.tail(10), '\n')
error_4_6261 = error_6261.tail(10)
print("CR9114")
print(error_9114.describe(), '\n')
print(error_9114.tail(10))
error_4_9114 = error_9114.tail(10)


CR6261
       ddG(kcal/mol)
count     328.000000
mean        0.334951
std         0.537074
min         0.000312
25%         0.059406
50%         0.186562
75%         0.347565
max         3.770513 

                              ddG(kcal/mol)
Mutations                                  
H:R30S;H:F74S;H:G76S;H:V78A        1.476329
H:P28T;H:D73E;H:F74S;H:G76S        1.541208
H:P28T;H:R30S;H:A75T;H:G76S        1.711430
H:P28T;H:D73E;H:F74S;H:V78A        1.798144
H:P28T;H:R30S;H:F74S;H:V100L       2.725497
H:P28T;H:R30S;H:T57A;H:F74S        3.445486
H:P28T;H:R30S;H:F74S;H:V78A        3.466238
H:P28T;H:R30S;H:P61Q;H:F74S        3.677788
H:P28T;H:R30S;H:F74S;H:G76S        3.722345
H:P28T;H:R30S;H:K58N;H:F74S        3.770513 

CR9114
       ddG(kcal/mol)
count    1820.000000
mean        1.433740
std         1.525403
min         0.000242
25%         0.355037
50%         0.905128
75%         1.952489
max         9.387788 

                               ddG(kcal/mol)
Mutations                    

## Outputting sampled dataset

In [5]:
# The error tables were ranked separately per structure, so each sample is
# looked up only among the rows of the structure it was computed for.
# Matching on "Mutations" alone also pulls in rows of the other structure that
# happen to share a label, and adds a row twice when both samples contain it.
samples_by_pdb = {
    "3GBN": [error_2_6261, error_3_6261, error_4_6261],
    "4FQY": [error_2_9114, error_3_9114, error_4_9114],
}

selected = pd.Series(False, index=data.index)
for pdb_id, samples in samples_by_pdb.items():
    labels = set()
    for sample in samples:
        # reset_index() returns a new frame, so the error tables themselves
        # are left untouched and this cell can be re-run.
        sample_labels = sample.reset_index()["Mutations"]
        # A trailing '*' is only the marker used to make a repeated label
        # unique when aligning measured and predicted values; `data` holds the
        # unmarked label, so strip the marker before the lookup.
        # NOTE(review): assumes no genuine mutation label ends in '*' - confirm.
        labels.update(sample_labels.str.replace(r"\*$", "", regex=True))
    selected |= (data["#PDB"] == pdb_id) & data["Mutations"].isin(labels)

# Each matching row is taken exactly once, in the order it appears in `data`.
to_add = data[selected]

# Drop rows that a previous run of this cell already appended, so re-running
# does not duplicate them. This relies on the row labels inherited from the
# original CSV being unique; on a first run nothing is dropped.
base_data = new_data[~new_data.index.isin(to_add.index)]
print(len(base_data))

new_data = pd.concat([to_add, base_data])

print(len(new_data))

try:
    new_data.to_csv('use_this_data.csv', index=False)
    print("Wrote file.")
except OSError as exc:
    # Only file-system failures are reported here, together with their cause;
    # any other error is a bug and is allowed to propagate.
    print("Did not output.", exc)

764
885
Wrote file.
