In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import re
import statsmodels.api as sm
from patsy import dmatrices

data = pd.read_csv("~/rosetta-antibody-ddgs/raw_datasets/full_data.csv")

# Subsetting for phillips data
new_data = data[~data["Source"].str.contains("Phillips")]
data = data[data["Source"].str.contains("Phillips")]

mut_1 = data.loc[data["LD"] == 1]
mut_2 = data.loc[data["LD"] == 2]
mut_3 = data.loc[data["LD"] == 3]
mut_4 = data.loc[data["LD"] == 4]
mut_5 = data.loc[data["LD"] == 5]
mut_2 = mut_2.drop_duplicates(subset="ddG(kcal/mol)", keep=False).copy()
mut_3 = mut_3.drop_duplicates(subset="ddG(kcal/mol)", keep=False).copy()
mut_4 = mut_4.drop_duplicates(subset="ddG(kcal/mol)", keep=False).copy()
mut_5 = mut_5.drop_duplicates(subset="ddG(kcal/mol)", keep=False).copy()

dump_data = pd.concat([mut_1, mut_2, mut_3, mut_4, mut_5]
                      ).reset_index(drop=True)
print(dump_data.head())


   #PDB Mutations  ddG(kcal/mol)                Source  LD
0  3GBN    H:D73E      -0.123782  Phillips et al. 2021   1
1  3GBN    H:T57A       0.007413  Phillips et al. 2021   1
2  3GBN    H:V78A       0.011997  Phillips et al. 2021   1
3  3GBN    H:P61Q       0.012366  Phillips et al. 2021   1
4  3GBN    H:G76S       0.058492  Phillips et al. 2021   1


In [28]:
mlb9114 = MultiLabelBinarizer(classes=['S29F', 'N30S', 'N31S', 'S52I', 'S56T', 'T57A', 'A58N',
                              'S70T', 'I73K', 'F74S', 'S75T', 'N76S', 'N82AS', 'T83R', 'F91Y', 'S100BY'], sparse_output=True)
mlb6261 = MultiLabelBinarizer(classes=['P28T', 'R30S', 'T57A', 'K58N', 'P61Q',
                                       'D73E', 'F74S', 'A75T', 'G76S', 'V78A', 'V100L'], sparse_output=True)

dump_data9114 = dump_data.loc[dump_data["#PDB"] == "4FQY"].copy()
dump_data6261 = dump_data.loc[dump_data["#PDB"] == "3GBN"].copy()

dump_data9114.loc[:, ("Mutations")] = dump_data9114.loc[:, ("Mutations")].apply(
    lambda x: re.split(";", re.sub(r"\w:(\w+)", r"\1", x))).copy()
dump_data6261.loc[:, ("Mutations")] = dump_data6261.loc[:, ("Mutations")].apply(
    lambda x: re.split(";", re.sub(r"\w:(\w+)", r"\1", x))).copy()

dump_data9114 = dump_data9114.join(pd.DataFrame.sparse.from_spmatrix(
    mlb9114.fit_transform(dump_data9114.pop('Mutations')),
    index=dump_data9114.index,
    columns=mlb9114.classes_))
dump_data9114.drop("Source", axis=1, inplace=True)
dump_data6261 = dump_data6261.join(pd.DataFrame.sparse.from_spmatrix(
    mlb6261.fit_transform(dump_data6261.pop('Mutations')),
    index=dump_data6261.index,
    columns=mlb6261.classes_))
dump_data6261.drop("Source", axis=1, inplace=True)
print(dump_data9114.head())
print(dump_data6261.head())


    #PDB  ddG(kcal/mol)  LD  S29F  N30S  N31S  S52I  S56T  T57A  A58N  S70T  \
11  4FQY      -0.587729   1     0     0     0     0     0     0     1     0   
12  4FQY      -0.159545   1     0     0     0     0     0     0     0     0   
13  4FQY       0.805839   1     0     0     0     0     0     0     0     0   
14  4FQY       1.163809   1     0     0     0     0     0     1     0     0   
15  4FQY       0.451638   1     1     0     0     0     0     0     0     0   

    I73K  F74S  S75T  N76S  N82AS  T83R  F91Y  S100BY  
11     0     0     0     0      0     0     0       0  
12     0     0     0     0      0     0     1       0  
13     0     0     0     1      0     0     0       0  
14     0     0     0     0      0     0     0       0  
15     0     0     0     0      0     0     0       0  
   #PDB  ddG(kcal/mol)  LD  P28T  R30S  T57A  K58N  P61Q  D73E  F74S  A75T  \
0  3GBN      -0.123782   1     0     0     0     0     0     1     0     0   
1  3GBN       0.007413   1     0 

# CR6261

In [30]:
affinities = dump_data6261.loc[:, "ddG(kcal/mol)"]
print(affinities)
affinity_array = affinities[["ddG(kcal/mol)"]].values.flatten()
print(affinity_array.shape)
mutations = ['P28T', 'R30S', 'T57A', 'K58N', 'P61Q',
                     'D73E', 'F74S', 'A75T', 'G76S', 'V78A', 'V100L']
genotypes = np.array(
    dump_data6261[[x for x in mutations]].copy(), dtype=np.float64)
print(genotypes.shape)


0      -0.123782
1       0.007413
2       0.011997
3       0.012366
4       0.058492
          ...   
2209    4.274974
2210    4.306079
2211    4.326925
2212    4.327260
2213    4.355152
Name: ddG(kcal/mol), Length: 1011, dtype: float64


KeyError: "None of [Index(['ddG(kcal/mol)'], dtype='object')] are in the [index]"

In [24]:
dump_data6261.rename({"ddG(kcal/mol)": "ddG"}, axis=1, inplace=True)
y, X = dmatrices(
    "ddG ~ P28T + R30S + T57A + K58N + P61Q + D73E + F74S + A75T + G76S + V78A + V100L", data=dump_data6261, return_type="dataframe")
print(y)
print(X)

           ddG
0    -0.123782
1     0.007413
2     0.011997
3     0.012366
4     0.058492
...        ...
2209  4.274974
2210  4.306079
2211  4.326925
2212  4.327260
2213  4.355152

[1011 rows x 1 columns]
      Intercept  P28T  R30S  T57A  K58N  P61Q  D73E  F74S  A75T  G76S  V78A  \
0           1.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   
1           1.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
2           1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   
3           1.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   
4           1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0   
...         ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
2209        1.0   1.0   1.0   0.0   1.0   0.0   0.0   1.0   0.0   1.0   0.0   
2210        1.0   1.0   1.0   0.0   0.0   0.0   1.0   1.0   0.0   1.0   0.0   
2211        1.0   1.0   1.0   0.0   1.0   1.0   0.0   1.0   0.0   0.0   0.0   
2212 

# NOTE:
This is not right, just was wanting to test this out. I ended up using the method of Phillips et al. Mainly because it was pretty readily replicable with the data in reverse.

In [25]:
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                    ddG   R-squared:                       0.566
Model:                            OLS   Adj. R-squared:                  0.561
Method:                 Least Squares   F-statistic:                     118.4
Date:                Thu, 07 Jul 2022   Prob (F-statistic):          1.85e-172
Time:                        17:18:26   Log-Likelihood:                -763.72
No. Observations:                1011   AIC:                             1551.
Df Residuals:                     999   BIC:                             1610.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4363      0.073     -5.949      0.0