In [2]:
import os 
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold
from sklearn.metrics import roc_auc_score,matthews_corrcoef,ndcg_score,average_precision_score
from training.utils import DMS_file_for_LLM


In [3]:
# Presumably the number of cross-validation folds; not referenced by the code visible in this notebook.
folds = 5
# Seed embedded in the training-output directory names (`..._seed{seed}`) read by the get_finetune_* helpers.
seed = 42
# Fraction of variants at each end of the label ranking that is treated as an "extreme" set.
top_test_frac = 0.1

# Assay index table; its 'DMS_id' column drives every per-assay loop in this notebook.
bindingGYM = pd.read_csv('./input/BindingGYM.csv')

In [4]:


def calc_two_extreme_metric(train,bottom_test,top_test,pred_col):
    """Hit-rate metrics describing how predictions separate the two label extremes.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain `pred_col` and a `top_frac` column. Only rows with
        ``top_test_frac <= top_frac < 1 - top_test_frac`` are used (the
        "middle" set); `top_test_frac` is the module-level constant.
    bottom_test, top_test : pandas.DataFrame
        Frames containing `pred_col`; treated as the low and high extreme sets.
    pred_col : str
        Name of the prediction column.

    Returns
    -------
    dict
        ``TopHit@k``, ``BottomHit@k`` and ``UnbiasHit@k`` (= TopHit - BottomHit)
        for k in 10, 20, 50, 100. Keys are inserted as TopHit/BottomHit pairs
        per k, followed by all UnbiasHit keys.
    """
    middle = train.loc[(train['top_frac']>=top_test_frac)&(train['top_frac']<(1-top_test_frac))].reset_index(drop=True)
    bottom_preds = bottom_test[pred_col].values
    middle_preds = middle[pred_col].values
    top_preds = top_test[pred_col].values

    # Concatenation layout is [bottom | middle | top], so membership can be read
    # off the position: index < m is a bottom item, index >= total - n is a top item.
    all_preds = np.concatenate([bottom_preds,middle_preds,top_preds])
    n = len(top_preds)
    m = len(bottom_preds)
    total = len(all_preds)

    # Ascending order of predictions with the n lowest-scoring entries dropped.
    # NOTE(review): because of this slice, BottomHit below inspects the lowest
    # scores *after* the n lowest were discarded -- confirm this is intended.
    ranked = np.argsort(all_preds)[-(total-n):]

    ks = [10,20,50,100]
    ms = {}
    for k in ks:
        # Share of the min(k, n) highest-scoring entries that belong to the top set.
        ms[f'TopHit@{k}'] = (ranked[-min(k,n):]>=(total-n)).mean()
        # Share of the first min(k, n, m) retained entries that belong to the bottom set.
        ms[f'BottomHit@{k}'] = (ranked[:min(k,n)][:min(k,m)]<m).mean()
    for k in ks:
        ms[f'UnbiasHit@{k}'] = ms[f'TopHit@{k}'] - ms[f'BottomHit@{k}']
    return ms

def calc_zero_shot_metric(df,pred_col,label_col='DMS_score',top_test=True):
    """Score one assay's predictions against its measured labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `label_col` and `pred_col`.
    pred_col : str
        Column holding the model predictions.
    label_col : str
        Column holding the measured scores.
    top_test : bool
        When truthy, the two-extreme hit-rate metrics are appended as well.

    Returns
    -------
    dict
        'Spearman' (correlation of ranks), 'AUC', 'MCC', 'NDCG' and 'AP', where
        the binary classes are "strictly above the 90th percentile"; plus the
        keys produced by `calc_two_extreme_metric` when `top_test` is truthy.
    """
    labels = df[label_col]
    preds = df[pred_col]

    # Positive class = values strictly above the column's own 90th percentile.
    label_bin = (labels>np.percentile(labels.values,90))+0
    pred_bin = (preds>np.percentile(preds.values,90))+0

    ms = {}
    ms['Spearman'] = labels.rank().corr(preds.rank())
    ms['AUC'] = roc_auc_score(label_bin,preds)
    ms['MCC'] = matthews_corrcoef(label_bin,pred_bin)
    # Label ranks serve as relevance; the ranking is truncated at 10% of the rows.
    ms['NDCG'] = ndcg_score(labels.rank().values.reshape(1,-1),
                            preds.values.reshape(1,-1),
                            k=df.shape[0]//10)
    ms['AP'] = average_precision_score(label_bin,preds)

    if not top_test:
        return ms

    # Rank rows by label so `top_frac` expresses each row's position in [0, 1).
    ranked = df.sort_values(by=label_col)
    ranked['rank'] = np.arange(0,ranked.shape[0])
    ranked['top_frac'] = ranked['rank'] / ranked.shape[0]
    low_extreme = ranked.loc[(ranked['top_frac']<(top_test_frac))].reset_index(drop=True)
    high_extreme = ranked.loc[(ranked['top_frac']>=(1-top_test_frac))].reset_index(drop=True)
    ms.update(calc_two_extreme_metric(ranked,low_extreme,high_extreme,pred_col))
    return ms

def get_mutant_count(x):
    """Count the ':'-separated entries over all non-empty values of mapping `x`.

    Values equal to the empty string contribute nothing; every other value
    contributes the number of pieces obtained by splitting it on ':'.
    """
    return sum(len(x[key].split(':')) for key in x if x[key] != '')

def get_zero_shot_metric_df(path,pred_col,contig=False,mod=False):
    """Compute zero-shot metrics for every assay listed in `bindingGYM`.

    For each DMS_id the prediction file `{path}/{DMS_id}.csv` is loaded and
    checked to have as many rows as the reference assay file. When `contig`
    or `mod` is truthy, only rows whose `mutant` entry holds fewer than two
    mutations are kept, and assays left with under 100 rows are skipped.

    Returns a DataFrame with one row per scored assay: 'DMS_id' followed by
    the metric columns from `calc_zero_shot_metric`.
    """
    single_mut_only = contig or mod
    metrics_by_id = {}
    for DMS_id in bindingGYM['DMS_id'].values:
        reference = pd.read_csv(f'./input/Binding_substitutions_DMS/{DMS_id}.csv')
        print(DMS_id)
        scored = pd.read_csv(f'{path}/{DMS_id}.csv')

        assert scored.shape[0] == reference.shape[0]
        if single_mut_only:
            # NOTE(review): eval() executes whatever text is in the CSV column;
            # only use on trusted files (ast.literal_eval would be safer if the
            # column always holds plain dict literals -- TODO confirm).
            scored['mutant'] = scored['mutant'].apply(eval)
            mutation_counts = scored['mutant'].apply(get_mutant_count)
            scored = scored.loc[mutation_counts<2].reset_index(drop=True)
            if scored.shape[0] < 100:
                continue
        metrics_by_id[DMS_id] = calc_zero_shot_metric(scored,pred_col)

    result = pd.DataFrame(metrics_by_id.values())
    result.insert(0,'DMS_id',metrics_by_id.keys())
    return result


def get_finetune_intra_random_metric_df(path,pred_col,model_type):
    """Score out-of-fold predictions of the intra-assay random split.

    For each DMS_id in `bindingGYM`, `oof.csv` is read from
    `{path}/train_on_{DMS_id}_intra_random_{model_type}_seed{seed}/`, checked
    against the reference assay's row count and for missing `fold` values,
    and scored three ways: all rows, rows whose `mutant` string has fewer
    than three ':'-separated entries (missing values count as one entry),
    and rows with three or more. A subset is scored only if it has at least
    100 rows.

    Returns (all_rows_df, one_or_two_df, multi_df), each with 'DMS_id' as the
    first column.
    """
    def _as_frame(metric_by_id):
        # One row per assay, with the assay id in front of the metric columns.
        frame = pd.DataFrame(metric_by_id.values())
        frame.insert(0,'DMS_id',metric_by_id.keys())
        return frame

    overall, few_mut, many_mut = {}, {}, {}
    for DMS_id in bindingGYM['DMS_id'].values:
        print(DMS_id)
        reference = pd.read_csv(f'./input/Binding_substitutions_DMS/{DMS_id}.csv')
        oof = pd.read_csv(f'{path}/train_on_{DMS_id}_intra_random_{model_type}_seed{seed}/oof.csv')
        assert reference.shape[0] == oof.shape[0]
        assert oof['fold'].isna().sum()==0
        overall[DMS_id] = calc_zero_shot_metric(oof,pred_col,top_test=False)

        entry_counts = oof['mutant'].fillna('').apply(lambda s:len(s.split(':')))
        for keep, bucket in ((entry_counts<3, few_mut), (entry_counts>=3, many_mut)):
            subset = oof.loc[keep].reset_index(drop=True)
            print(subset.shape)
            if subset.shape[0] >= 100:
                bucket[DMS_id] = calc_zero_shot_metric(subset,pred_col,top_test=False)

    return _as_frame(overall),_as_frame(few_mut),_as_frame(many_mut)


def _get_finetune_intra_single_mut_metric_df(path,pred_col,model_type,split):
    """Shared implementation for the intra-assay 'contig' and 'mod' splits.

    For each DMS_id in `bindingGYM`, the reference assay is passed through
    `DMS_file_for_LLM(..., focus=True)` and reduced to rows whose `mutant`
    string has fewer than two ':'-separated entries; assays left with under
    100 rows are skipped. The matching
    `{path}/train_on_{DMS_id}_intra_{split}_{model_type}_seed{seed}/oof.csv`
    must have the same number of rows and no missing `fold` values, and is
    scored with `calc_zero_shot_metric(top_test=False)`.

    Returns a DataFrame with 'DMS_id' followed by the metric columns.
    """
    zero_shot_metric = {}
    for DMS_id in bindingGYM['DMS_id'].values:
        train = pd.read_csv(f'./input/Binding_substitutions_DMS/{DMS_id}.csv')
        train = DMS_file_for_LLM(train,focus=True)
        train = train.loc[train['mutant'].apply(lambda x:len(x.split(':'))<2)].reset_index(drop=True)
        if train.shape[0] < 100:
            continue
        print(DMS_id)
        df = pd.read_csv(f'{path}/train_on_{DMS_id}_intra_{split}_{model_type}_seed{seed}/oof.csv')
        # The filtered reference is only used to validate the prediction file's size.
        assert train.shape[0] == df.shape[0]
        assert df['fold'].isna().sum()==0
        zero_shot_metric[DMS_id] = calc_zero_shot_metric(df,pred_col,top_test=False)

    zero_shot_metric_df = pd.DataFrame(zero_shot_metric.values())
    zero_shot_metric_df.insert(0,'DMS_id',zero_shot_metric.keys())
    return zero_shot_metric_df


def get_finetune_intra_contig_metric_df(path,pred_col,model_type):
    """Score out-of-fold predictions of the intra-assay 'contig' split.

    Reads `train_on_{DMS_id}_intra_contig_{model_type}_seed{seed}/oof.csv`
    under `path` for every eligible assay and returns one metric row per assay.
    """
    return _get_finetune_intra_single_mut_metric_df(path,pred_col,model_type,'contig')


def get_finetune_intra_mod_metric_df(path,pred_col,model_type):
    """Score out-of-fold predictions of the intra-assay 'mod' split.

    Reads `train_on_{DMS_id}_intra_mod_{model_type}_seed{seed}/oof.csv`
    under `path` for every eligible assay and returns one metric row per assay.
    """
    return _get_finetune_intra_single_mut_metric_df(path,pred_col,model_type,'mod')


def get_finetune_intra_top_test_metric_df(path,pred_col,model_type):
    """Score the intra-assay two-extreme (top/bottom test) runs.

    For each DMS_id in `bindingGYM`, `oof.csv` and `pred.csv` are read from
    `{path}/train_on_{DMS_id}_intra_top_test_{model_type}_seed{seed}/`.
    Rows of `pred.csv` with `top_frac` below 0.5 form the bottom test set and
    rows above 0.5 the top test set (rows at exactly 0.5 are dropped); within
    each set the maximum prediction per `rank` is kept. The hit-rate metrics
    come from `calc_two_extreme_metric`.

    Assays whose files are missing or malformed are skipped, but the reason
    is printed instead of being silently discarded.

    Returns a DataFrame with 'DMS_id' followed by the hit-rate columns.
    """
    zero_shot_metric = {}
    for DMS_id in bindingGYM['DMS_id'].values:
        run_dir = f'{path}/train_on_{DMS_id}_intra_top_test_{model_type}_seed{seed}'
        try:
            df = pd.read_csv(f'./input/Binding_substitutions_DMS/{DMS_id}.csv')
            train = pd.read_csv(f'{run_dir}/oof.csv')
            test = pd.read_csv(f'{run_dir}/pred.csv')
            # Sanity line: row-count consistency (using fold 0 of the test predictions),
            # number of missing folds, and the two frame shapes.
            print(DMS_id,df.shape[0]==(train.shape[0]+test.query("fold==0").shape[0]),train['fold'].isna().sum(),train.shape,test.shape)
            # pred.csv may hold several rows per `rank` (presumably one per fold --
            # TODO confirm); collapse them by keeping the highest prediction.
            bottom_test = test.loc[test['top_frac']<0.5]
            bottom_test = bottom_test.groupby('rank')[pred_col].max().reset_index()
            top_test = test.loc[test['top_frac']>0.5]
            top_test = top_test.groupby('rank')[pred_col].max().reset_index()

            zero_shot_metric[DMS_id] = calc_two_extreme_metric(train,bottom_test,top_test,pred_col)
        except Exception as exc:
            # Best-effort: keep going with the remaining assays, but say why this
            # one was dropped. `Exception` (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate so the loop can be interrupted.
            print(f'[skipped] {DMS_id}: {type(exc).__name__}: {exc}')
    zero_shot_metric_df = pd.DataFrame(zero_shot_metric.values())
    zero_shot_metric_df.insert(0,'DMS_id',zero_shot_metric.keys())
    return zero_shot_metric_df


def get_finetune_inter_metric_df(path,pred_col,model_type):
    """Score out-of-fold predictions of the inter-assay cluster split.

    For each DMS_id in `bindingGYM`, `{DMS_id}_oof.csv` is read from
    `{path}/train_on_BindingGYM_inter_cluster_{model_type}_seed{seed}/`,
    de-duplicated on the string form of `mutant` (stored in helper column
    'a'), and checked against the reference assay's row count. Metrics are
    computed for all rows, for rows whose `mutant` string has fewer than
    three ':'-separated entries (missing values count as one entry), and for
    rows with three or more; a subset is scored only if it has at least 100
    rows.

    Returns (all_rows_df, one_or_two_df, multi_df), each with 'DMS_id' as the
    first column.
    """
    def _as_frame(metric_by_id):
        # One row per assay, with the assay id in front of the metric columns.
        frame = pd.DataFrame(metric_by_id.values())
        frame.insert(0,'DMS_id',metric_by_id.keys())
        return frame

    overall, few_mut, many_mut = {}, {}, {}
    for DMS_id in bindingGYM['DMS_id'].values:
        print(DMS_id)
        reference = pd.read_csv(f'./input/Binding_substitutions_DMS/{DMS_id}.csv')
        oof = pd.read_csv(f'{path}/train_on_BindingGYM_inter_cluster_{model_type}_seed{seed}/{DMS_id}_oof.csv')
        # Keep the first row for every distinct mutant string.
        oof['a'] = oof['mutant'].astype(str)
        oof = oof.drop_duplicates('a').reset_index(drop=True)
        assert reference.shape[0] == oof.shape[0]

        overall[DMS_id] = calc_zero_shot_metric(oof,pred_col,top_test=False)

        entry_counts = oof['mutant'].fillna('').apply(lambda s:len(s.split(':')))
        for keep, bucket in ((entry_counts<3, few_mut), (entry_counts>=3, many_mut)):
            subset = oof.loc[keep].reset_index(drop=True)
            print(subset.shape)
            if subset.shape[0] >= 100:
                bucket[DMS_id] = calc_zero_shot_metric(subset,pred_col,top_test=False)

    return _as_frame(overall),_as_frame(few_mut),_as_frame(many_mut)


# zero-shot

In [5]:
# Score the per-assay prediction CSVs under ./modelzoo/esm2/output, using column 'esm2_t33_650M_UR50D' as the prediction.
ESM2_zero_shot_metric_df = get_zero_shot_metric_df('./modelzoo/esm2/output','esm2_t33_650M_UR50D')

4D5_HER2_fitness_1N8Z
5A12_Ang2_fitness_4ZFG
5A12_VEGF_fitness_4ZFF
Z-domain_ZpA963_HL1_fitness_2M5A
Z-domain_ZpA963_HL2_fitness_2M5A
Z-domain_ZSPA-1_LL1_fitness_1LP1
Z-domain_ZSPA-1_LL2_fitness_1LP1
CXCR4_CXCL12_enrich_8U4O
hYAP65_peptide_FunctioncalScore_1JMQ
GB1_IgG-Fc_fitness_1FCC
GB1_IgG-Fc_fitness_1FCC_2016
SARS2-RBD_ACE2_deltaKd_6M0J
KRAS_DARPinK27_norfitness_5O2S
KRAS_PICK3CG-RBD_norfitness_1HE8
KRAS_RAF1_norfitness_6VJJ
KRAS_RAF1-RBD_norfitness_6VJJ
KRAS_RALGDS-RBD_norfitness_1LFD
KRAS_SOS1_norfitness_8BE4
BH3_Mcl-1_normed_3KZ0
BH3_Bcl-xL_normed_1PQ1
HLA-A2_TAPBPR_meanscore_5WER
PSD95_CRIPT_1BE9
PSD95_Tm2F_1BE9
ACE2_SARS2-RBD_enrich_6M17
CD19_FMC63_Fitness_7URV


ESM2_zero_shot_metric_df

# intra random

In [8]:
# Intra-assay random split, model_type 'structure': metrics on all rows, on rows with <3 mutant entries, and on rows with >=3.
ProteinMPNN_intra_random_metric_df,ProteinMPNN_intra_random_metric_oneORtwo_df,ProteinMPNN_intra_random_metric_multi_df = get_finetune_intra_random_metric_df('./training/output/','pred','structure')

4D5_HER2_fitness_1N8Z
(1, 10)
(2079, 10)
5A12_Ang2_fitness_4ZFG
(165, 10)
(779, 10)
5A12_VEGF_fitness_4ZFF
(230, 10)
(29751, 10)
Z-domain_ZpA963_HL1_fitness_2M5A
(194, 10)
(2710, 10)
Z-domain_ZpA963_HL2_fitness_2M5A
(265, 10)
(335, 10)
Z-domain_ZSPA-1_LL1_fitness_1LP1
(40, 10)
(45436, 10)
Z-domain_ZSPA-1_LL2_fitness_1LP1
(32, 10)
(5551, 10)
CXCR4_CXCL12_enrich_8U4O
(5585, 10)
(0, 10)
hYAP65_peptide_FunctioncalScore_1JMQ
(7316, 10)
(11091, 10)
GB1_IgG-Fc_fitness_1FCC
(92891, 11)
(0, 11)
GB1_IgG-Fc_fitness_1FCC_2016
(539, 10)
(21637, 10)
SARS2-RBD_ACE2_deltaKd_6M0J
(11602, 11)
(10270, 11)
KRAS_DARPinK27_norfitness_5O2S
(19533, 11)
(0, 11)
KRAS_PICK3CG-RBD_norfitness_1HE8
(19203, 11)
(0, 11)
KRAS_RAF1_norfitness_6VJJ
(12677, 11)
(0, 11)
KRAS_RAF1-RBD_norfitness_6VJJ
(23162, 11)
(0, 11)
KRAS_RALGDS-RBD_norfitness_1LFD
(20341, 11)
(0, 11)
KRAS_SOS1_norfitness_8BE4
(19425, 11)
(0, 11)
BH3_Mcl-1_normed_3KZ0
(226, 11)
(292, 11)
BH3_Bcl-xL_normed_1PQ1
(226, 11)
(292, 11)
HLA-A2_TAPBPR_meanscore

In [9]:
ProteinMPNN_intra_random_metric_df

Unnamed: 0,DMS_id,Spearman,AUC,MCC,NDCG,AP
0,4D5_HER2_fitness_1N8Z,0.317166,0.814974,0.455128,0.93505,0.408442
1,5A12_Ang2_fitness_4ZFG,0.254951,0.660567,0.21582,0.785125,0.274243
2,5A12_VEGF_fitness_4ZFF,0.722979,0.741228,0.12035,0.745749,0.189638
3,Z-domain_ZpA963_HL1_fitness_2M5A,0.869175,0.957373,0.602811,0.970719,0.701513
4,Z-domain_ZpA963_HL2_fitness_2M5A,0.644945,0.801235,0.37037,0.883715,0.377232
5,Z-domain_ZSPA-1_LL1_fitness_1LP1,0.649021,0.899517,0.433224,0.89685,0.473385
6,Z-domain_ZSPA-1_LL2_fitness_1LP1,0.797611,0.927596,0.469216,0.927823,0.513511
7,CXCR4_CXCL12_enrich_8U4O,0.373208,0.665732,0.089554,0.680857,0.159055
8,hYAP65_peptide_FunctioncalScore_1JMQ,0.623248,0.822534,0.304713,0.834672,0.341969
9,GB1_IgG-Fc_fitness_1FCC,0.947442,0.97071,0.698927,0.972606,0.818233


# intra contig

In [11]:
# Intra-assay 'contig' split, model_type 'structure'; only assays with at least 100 single-mutation rows are scored.
ProteinMPNN_intra_contig_metric_df = get_finetune_intra_contig_metric_df('./training/output/','pred','structure')

CXCR4_CXCL12_enrich_8U4O
hYAP65_peptide_FunctioncalScore_1JMQ
GB1_IgG-Fc_fitness_1FCC
SARS2-RBD_ACE2_deltaKd_6M0J
KRAS_DARPinK27_norfitness_5O2S
KRAS_PICK3CG-RBD_norfitness_1HE8
KRAS_RAF1_norfitness_6VJJ
KRAS_RAF1-RBD_norfitness_6VJJ
KRAS_RALGDS-RBD_norfitness_1LFD
KRAS_SOS1_norfitness_8BE4
BH3_Mcl-1_normed_3KZ0
BH3_Bcl-xL_normed_1PQ1
HLA-A2_TAPBPR_meanscore_5WER
PSD95_CRIPT_1BE9
PSD95_Tm2F_1BE9
ACE2_SARS2-RBD_enrich_6M17
CD19_FMC63_Fitness_7URV


In [12]:
ProteinMPNN_intra_contig_metric_df

Unnamed: 0,DMS_id,Spearman,AUC,MCC,NDCG,AP
0,CXCR4_CXCL12_enrich_8U4O,0.170303,0.569366,0.079614,0.639129,0.131779
1,hYAP65_peptide_FunctioncalScore_1JMQ,0.422669,0.647878,0.156764,0.754375,0.25444
2,GB1_IgG-Fc_fitness_1FCC,0.680042,0.797409,0.206012,0.80046,0.270881
3,SARS2-RBD_ACE2_deltaKd_6M0J,0.711209,0.843382,0.274464,0.750105,0.30075
4,KRAS_DARPinK27_norfitness_5O2S,0.433501,0.671632,0.145564,0.646265,0.209365
5,KRAS_PICK3CG-RBD_norfitness_1HE8,0.577856,0.765548,0.271886,0.807324,0.280114
6,KRAS_RAF1_norfitness_6VJJ,0.470101,0.658183,0.122234,0.754882,0.195044
7,KRAS_RAF1-RBD_norfitness_6VJJ,0.554251,0.722435,0.206785,0.803003,0.267409
8,KRAS_RALGDS-RBD_norfitness_1LFD,0.615678,0.73017,0.185007,0.749766,0.245329
9,KRAS_SOS1_norfitness_8BE4,0.435488,0.665026,0.176408,0.687611,0.216703


# intra mod

In [16]:
# Intra-assay 'mod' split, model_type 'structure'; only assays with at least 100 single-mutation rows are scored.
ProteinMPNN_intra_mod_metric_df = get_finetune_intra_mod_metric_df('./training/output/','pred','structure')

CXCR4_CXCL12_enrich_8U4O
hYAP65_peptide_FunctioncalScore_1JMQ
GB1_IgG-Fc_fitness_1FCC
SARS2-RBD_ACE2_deltaKd_6M0J
KRAS_DARPinK27_norfitness_5O2S
KRAS_PICK3CG-RBD_norfitness_1HE8
KRAS_RAF1_norfitness_6VJJ
KRAS_RAF1-RBD_norfitness_6VJJ
KRAS_RALGDS-RBD_norfitness_1LFD
KRAS_SOS1_norfitness_8BE4
BH3_Mcl-1_normed_3KZ0
BH3_Bcl-xL_normed_1PQ1
HLA-A2_TAPBPR_meanscore_5WER
PSD95_CRIPT_1BE9
PSD95_Tm2F_1BE9
ACE2_SARS2-RBD_enrich_6M17
CD19_FMC63_Fitness_7URV


In [17]:
ProteinMPNN_intra_mod_metric_df

Unnamed: 0,DMS_id,Spearman,AUC,MCC,NDCG,AP
0,CXCR4_CXCL12_enrich_8U4O,0.249628,0.600506,0.05576,0.631878,0.135348
1,hYAP65_peptide_FunctioncalScore_1JMQ,0.475446,0.677719,0.195093,0.667926,0.184745
2,GB1_IgG-Fc_fitness_1FCC,0.747018,0.838186,0.280117,0.805842,0.332463
3,SARS2-RBD_ACE2_deltaKd_6M0J,0.686084,0.824481,0.228455,0.727564,0.270192
4,KRAS_DARPinK27_norfitness_5O2S,0.456405,0.667103,0.145564,0.647415,0.193736
5,KRAS_PICK3CG-RBD_norfitness_1HE8,0.58328,0.758694,0.259259,0.744456,0.277673
6,KRAS_RAF1_norfitness_6VJJ,0.489315,0.670855,0.168923,0.768275,0.212346
7,KRAS_RAF1-RBD_norfitness_6VJJ,0.60698,0.751092,0.24998,0.782761,0.289666
8,KRAS_RALGDS-RBD_norfitness_1LFD,0.613052,0.726418,0.180649,0.743177,0.236496
9,KRAS_SOS1_norfitness_8BE4,0.462754,0.693408,0.140984,0.680403,0.200998


# intra two-extreme (top/bottom test)

In [18]:
# Intra-assay top/bottom-extreme test runs, model_type 'structure'; yields TopHit/BottomHit/UnbiasHit@k per assay.
ProteinMPNN_intra_top_test_metric_df = get_finetune_intra_top_test_metric_df('./training/output','pred','structure')

5A12_VEGF_fitness_4ZFF True 0 (23984, 12) (29985, 12)
Z-domain_ZpA963_HL1_fitness_2M5A True 0 (2323, 12) (2905, 12)
Z-domain_ZpA963_HL2_fitness_2M5A True 0 (480, 12) (600, 12)
Z-domain_ZSPA-1_LL1_fitness_1LP1 True 0 (36381, 12) (45475, 12)
Z-domain_ZSPA-1_LL2_fitness_1LP1 True 0 (4466, 12) (5585, 12)
CXCR4_CXCL12_enrich_8U4O True 0 (4468, 12) (5585, 12)
hYAP65_peptide_FunctioncalScore_1JMQ True 0 (14726, 12) (18405, 12)
GB1_IgG-Fc_fitness_1FCC True 0 (74312, 13) (92895, 13)
GB1_IgG-Fc_fitness_1FCC_2016 True 0 (17741, 12) (22175, 12)
SARS2-RBD_ACE2_deltaKd_6M0J True 0 (17497, 13) (21875, 13)
KRAS_DARPinK27_norfitness_5O2S True 0 (15626, 13) (19535, 13)
KRAS_PICK3CG-RBD_norfitness_1HE8 True 0 (15362, 13) (19205, 13)
KRAS_RAF1_norfitness_6VJJ True 0 (10142, 13) (12675, 13)
KRAS_RAF1-RBD_norfitness_6VJJ True 0 (18529, 13) (23165, 13)
KRAS_RALGDS-RBD_norfitness_1LFD True 0 (16272, 13) (20345, 13)
KRAS_SOS1_norfitness_8BE4 True 0 (15540, 13) (19425, 13)
BH3_Mcl-1_normed_3KZ0 True 0 (415, 13)

In [19]:
ProteinMPNN_intra_top_test_metric_df

Unnamed: 0,DMS_id,TopHit@10,BottomHit@10,TopHit@20,BottomHit@20,TopHit@50,BottomHit@50,TopHit@100,BottomHit@100,UnbiasHit@10,UnbiasHit@20,UnbiasHit@50,UnbiasHit@100
0,5A12_VEGF_fitness_4ZFF,0.9,0.7,0.75,0.55,0.68,0.4,0.63,0.34,0.2,0.2,0.28,0.29
1,Z-domain_ZpA963_HL1_fitness_2M5A,1.0,0.2,1.0,0.15,0.96,0.16,0.85,0.17,0.8,0.85,0.8,0.68
2,Z-domain_ZpA963_HL2_fitness_2M5A,0.8,0.1,0.75,0.1,0.66,0.18,0.6,0.15,0.7,0.65,0.48,0.45
3,Z-domain_ZSPA-1_LL1_fitness_1LP1,0.9,0.0,0.95,0.1,0.96,0.1,0.93,0.12,0.9,0.85,0.86,0.81
4,Z-domain_ZSPA-1_LL2_fitness_1LP1,0.9,0.5,0.9,0.45,0.84,0.34,0.86,0.35,0.4,0.45,0.5,0.51
5,CXCR4_CXCL12_enrich_8U4O,0.4,0.1,0.45,0.05,0.52,0.06,0.47,0.06,0.3,0.4,0.46,0.41
6,hYAP65_peptide_FunctioncalScore_1JMQ,1.0,0.1,0.75,0.05,0.82,0.06,0.8,0.06,0.9,0.7,0.76,0.74
7,GB1_IgG-Fc_fitness_1FCC,1.0,0.2,1.0,0.25,1.0,0.32,1.0,0.35,0.8,0.75,0.68,0.65
8,GB1_IgG-Fc_fitness_1FCC_2016,0.8,0.2,0.9,0.3,0.94,0.16,0.95,0.16,0.6,0.6,0.78,0.79
9,SARS2-RBD_ACE2_deltaKd_6M0J,0.9,0.5,0.75,0.45,0.8,0.4,0.76,0.42,0.4,0.3,0.4,0.34


# inter cluster

In [21]:
# Inter-assay cluster split, model_type 'structure': metrics on all rows, on rows with <3 mutant entries, and on rows with >=3.
ProteinMPNN_finetune_inter_metric_df,ProteinMPNN_finetune_inter_metric_oneORtwo_df,ProteinMPNN_finetune_inter_metric_multi_df = get_finetune_inter_metric_df('./training/output','pred','structure')

4D5_HER2_fitness_1N8Z
(1, 11)
(2079, 11)
5A12_Ang2_fitness_4ZFG
(165, 11)
(779, 11)
5A12_VEGF_fitness_4ZFF
(230, 11)
(29751, 11)
Z-domain_ZpA963_HL1_fitness_2M5A
(194, 11)
(2710, 11)
Z-domain_ZpA963_HL2_fitness_2M5A
(265, 11)
(335, 11)
Z-domain_ZSPA-1_LL1_fitness_1LP1
(40, 11)
(45436, 11)
Z-domain_ZSPA-1_LL2_fitness_1LP1
(32, 11)
(5551, 11)
CXCR4_CXCL12_enrich_8U4O
(5585, 11)
(0, 11)
hYAP65_peptide_FunctioncalScore_1JMQ
(7316, 11)
(11091, 11)
GB1_IgG-Fc_fitness_1FCC
(92891, 11)
(0, 11)
GB1_IgG-Fc_fitness_1FCC_2016
(539, 11)
(21637, 11)
SARS2-RBD_ACE2_deltaKd_6M0J
(11602, 11)
(10270, 11)
KRAS_DARPinK27_norfitness_5O2S
(19533, 11)
(0, 11)
KRAS_PICK3CG-RBD_norfitness_1HE8
(19203, 11)
(0, 11)
KRAS_RAF1_norfitness_6VJJ
(12677, 11)
(0, 11)
KRAS_RAF1-RBD_norfitness_6VJJ
(23162, 11)
(0, 11)
KRAS_RALGDS-RBD_norfitness_1LFD
(20341, 11)
(0, 11)
KRAS_SOS1_norfitness_8BE4
(19425, 11)
(0, 11)
BH3_Mcl-1_normed_3KZ0
(226, 11)
(292, 11)
BH3_Bcl-xL_normed_1PQ1
(226, 11)
(292, 11)
HLA-A2_TAPBPR_meanscore

In [22]:
ProteinMPNN_finetune_inter_metric_df

Unnamed: 0,DMS_id,Spearman,AUC,MCC,NDCG,AP
0,4D5_HER2_fitness_1N8Z,0.399684,0.871302,0.348291,0.864284,0.376309
1,5A12_Ang2_fitness_4ZFG,0.106999,0.627698,0.07537,0.553554,0.15119
2,5A12_VEGF_fitness_4ZFF,0.445998,0.645043,0.041952,0.684833,0.13851
3,Z-domain_ZpA963_HL1_fitness_2M5A,0.172083,0.684914,0.098686,0.588519,0.178022
4,Z-domain_ZpA963_HL2_fitness_2M5A,0.371584,0.738364,0.240741,0.707255,0.250221
5,Z-domain_ZSPA-1_LL1_fitness_1LP1,0.012115,0.432183,-0.045078,0.530476,0.083945
6,Z-domain_ZSPA-1_LL2_fitness_1LP1,0.16512,0.685795,0.157108,0.746681,0.20401
7,CXCR4_CXCL12_enrich_8U4O,0.208581,0.562614,0.039857,0.633218,0.122393
8,hYAP65_peptide_FunctioncalScore_1JMQ,0.313728,0.689958,0.123649,0.668949,0.171189
9,GB1_IgG-Fc_fitness_1FCC,0.574222,0.720079,0.096782,0.682331,0.174469
