# Deletion Specific Ensemble

In [7]:
import numpy as np
import pandas as pd
import csv
import os
from tqdm.auto import tqdm
from scipy.stats import rankdata
import Levenshtein
import matplotlib.pyplot as plt
import seaborn as sns

two_colors = sns.xkcd_palette(['red', 'bright blue'])

In [8]:
#Load data
curr_dir = os.getcwd()

# Wild type sequence provided in the "Dataset Description":
wt = 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'

# Read testing set sequences and pH:
test_df = pd.read_csv(curr_dir + "/input/novozymes-enzyme-stability-prediction/test.csv")

In [9]:
# Add mutation information to testing set:
result = []
for _, row in test_df.iterrows():
    ops = Levenshtein.editops(wt, row['protein_sequence'])
    #print("n")
    #print(ops)
    assert len(ops) <= 1
    if len(ops) > 0 and ops[0][0] == 'replace':
        idx = ops[0][1]
        result.append(['SUB', idx + 1, wt[idx], row['protein_sequence'][idx]])
    elif len(ops) == 0:
        result.append(['WT', 0, '', ''])
    elif ops[0][0] == 'insert':
        assert False, "Ups"
    elif ops[0][0] == 'delete':
        idx = ops[0][1]
        result.append(['DEL', idx + 1, wt[idx], '_'])
    else:
        assert False, "Ups"

test_df = pd.concat([test_df, pd.DataFrame(data=result, columns=['type', 'resid', 'wt', 'mut'])], axis=1)

test_df

Unnamed: 0,seq_id,protein_sequence,pH,data_source,type,resid,wt,mut
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,E
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,K
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,17,L,_
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,C
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,F
...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,I
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,L
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,N
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,P


# Blosum

In [10]:
def blosum_apply(row):
    if row['type'] == 'SUB':
        return blosum.loc[row['wt'], row['mut']]
    elif row['type'] == 'DEL':
        return -10
    elif row['type'] == 'WT':
        return 0
    else:
        assert False, "Ups"

blosum = pd.read_csv(curr_dir + '/input/blosum_data/BLOSUM100.txt', sep='\s+', comment='#')
test_df['blosum'] = test_df.apply(blosum_apply, axis=1)
test_df['blosum_rank'] = rankdata(test_df['blosum'])

test_df

Unnamed: 0,seq_id,protein_sequence,pH,data_source,type,resid,wt,mut,blosum,blosum_rank
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,E,-7,427.5
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,K,-6,659.0
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,17,L,_,-10,43.5
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,C,-8,207.5
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,F,-6,659.0
...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,I,-4,1316.0
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,L,-4,1316.0
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,N,-4,1316.0
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,P,-2,1838.5


# PLDDT

In [11]:
plddt = (
    pd.read_csv(curr_dir + '/input/novozymes-enzyme-stability-prediction/wildtype_structure_prediction_af2.pdb', sep='\s+', header=None)[[0,5,10]]
    .rename(columns={0:'atom', 5:'resid', 10:'plddt'})
    .query('atom=="ATOM"')
    .drop_duplicates()
)


# Add B factor to the testing set:
test_df = pd.merge(
    test_df,
    plddt,
    left_on='resid',
    right_on='resid',
    how='left'
)



test_df['plddt_rank'] = rankdata(-1*test_df['plddt'])

test_df

Unnamed: 0,seq_id,protein_sequence,pH,data_source,type,resid,wt,mut,blosum,blosum_rank,atom,plddt,plddt_rank
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,E,-7,427.5,ATOM,55.23,2408.0
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,K,-6,659.0,ATOM,55.23,2408.0
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,17,L,_,-10,43.5,ATOM,55.23,2408.0
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,C,-8,207.5,ATOM,69.25,2386.5
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,F,-6,659.0,ATOM,69.25,2386.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,I,-4,1316.0,ATOM,55.85,2398.5
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,L,-4,1316.0,ATOM,55.85,2398.5
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,N,-4,1316.0,ATOM,55.85,2398.5
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,P,-2,1838.5,ATOM,55.85,2398.5


# Differential PLDDT

In [13]:
plddtdiff = []

# Wild type result:
wt_plddt = (
    pd.read_csv(curr_dir + '/input/diff_pLDDT_data/nesp-kvigly-test-mutation-pdbs/WT_unrelaxed_rank_1_model_3.pdb', sep='\s+')
    .loc['ATOM'].reset_index()
    .loc[:, ['level_4', 'MODEL']].drop_duplicates()
    .rename(columns={'level_4':'resid', 'MODEL':'plddt'})
    .astype({'resid':int})
    .set_index('resid')
)

# Add difference in pLDDTto the testing set:>
for _,row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    file_path = curr_dir + '/input/diff_plddt_data/nesp-kvigly-test-mutation-pdbs/{}{}{}_unrelaxed_rank_1_model_3.pdb'.format(row['wt'], row['resid'], row['mut'])
    if os.path.exists(file_path):
        tdf = (
            pd.read_csv(file_path, sep='\s+')
            .loc['ATOM'].reset_index()
            .loc[:, ['level_4', 'MODEL']].drop_duplicates()
            .rename(columns={'level_4':'resid', 'MODEL':'plddt'})
            .astype({'resid':int})
            .set_index('resid')
        )
        plddtdiff.append((tdf.loc[row['resid']] - wt_plddt.loc[row['resid']]).values[0])
    else:
        plddtdiff.append(np.nan)

test_df['plddtdiff'] = plddtdiff
test_df['plddtdiff_rank'] = rankdata(test_df['plddtdiff'])

test_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2413.0), HTML(value='')))




Unnamed: 0,seq_id,protein_sequence,pH,data_source,type,resid,wt,mut,blosum,blosum_rank,atom,plddt,plddt_rank,plddtdiff,plddtdiff_rank
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,E,-7,427.5,ATOM,55.23,2408.0,-2.13,767.0
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,K,-6,659.0,ATOM,55.23,2408.0,-3.79,459.0
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,17,L,_,-10,43.5,ATOM,55.23,2408.0,8.54,2409.0
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,C,-8,207.5,ATOM,69.25,2386.5,-21.67,44.0
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,F,-6,659.0,ATOM,69.25,2386.5,8.84,2411.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,I,-4,1316.0,ATOM,55.85,2398.5,2.93,2400.0
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,L,-4,1316.0,ATOM,55.85,2398.5,-4.92,350.0
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,N,-4,1316.0,ATOM,55.85,2398.5,-3.94,441.0
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,P,-2,1838.5,ATOM,55.85,2398.5,-9.64,140.0


# Deep DDG

In [20]:
ddg = pd.read_csv(curr_dir + '/output/ddgout.csv')
print(ddg)

ddg_rank = ddg['ddg_rank']
test_df.merge(ddg_rank)

test_df


Unnamed: 0,seq_id,protein_sequence,pH,data_source,type,resid,wt,mut,blosum,blosum_rank,atom,plddt,plddt_rank,plddtdiff,plddtdiff_rank,ddg_rank
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,E,-7,427.5,ATOM,55.23,2408.0,-2.13,767.0,2114.5
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,17,L,K,-6,659.0,ATOM,55.23,2408.0,-3.79,459.0,2189.5
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,17,L,_,-10,43.5,ATOM,55.23,2408.0,8.54,2409.0,2056.0
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,C,-8,207.5,ATOM,69.25,2386.5,-21.67,44.0,1032.0
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,18,K,F,-6,659.0,ATOM,69.25,2386.5,8.84,2411.0,987.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,I,-4,1316.0,ATOM,55.85,2398.5,2.93,2400.0,2194.0
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,L,-4,1316.0,ATOM,55.85,2398.5,-4.92,350.0,2277.5
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,N,-4,1316.0,ATOM,55.85,2398.5,-3.94,441.0,1911.5
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,16,A,P,-2,1838.5,ATOM,55.85,2398.5,-9.64,140.0,1750.5


# DeMask

In [21]:
demask = pd.read_csv(curr_dir + '/input/demask_data/demaskout.txt', sep='\t', usecols=[0,1,2,3], names=['resid','wt','mut','demask'], skiprows=1)
print(demask)

# Add DeMask output to the testing set:
test_df = pd.merge(
    test_df.set_index(['wt','resid','mut']),
    demask.set_index(['wt','resid','mut']),
    left_index=True,
    right_index=True,
    how='left'
).reset_index()

test_df.loc[test_df['type']=='WT','demask'] = 0
test_df.loc[test_df['type']=='DEL','demask'] = test_df['demask'].dropna().min()


test_df['demask_rank'] = rankdata(test_df['demask'])

test_df

      resid wt mut  demask
0         1  V   A -0.1904
1         1  V   C -0.2861
2         1  V   D -0.4248
3         1  V   E -0.3821
4         1  V   F -0.3354
...     ... ..  ..     ...
4194    221  K   S -0.2735
4195    221  K   T -0.2854
4196    221  K   V -0.2856
4197    221  K   W -0.3435
4198    221  K   Y -0.3194

[4199 rows x 4 columns]


Unnamed: 0,wt,resid,mut,seq_id,protein_sequence,pH,data_source,type,blosum,blosum_rank,atom,plddt,plddt_rank,plddtdiff,plddtdiff_rank,ddg_rank,demask,demask_rank
0,L,17,E,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-7,427.5,ATOM,55.23,2408.0,-2.13,767.0,2114.5,-0.2504,831.0
1,L,17,K,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,ATOM,55.23,2408.0,-3.79,459.0,2189.5,-0.2494,835.5
2,L,17,_,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,-10,43.5,ATOM,55.23,2408.0,8.54,2409.0,2056.0,-0.5986,39.5
3,K,18,C,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-8,207.5,ATOM,69.25,2386.5,-21.67,44.0,1032.0,-0.1813,1239.0
4,K,18,F,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,ATOM,69.25,2386.5,8.84,2411.0,987.0,-0.1822,1233.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,A,16,I,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,2.93,2400.0,2194.0,-0.1099,1706.0
2409,A,16,L,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,-4.92,350.0,2277.5,-0.0388,2113.0
2410,A,16,N,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,-3.94,441.0,1911.5,-0.1224,1618.5
2411,A,16,P,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-2,1838.5,ATOM,55.85,2398.5,-9.64,140.0,1750.5,-0.1917,1179.0


# RMSD

In [22]:
# Read VMD/NAMD output:
namd = pd.read_csv(curr_dir + '/input/RMSD_data/novozymes-md2/residue_rmsd_sasa_last.dat', sep='\t', header=None, names=['resid','rmsd','sasa0','sasaf'])

# Add VMD/NAMD results to the testing set:
test_df = pd.merge(
    test_df,
    namd[['resid','rmsd']],
    left_on='resid',
    right_on='resid',
    how='left'
)

test_df.loc[test_df['type']=='WT','rmsd'] = test_df['rmsd'].dropna().max()
# test_df.loc[test_df['type']=='WT','sasaf'] = test_df['sasaf'].dropna().max()

test_df['rmsd_rank'] = rankdata(test_df['rmsd'])

test_df

Unnamed: 0,wt,resid,mut,seq_id,protein_sequence,pH,data_source,type,blosum,blosum_rank,atom,plddt,plddt_rank,plddtdiff,plddtdiff_rank,ddg_rank,demask,demask_rank,rmsd,rmsd_rank
0,L,17,E,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-7,427.5,ATOM,55.23,2408.0,-2.13,767.0,2114.5,-0.2504,831.0,1.713078,2305.0
1,L,17,K,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,ATOM,55.23,2408.0,-3.79,459.0,2189.5,-0.2494,835.5,1.713078,2305.0
2,L,17,_,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,-10,43.5,ATOM,55.23,2408.0,8.54,2409.0,2056.0,-0.5986,39.5,1.713078,2305.0
3,K,18,C,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-8,207.5,ATOM,69.25,2386.5,-21.67,44.0,1032.0,-0.1813,1239.0,1.291873,2162.5
4,K,18,F,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,ATOM,69.25,2386.5,8.84,2411.0,987.0,-0.1822,1233.0,1.291873,2162.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,A,16,I,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,2.93,2400.0,2194.0,-0.1099,1706.0,1.119549,1938.5
2409,A,16,L,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,-4.92,350.0,2277.5,-0.0388,2113.0,1.119549,1938.5
2410,A,16,N,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,ATOM,55.85,2398.5,-3.94,441.0,1911.5,-0.1224,1618.5,1.119549,1938.5
2411,A,16,P,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-2,1838.5,ATOM,55.85,2398.5,-9.64,140.0,1750.5,-0.1917,1179.0,1.119549,1938.5


# SASA

In [24]:
# Read VMD/NAMD output:
namd = pd.read_csv(curr_dir + '/input/SASA_data/novozymes-md/residue_rmsd_sasa_last.dat', sep='\t', header=None, names=['resid','rmsd','sasa0','sasaf'])

# Add VMD/NAMD results to the testing set:
test_df = pd.merge(
    test_df,
    namd[['resid','sasaf']],
    left_on='resid',
    right_on='resid',
    how='left'
)

# test_df.loc[test_df['type']=='WT','rmsd'] = test_df['rmsd'].dropna().max()
test_df.loc[test_df['type']=='WT','sasaf'] = test_df['sasaf'].dropna().max()
test_df['sasaf_rank'] = rankdata(test_df['sasaf'])

test_df

Unnamed: 0,wt,resid,mut,seq_id,protein_sequence,pH,data_source,type,blosum,blosum_rank,...,plddt_rank,plddtdiff,plddtdiff_rank,ddg_rank,demask,demask_rank,rmsd,rmsd_rank,sasaf,sasaf_rank
0,L,17,E,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-7,427.5,...,2408.0,-2.13,767.0,2114.5,-0.2504,831.0,1.713078,2305.0,157.169113,2319.0
1,L,17,K,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,...,2408.0,-3.79,459.0,2189.5,-0.2494,835.5,1.713078,2305.0,157.169113,2319.0
2,L,17,_,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,DEL,-10,43.5,...,2408.0,8.54,2409.0,2056.0,-0.5986,39.5,1.713078,2305.0,157.169113,2319.0
3,K,18,C,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-8,207.5,...,2386.5,-21.67,44.0,1032.0,-0.1813,1239.0,1.291873,2162.5,114.086571,2159.5
4,K,18,F,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-6,659.0,...,2386.5,8.84,2411.0,987.0,-0.1822,1233.0,1.291873,2162.5,114.086571,2159.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,A,16,I,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,...,2398.5,2.93,2400.0,2194.0,-0.1099,1706.0,1.119549,1938.5,58.142597,1408.5
2409,A,16,L,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,...,2398.5,-4.92,350.0,2277.5,-0.0388,2113.0,1.119549,1938.5,58.142597,1408.5
2410,A,16,N,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-4,1316.0,...,2398.5,-3.94,441.0,1911.5,-0.1224,1618.5,1.119549,1938.5,58.142597,1408.5
2411,A,16,P,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,SUB,-2,1838.5,...,2398.5,-9.64,140.0,1750.5,-0.1917,1179.0,1.119549,1938.5,58.142597,1408.5


# Rosetta

In [25]:
test_df['rosetta_rank'] = pd.read_csv('/input/rosetta_data/submission_rosetta_scores')['tm']

test_df

FileNotFoundError: [Errno 2] No such file or directory: '/input/rosetta_data/submission_rosetta_scores'

# Thermonet

In [None]:
test_df['thermonet'] = pd.read_csv('/input/thermonet_data/submission.csv')['tm']
test_df['thermonet_rank'] = rankdata(test_df['thermonet'])

# Final Dataframe

In [None]:
print('Final test dataframe with rank scores:')
print(test_df.head())
test_df.to_csv('/output/ensemble.csv', index=False)