In [35]:
import pandas as pd
import numpy as np

In [36]:
# Load the table stored in skempi_v2.csv (path is relative to the working directory).
raw_df = pd.read_csv("skempi_v2.csv")
# Bare last expression: render the frame as this cell's output.
raw_df

Unnamed: 0,#Pdb,Mutation(s)_PDB,Mutation(s)_cleaned,iMutation_Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_mut_parsed,Affinity_wt (M),Affinity_wt_parsed,...,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113
0,1CSE_E_I,LI45G,LI38G,COR,Pr/PI,Pr/PI,5.26E-11,5.26E-11,1.12E-12,1.12E-12,...,,,,,,,,,,
1,1CSE_E_I,LI45S,LI38S,COR,Pr/PI,Pr/PI,8.33E-12,8.33E-12,1.12E-12,1.12E-12,...,,,,,,,,,,
2,1CSE_E_I,LI45P,LI38P,COR,Pr/PI,Pr/PI,1.02E-07,1.02E-07,1.12E-12,1.12E-12,...,,,,,,,,,,
3,1CSE_E_I,LI45I,LI38I,COR,Pr/PI,Pr/PI,1.72E-10,1.72E-10,1.12E-12,1.12E-12,...,,,,,,,,,,
4,1CSE_E_I,LI45D,LI38D,COR,Pr/PI,Pr/PI,1.92E-09,1.92E-09,1.12E-12,1.12E-12,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7080,3QIB_ABP_CD,KP9R,KP8R,COR,TCR/pMHC,TCR/pMHC,1JCK_A_B,2.4E-04,2.4E-04,5.5E-06,...,,,,,,,,,,
7081,3QIB_ABP_CD,TP12A,TP11A,COR,TCR/pMHC,TCR/pMHC,1JCK_A_B,>1.1E-03,1.1E-03,5.5E-06,...,,,,,,,,,,
7082,3QIB_ABP_CD,TP12S,TP11S,COR,TCR/pMHC,TCR/pMHC,1JCK_A_B,3.38E-05,3.38E-05,5.5E-06,...,,,,,,,,,,
7083,3QIB_ABP_CD,TP12N,TP11N,COR,TCR/pMHC,TCR/pMHC,1JCK_A_B,4.34E-05,4.34E-05,5.5E-06,...,,,,,,,,,,


In [1]:

# Per-residue descriptors keyed by one-letter amino-acid code.
# Each row below is (code, hydrophobic, size, charge, flexibility) and is expanded
# into {'hydrophobic': ..., 'size': ..., 'charge': ..., 'flexibility': ...}.
# NOTE(review): the 'hydrophobic' values appear to follow the Kyte-Doolittle
# hydropathy index; the source of the 'size' and 'flexibility' scales is not
# stated in this notebook -- TODO confirm.
AA_PROPERTIES = {
    code: {
        'hydrophobic': hydropathy,
        'size': bulk,
        'charge': net_charge,
        'flexibility': mobility,
    }
    for code, hydropathy, bulk, net_charge, mobility in (
        ('A', 1.80, 0.3, 0, 0.2),
        ('C', 2.50, 0.4, 0, 0.3),
        ('D', -3.50, 0.5, -1, 0.5),
        ('E', -3.50, 0.6, -1, 0.6),
        ('F', 2.80, 0.7, 0, 0.4),
        ('G', -0.40, 0.1, 0, 0.9),
        ('H', -3.20, 0.6, 0.5, 0.4),
        ('I', 4.50, 0.6, 0, 0.3),
        ('K', -3.90, 0.6, 1, 0.7),
        ('L', 3.80, 0.6, 0, 0.4),
        ('M', 1.90, 0.6, 0, 0.6),
        ('N', -3.50, 0.5, 0, 0.5),
        ('P', -1.60, 0.4, 0, 0.1),
        ('Q', -3.50, 0.6, 0, 0.6),
        ('R', -4.50, 0.7, 1, 0.8),
        ('S', -0.80, 0.4, 0, 0.5),
        ('T', -0.70, 0.5, 0, 0.4),
        ('V', 4.20, 0.5, 0, 0.2),
        ('W', -0.90, 0.8, 0, 0.3),
        ('Y', -1.30, 0.7, 0, 0.4),
    )
}

# Gas constant, in kcal/(mol*K)
R = 1.987e-3

In [86]:
# Location labels accepted by clean_dataframe. A row whose
# 'iMutation_Location(s)' cell is not exactly one of these labels is dropped,
# which is how multi-mutation entries are filtered out.
location_list = ['COR', 'INT', 'SUP', 'RIM', 'SUR']


def _comma_to_dot(value):
    """Replace decimal commas with dots in string cells; return other values unchanged."""
    return value.replace(',', '.') if isinstance(value, str) else value


def clean_dataframe(df):
    """Return a cleaned, independent copy of the raw table.

    Steps, in order:
      1. Drop columns that are entirely NaN and keep only the first 29 of
         the remaining columns.
      2. Keep rows whose 'iMutation_Location(s)' is one of ``location_list``.
      3. Keep rows where the reported affinity equals its parsed counterpart
         for both mutant and wild type (e.g. '>1.1E-03' vs '1.1E-03' is dropped).
      4. Normalise 'Temperature' ('298(assumed)' -> '298') and decimal commas
         in the two affinity columns.
      5. Cast 'Temperature' to int and the affinity columns to float.
      6. Drop rows with Temperature >= 400.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame that keeps the original index labels of the surviving rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError / TypeError
        Propagated from ``astype`` when a remaining cell cannot be converted.
    """
    df = df.dropna(axis=1, how='all')
    # NOTE(review): 29 presumably cuts off trailing junk columns that are not
    # entirely empty -- confirm against the raw file layout.
    df = df[df.columns[:29]]

    # Exact label match only, so multi-mutation rows are removed here.
    df = df[df['iMutation_Location(s)'].isin(location_list)]

    affinities_agree = (
        (df['Affinity_mut (M)'] == df['Affinity_mut_parsed'])
        & (df['Affinity_wt (M)'] == df['Affinity_wt_parsed'])
    )
    # Copy so the column writes below act on an independent frame instead of a
    # slice of the caller's data.
    df = df[affinities_agree].copy()

    # Column-wise normalisation. The previous row loop wrote through the
    # label-based ``df.at``, which overwrites every row sharing a label when the
    # index is not unique.
    df['Temperature'] = df['Temperature'].replace('298(assumed)', '298')
    for column in ('Affinity_wt (M)', 'Affinity_mut (M)'):
        df[column] = df[column].map(_comma_to_dot)

    df = df.astype({'Temperature': int, 'Affinity_wt (M)': float, 'Affinity_mut (M)': float})

    # The raw table contains unrealistic temperatures; keep values below 400.
    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return df[df['Temperature'] < 400].copy()

def calculate_ddG(df):
    """Return a copy of ``df`` with an added (or replaced) 'ddG' column.

    The column is computed as::

        ddG = -R * T * ln(K_mut / K_wt)

    using the module-level gas constant ``R`` (kcal/(mol*K)), T from the
    'Temperature' column, and K from 'Affinity_mut (M)' / 'Affinity_wt (M)'.

    The input frame is left untouched; previously the column was written
    into the caller's frame, which also raised SettingWithCopyWarning when
    that frame was a filtered slice.

    NOTE(review): the affinity columns are labelled in M, which suggests
    dissociation constants; under that reading this expression equals
    dG_wt - dG_mut (negative for mutations that weaken binding). The sign is
    kept as found -- confirm it is the intended convention.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'Temperature' (assumed to be in kelvin),
        'Affinity_mut (M)' and 'Affinity_wt (M)' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index plus the 'ddG' column.
    """
    affinity_ratio = df['Affinity_mut (M)'] / df['Affinity_wt (M)']
    return df.assign(ddG=-1 * R * df['Temperature'] * np.log(affinity_ratio))

In [87]:
# Run the cleaning step on the raw table.
df_cleaned = clean_dataframe(raw_df)
# Persist the cleaned table. to_csv is called with its defaults, so the
# DataFrame index is written as the first (unnamed) column of the file.
df_cleaned.to_csv('cleaned_skempi.csv')
# Bare last expression: render the cleaned frame as this cell's output.
df_cleaned

Unnamed: 0,#Pdb,Mutation(s)_PDB,Mutation(s)_cleaned,iMutation_Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_mut_parsed,Affinity_wt (M),Affinity_wt_parsed,...,koff_mut_parsed,koff_wt (s^(-1)),koff_wt_parsed,dH_mut (kcal mol^(-1)),dH_wt (kcal mol^(-1)),dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes,Method,SKEMPI version
0,1CSE_E_I,LI45G,LI38G,COR,Pr/PI,Pr/PI,5.260000e-11,5.26E-11,1.120000e-12,1.12E-12,...,,,,,,,,,IASP,1
1,1CSE_E_I,LI45S,LI38S,COR,Pr/PI,Pr/PI,8.330000e-12,8.33E-12,1.120000e-12,1.12E-12,...,,,,,,,,,IASP,1
2,1CSE_E_I,LI45P,LI38P,COR,Pr/PI,Pr/PI,1.020000e-07,1.02E-07,1.120000e-12,1.12E-12,...,,,,,,,,,IASP,1
3,1CSE_E_I,LI45I,LI38I,COR,Pr/PI,Pr/PI,1.720000e-10,1.72E-10,1.120000e-12,1.12E-12,...,,,,,,,,,IASP,1
4,1CSE_E_I,LI45D,LI38D,COR,Pr/PI,Pr/PI,1.920000e-09,1.92E-09,1.120000e-12,1.12E-12,...,,,,,,,,,IASP,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7026,1KBH_A_B,YB98W,YB1172W,SUP,,1KBH_A_B,1.240000e-08,1.24E-08,3.400000e-08,3.4E-08,...,,,,,,,,The paper is for human NCBD,but the crystal is murine. However,in this region the protein only differs in on...
7027,1KBH_A_B,YB98W,YB1172W,SUP,,1KBH_A_B,8.750000e-08,8.75E-08,3.400000e-08,3.4E-08,...,,,,,,,,The paper is for human NCBD,but the crystal is murine. However,in this region the protein only differs in on...
7028,2KSO_A_B,KA23D,KA10D,COR,,2KSO_A_B,9.100000e-06,9.1E-06,5.200000e-06,5.2E-06,...,,,,-0.85,-2.5,19.7986577181,15.7718120805,,ITC,2
7039,2KSO_A_B,RA56E,RA43E,RIM,,2KSO_A_B,1.700000e-06,1.7E-06,5.200000e-06,5.2E-06,...,,,,-2.4,-2.5,18.4563758389,15.7718120805,,ITC,2


In [89]:
# Compute the ddG column for the cleaned table.
df_energy = calculate_ddG(df_cleaned)
# Persist the result. to_csv is called with its defaults, so the DataFrame
# index is written as the first (unnamed) column of the file.
df_energy.to_csv('cleaned_with_ddG.csv')