### Adapted from: 🧬 14656 Unique Mutations+Voxel Features+PDBs 🧬

https://www.kaggle.com/code/vslaykovsky/14656-unique-mutations-voxel-features-pdbs?scriptVersionId=109568406

In [1]:
import pandas as pd

In [2]:
# Read NOVO_combined_dataset.csv and display it.
# NOTE(review): the path is absolute and machine-specific, so this cell fails
# anywhere that file system is not mounted. `df` is rebound by many later cells.
df = pd.read_csv("/projects/robustmicrob/jlaw/inputs/novozymes/NOVO_combined_dataset.csv")
df

Unnamed: 0,sequence,wildtype,pdb_position,seq_position,mutant,ddG,dT,wT,pH,source,PDB_chain
0,AAKDVKFGNDAGVKMLRGVNVLADAVKVTLGPKGRNVVLDKSFGAP...,D,473,471,C,0.000000,0.75,,,['ThermoMutDB'],1ss8A
1,AAKDVKFGNDAGVKMLRGVNVLADAVKVTLGPKGRNVVLDKSFGAP...,D,523,521,K,-1.100000,-3.75,,,['ThermoMutDB'],1ss8A
2,AAKDVKFGNDAGVKMLRGVNVLADAVKVTLGPKGRNVVLDKSFGAP...,K,4,2,E,-1.300000,-4.75,,,['ThermoMutDB'],1ss8A
3,AAQASVVANQLIPINTALTLVMMRSEVVTPVGIPAEDIPRLVSMQV...,D,36,36,A,0.705833,,,7.0,"['Q3421.txt', 'Q3214_direct.csv', 'Q1744_direc...",1msiA
4,AAQASVVANQLIPINTALTLVMMRSEVVTPVGIPAEDIPRLVSMQV...,D,58,58,N,-0.120000,,,7.0,"['Q3421.txt', 'Q3214_direct.csv', 'Q1744_direc...",1msiA
...,...,...,...,...,...,...,...,...,...,...,...
14651,YSIEADKKFKYSVKLSDYPTLQDAASAAVDGLLIDRDYNFYGGETV...,G,323,210,D,-0.130000,,,7.4,"['dataset_S2648.csv', 'iStable2.0 - s3568']",1tyvA
14652,YSIEADKKFKYSVKLSDYPTLQDAASAAVDGLLIDRDYNFYGGETV...,R,285,172,K,-3.430000,,,7.4,"['dataset_S2648.csv', 'iStable2.0 - s3568']",1tyvA
14653,YSIEADKKFKYSVKLSDYPTLQDAASAAVDGLLIDRDYNFYGGETV...,T,235,122,I,-0.530000,,,7.4,"['dataset_S2648.csv', 'iStable2.0 - s3568']",1tyvA
14654,YVDKGYEPSKKRDIIAVFRVTPAEGYTIEQAAGAVAAESSTGTWTT...,E,63,51,S,,-7.20,,,['ThermoMutDB'],1gehA


In [None]:
# Pipeline switches, all off by default.
# NOTE(review): none of them is read by the cells visible in this part of the
# notebook — presumably later stages consult them (TODO confirm).
DOCKER = False  # requires Docker
RUN_FAST_RELAX = False  # requires Rosetta
RUN_MUTANT_FAST_RELAX = False  # requires Rosetta
GEN_FEATURES = False  # requires HTMD

import glob
import os
import warnings
import numpy as np
import Bio
import pandas as pd
import requests
from Bio import SeqUtils
from Bio.PDB import PDBParser, PDBIO
from Bio.SeqUtils import seq1
from tqdm import tqdm
from tqdm.notebook import tqdm
import json
import re

# Show up to 1000 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 1000)

# NOTE(review): not referenced by any cell visible in this part of the notebook.
THERMONET_V1 = 'ThermoNetV1'

# Shared schema: every loader ends with `df.filter(COLUMNS)`, which keeps only
# the columns named here that the loaded frame actually has.
COLUMNS = ['sequence', 'PDB_chain', 'PDB', 'chain', 'wildtype', 'seq_position', 'pdb_position', 'mutant', 'ddG', 'dT', 'wT', 'pH', 'source']

# Directory holding the raw dataset files. Several loaders spell out
# 'data/datasets/...' literally instead of using this constant.
DATASETS_PATH='data/datasets'

In [None]:
import matplotlib.pyplot as plt


def plot_hist(df):
    """Draw ddG / dT histograms and a ddG-vs-dT scatter on one 1x3 figure.

    A panel is drawn only when the column(s) it needs exist in ``df``;
    otherwise its slot in the figure is left empty.
    """
    plt.figure(figsize=(20, 7))
    has_ddg = 'ddG' in df.columns
    has_dt = 'dT' in df.columns

    if has_ddg:
        ddg_ax = plt.subplot(1, 3, 1)
        df.ddG.plot.hist(bins=50, ax=ddg_ax, title='ddG distribution')
    if has_dt:
        dt_ax = plt.subplot(1, 3, 2)
        df.dT.plot.hist(bins=50, ax=dt_ax, title='dT distribution')
    if has_dt and has_ddg:
        scatter_ax = plt.subplot(1, 3, 3)
        df.plot.scatter(x='ddG', y='dT', ax=scatter_ax, title='ddG vs dT')
    
    
def gen_af2_bigquery(df):
    """Build the BigQuery SQL that looks up AlphaFold entry IDs by sequence checksum.

    The distinct non-null values of ``df.sequenceChecksum`` are single-quoted
    and comma-joined into the ``in (...)`` list. The query returns the minimum
    ``entryID`` (aliased ``af2id``) for each checksum.
    """
    unique_checksums = df.sequenceChecksum.dropna().unique()
    # Quote the whole joined list at once so an empty input still yields ''.
    checksums = "'{}'".format("','".join(unique_checksums))
    return f"""
    SELECT
      sequenceChecksum,
      min(entryID) af2id
    FROM bigquery-public-data.deepmind_alphafold.metadata
    WHERE sequenceChecksum in ({checksums})
    GROUP BY sequenceChecksum
    """


def download_cif(fname):
    af2id = pd.read_csv(fname).af2id
    files = af2id.apply(lambda id: f'gs://public-datasets-deepmind-alphafold/{id}-model_v3.cif' if not os.path.exists(f'data/cifs/{id}-model_v3.cif') else None).dropna()
    files = ' '.join(files)
    if len(files) > 0:
        !gsutil -m cp $files data/cifs

In [None]:
def load_fireprot():
    """Load the FireProtDB export and map it onto the shared ``COLUMNS`` schema.

    Exact duplicate rows and rows without a ``pdb_id`` are dropped, columns are
    renamed to the shared names, ``source`` is set to ``'FireProtDB'`` and the
    sign of ``ddG`` is flipped.
    """
    column_map = {
        'wild_type': 'wildtype',
        'position': 'pdb_position',
        'mutation': 'mutant',
        'dTm': 'dT',
    }
    fireprot = (
        pd.read_csv('data/datasets/fireprotdb_results.csv')
        .drop_duplicates()
        # Original note: "Invalid PDB structures, we use AF2 structures in FireProtDB".
        .dropna(subset=['pdb_id'])
        .reset_index(drop=True)
        .rename(columns=column_map)
    )
    fireprot['source'] = 'FireProtDB'
    # NOTE(review): presumably aligns the sign convention with the other datasets — confirm.
    fireprot['ddG'] = -fireprot.ddG
    return fireprot.filter(COLUMNS)


df = load_fireprot()
df

In [None]:
plot_hist(df)

In [None]:
def load_s630():
    """Load the tab-separated S630 set and map it onto the shared ``COLUMNS`` schema.

    Columns are renamed to the shared names, PDB ids are lower-cased and
    ``source`` is set to ``'iStable2.0 - s630'``. The file's own sequence
    position column is deliberately not mapped (see the note in the rename).
    """
    # read_csv already returns a DataFrame; the module-level COLUMNS is used so
    # this loader cannot drift from the schema shared by the other loaders.
    df = pd.read_csv(f'{DATASETS_PATH}/S630.txt', sep="\t")
    df = df.rename(columns={
        'WT': 'wildtype',
        'Position': 'pdb_position',
        # 'Seq_position': 'seq_position', # seq positions are invalid
        'Mutation': 'mutant',
        'actual_ddG': 'ddG',
    })
    df.PDB = df.PDB.str.lower()
    df['source'] = 'iStable2.0 - s630'
    return df.filter(COLUMNS)

df = load_s630()
df

In [None]:
plot_hist(df)

In [None]:
def load_S3568():
    """Load the tab-separated S3568 training set in the shared ``COLUMNS`` schema.

    Upper-case column names are mapped to the shared names, ``source`` is set
    to ``'iStable2.0 - s3568'`` and PDB ids are lower-cased.
    """
    column_map = {
        'CHAIN': 'chain',
        'PDBPOSITION': 'pdb_position',
        # 'SEQPOSITION': 'seq_position', # seq positions are invalid
        'MUTANT': 'mutant',
        'PH': 'pH',
        'DDG': 'ddG',
        'WDTYPE': 'wildtype',
    }
    records = (
        pd.read_csv(f'{DATASETS_PATH}/S3568_training.txt', sep="\t")
        .rename(columns=column_map)
        .assign(source='iStable2.0 - s3568')
    )
    records['PDB'] = records.PDB.str.lower()
    return records.filter(COLUMNS)

df = load_S3568()
df

In [None]:
def load_thermomutdb():
    """Load single-point mutations from the ThermoMutDB JSON dump.

    Only records whose ``mutation_type`` is ``"Single"`` are kept. The
    ``mutation_code`` (wildtype letter, position, mutant letter) is split into
    ``wildtype``, ``pdb_position`` and ``mutant``; records whose position part
    is not made of digits only are dropped.

    Returns the shared ``COLUMNS`` plus the original ``mutation_code``.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(f'{DATASETS_PATH}/thermomutdb.json', 'r') as fh:
        records = json.load(fh)
    df = pd.DataFrame(records).query('mutation_type == "Single"')
    df = df.rename(columns={
        'PDB_wild': 'PDB',
        'mutated_chain': 'chain',
        'ddg': 'ddG',
        'dtm': 'dT',
    })
    # Strip once and derive all three parts from the same text, so padding
    # around the code cannot end up as the wildtype or mutant letter.
    code = df.mutation_code.str.strip()
    df['wildtype'] = code.str[0]
    df['mutant'] = code.str[-1]
    df['pdb_position'] = code.str[1:-1]
    # eq(True) treats a missing code as "not numeric" rather than failing the
    # mask; copy() so the cast below does not write into a slice.
    df = df[df.pdb_position.str.isdigit().eq(True)].copy()
    df['pdb_position'] = df.pdb_position.astype(int)
    df['source'] = 'ThermoMutDB'
    df['PDB'] = df.PDB.str.lower()

    return df.filter(COLUMNS + ['mutation_code'])

df = load_thermomutdb()
df

In [None]:
def load_kaggle_mutations_dt():
    """Load the Kaggle dT mutation table in the shared ``COLUMNS`` schema.

    The file's own ``PDB`` and ``sequence`` columns are discarded and its
    ``CIF`` column becomes ``PDB``. Every row is assigned chain ``'A'``.
    """
    raw = pd.read_csv(f'{DATASETS_PATH}/dt-xgboost-5000-mutations-200-pdb-files-lb-0-40.csv')
    # Drop the original PDB column first so that CIF can take over its name.
    trimmed = raw.drop(columns=['PDB', 'sequence'])
    column_map = {
        'WT': 'wildtype',
        'MUT': 'mutant',
        'position': 'pdb_position',
        'dTm': 'dT',
        'CIF': 'PDB',
    }
    mutations = trimmed.rename(columns=column_map)
    mutations['chain'] = 'A'
    mutations['source'] = 'dt-xgboost-5000-mutations-200-pdb-files-lb-0-40.csv'
    return mutations.filter(COLUMNS)


df_kaggle = load_kaggle_mutations_dt()
df_kaggle

In [None]:
def load_kaggle_mutations_ddg():
    """Load the Kaggle ddG mutation table in the shared ``COLUMNS`` schema.

    The ``sequence`` and ``Unnamed: 0`` columns are discarded, every row is
    assigned chain ``'A'`` and PDB ids are lower-cased.
    """
    raw = pd.read_csv(f'{DATASETS_PATH}/ddg-xgboost-5000-mutations-200-pdb-files-lb-0-40.csv')
    column_map = {
        'mutation': 'mutant',
        'position': 'pdb_position',
    }
    # Neither dropped column is touched by the rename, so both go in one call.
    mutations = raw.drop(columns=['sequence', 'Unnamed: 0']).rename(columns=column_map)
    mutations['chain'] = 'A'
    mutations['source'] = 'ddg-xgboost-5000-mutations-200-pdb-files-lb-0-40.csv'
    mutations['PDB'] = mutations.PDB.str.lower()
    return mutations.filter(COLUMNS)


df_kaggle = load_kaggle_mutations_ddg()
df_kaggle

In [None]:
def load_pucci():
    """
    Load the Pucci et al. appendix table and map it onto the shared ``COLUMNS`` schema.

    https://aip.scitation.org/doi/suppl/10.1063/1.4947493

    Three-letter residue names are converted to one-letter codes, PDB ids are
    lower-cased, and ``pH`` / ``ddG`` are stripped of characters that cannot be
    part of a number before conversion. Values that still do not parse become
    ``None``.
    """
    # .loc[1:] skips the row labelled 0 — presumably a units/sub-header row (TODO confirm).
    df = pd.read_excel('data/datasets/pucci-proteins-appendixtable1.xlsx').drop(columns=['Unnamed: 0', 'N']).loc[1:]
    df = df.rename(
        columns={'PDBid': 'PDB', 'Chain': 'chain', 'RESN': 'pdb_position', 'RESwt': 'wildtype', 'RESmut': 'mutant',
                 'ΔTmexp': 'dT', 'Tmexp [wt]': 'wT', 'ΔΔGexp(T)': 'ddG'})
    # Residue names are normalised to the 'Xxx' capitalisation before the lookup.
    three_to_one = Bio.SeqUtils.IUPACData.protein_letters_3to1
    df.mutant = df.mutant.str.capitalize().str.strip().map(three_to_one)
    df.wildtype = df.wildtype.str.capitalize().str.strip().map(three_to_one)
    df.PDB = df.PDB.str.lower()
    df.pH = df.pH.str.replace('[^-0-9.]', '', regex=True)
    # The second alternative also blanks a cell that is exactly '-'.
    df.ddG = df.ddG.str.replace('[^-0-9.]|^-$', '', regex=True)

    def to_float(v):
        """Return ``float(v)``, or ``None`` when ``v`` is not a parseable number."""
        try:
            return float(v)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; a bare `except:`
            # would also hide KeyboardInterrupt and genuine programming errors.
            return None

    df.pH = df.pH.map(to_float)
    df.ddG = df.ddG.map(to_float)
    df.dT = df.dT.astype(float)
    df['source'] = 'pucci-proteins-appendixtable1.xlsx'
    return df.filter(COLUMNS)


df_pucci = load_pucci()
df_pucci

In [None]:
plot_hist(df_pucci)

In [None]:
def load_s140():
    """Load the S140 set and map it onto the shared ``COLUMNS`` schema.

    The ``Wildtype`` column holds the PDB id followed by a one-letter chain id;
    it is split into a lower-cased ``PDB`` and ``chain``. The mutation text is
    sliced by fixed offsets: character 1 is the wildtype residue, the last
    character is the mutant, and what lies between is the integer position.
    """
    column_map = {'Wildtype': 'PDB', ' ∆∆G (kcal/mol)': 'ddG', ' Mutation': 'mutation'}
    s140 = pd.read_csv('data/datasets/dataset_S140.csv').rename(columns=column_map)
    s140 = s140[['PDB', 'mutation', 'ddG']]

    mutation_text = s140['mutation']
    pdb_with_chain = s140.PDB  # captured before PDB is overwritten below

    s140['wildtype'] = mutation_text.str[1]
    s140['chain'] = pdb_with_chain.str[-1]
    s140['PDB'] = pdb_with_chain.str[:-1].str.lower()
    s140['mutant'] = mutation_text.str[-1]
    s140['pdb_position'] = mutation_text.str.slice(2, -1).astype(int)
    s140['source'] = 'dataset_S140.csv'
    return s140.filter(COLUMNS)


df = load_s140()
df

In [None]:
plot_hist(df)

In [None]:
def load_s350_s2648(fname):
    """Load an S350- or S2648-style CSV in the shared ``COLUMNS`` schema.

    ``PDB_CHAIN`` holds the PDB id followed by a one-letter chain id; it is
    split into a lower-cased ``PDB`` and ``chain``. ``source`` is set to the
    base name of ``fname``.
    """
    column_map = {
        'PH': 'pH',
        'PDB_CHAIN': 'PDB',
        'WILD_RES': 'wildtype',
        'MUTANT_RES': 'mutant',
        'EXP_DDG': 'ddG',
        'POSITION': 'pdb_position',
    }
    table = pd.read_csv(fname).rename(columns=column_map)
    pdb_with_chain = table.PDB  # captured before PDB is overwritten below
    table['chain'] = pdb_with_chain.str[-1]
    table['PDB'] = pdb_with_chain.str[:-1].str.lower()
    table['source'] = os.path.basename(fname)
    return table.filter(COLUMNS)


df = load_s350_s2648('data/datasets/dataset_S350.csv')
df

In [None]:
plot_hist(df)

In [None]:
df = load_s350_s2648('data/datasets/dataset_S2648.csv')
df

In [None]:
plot_hist(df)

In [None]:
def load_Q1744_Q3214(fname):
    """Load a Q1744- or Q3214-style CSV in the shared ``COLUMNS`` schema.

    ``pdb_id`` holds the PDB id followed by a one-letter chain id; it is split
    into a lower-cased ``PDB`` and ``chain``. ``source`` is set to the base
    name of ``fname`` and the sign of ``ddG`` is flipped.
    """
    column_map = {
        'pdb_id': 'PDB',
        'wild_type': 'wildtype',
        'ddg': 'ddG',
        'position': 'pdb_position',
    }
    table = pd.read_csv(fname).rename(columns=column_map)
    pdb_with_chain = table.PDB  # captured before PDB is overwritten below
    table['chain'] = pdb_with_chain.str[-1]
    table['PDB'] = pdb_with_chain.str[:-1].str.lower()
    table['source'] = os.path.basename(fname)
    # NOTE(review): presumably aligns the sign convention with the other datasets — confirm.
    table['ddG'] = -table.ddG
    return table.filter(COLUMNS)

df = load_Q1744_Q3214('data/datasets/Q1744_direct.csv')
df

In [None]:
plot_hist(df)

In [None]:
df = load_Q1744_Q3214('data/datasets/Q3214_direct.csv')

In [None]:
plot_hist(df)

In [None]:
def load_Q3421():
    """Load the whitespace-delimited Q3421 table in the shared ``COLUMNS`` schema.

    The first data row is skipped, columns are renamed to the shared names,
    PDB ids are lower-cased and ``ddG`` is cast to float.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions. The pattern is unchanged.
    # .iloc[1:] drops the first parsed row — presumably a separator/units line (TODO confirm).
    df = pd.read_csv('data/datasets/Q3421.txt',
                     delimiter=r"\s+").iloc[1:].reset_index(drop=True).rename(columns={
        'PDB_ID': 'PDB',
        'Chain': 'chain',
        'Pos(PDB)': 'pdb_position',
        'Wildtype': 'wildtype',
    })
    df['source'] = 'Q3421.txt'
    df.PDB = df.PDB.str.lower()
    df.ddG = df.ddG.astype(float)
    return df.filter(COLUMNS)


df = load_Q3421()
df

In [None]:
# Stack every source dataset into a single frame. Row order follows the order
# of this list, and the index is renumbered from 0.
df_combined = (
    pd.concat(
        [
            load_fireprot(),
            load_Q3421(),
            load_Q1744_Q3214('data/datasets/Q3214_direct.csv'),
            load_Q1744_Q3214('data/datasets/Q1744_direct.csv'),
            load_s350_s2648('data/datasets/dataset_S2648.csv'),
            load_s350_s2648('data/datasets/dataset_S350.csv'),
            load_s140(),
            load_pucci(),
            load_kaggle_mutations_dt(),
            load_kaggle_mutations_ddg(),  # a lot of inconsistencies between mutation positions and pdb files.
            load_thermomutdb(),
            load_S3568(),
            load_s630(),
        ],
        axis=0,
        ignore_index=True,
    )
    # Positions are kept as float here so that missing values stay representable.
    .astype({'pdb_position': float, 'wT': float, 'pH': float})
    .filter(COLUMNS)
)
df_combined

In [None]:
# Key of the form '<PDB id><chain id>'; the merge with the chain table uses it.
# NOTE(review): astype(str) turns a missing chain into the text 'nan', so such
# rows get a key ending in 'nan'; a missing PDB id leaves the key missing.
df_combined['PDB_chain'] = df_combined.PDB + df_combined.chain.astype(str)

## Rescale Pucci dataset

In [None]:
def pucci(df):
    """Compare mean ddG from the Pucci table with mean ddG from all other sources.

    Returns a Series with ``nonpucci`` and ``pucci`` means when ``df`` has at
    least one row of each kind, otherwise ``None``.
    """
    pucci_source = 'pucci-proteins-appendixtable1.xlsx'
    from_pucci = df.source == pucci_source
    pucci_rows = df[from_pucci]
    other_rows = df[~from_pucci]
    # Both sides are needed for a comparison.
    if len(pucci_rows) == 0 or len(other_rows) == 0:
        return None
    return pd.Series({'nonpucci': other_rows.ddG.mean(),
                      'pucci': pucci_rows.ddG.mean()})


# One row per (chain, position, mutant) that was measured both in the Pucci
# table and in at least one other source.
df = df_combined.groupby(['PDB_chain', 'pdb_position', 'mutant']).apply(pucci).dropna().reset_index(drop=True)

# Ratios with a zero non-Pucci mean are infinite. Replace them with NaN so
# dropna() removes them: passing None as the replacement is version-dependent
# and on older pandas forward-fills the previous ratio instead.
pucci_ratio = (df.pucci / df.nonpucci).replace([np.inf, -np.inf], np.nan).dropna()
# The histogram only shows ratios strictly between -10 and 10.
pucci_ratio = pucci_ratio[(pucci_ratio < 10) & (-10 < pucci_ratio)]
ax = pucci_ratio.plot.hist(bins=100, title='ddG-Pucci/ddG-non-pucci distribution')

In [None]:
# Median Pucci / non-Pucci ddG ratio, used as the rescaling factor for the Pucci rows.
# Infinite ratios become NaN, which median() skips. Passing None as the
# replacement is version-dependent and on older pandas forward-fills the
# previous ratio, which would bias the median.
PUCCI_K = (df.pucci / df.nonpucci).replace([np.inf, -np.inf], np.nan).median()
PUCCI_K

In [None]:
df.plot.scatter(x='pucci', y='nonpucci')

In [None]:
# Divide the ddG of the Pucci rows by PUCCI_K; rows from other sources are untouched.
# NOTE(review): this mutates df_combined in place, so re-running the cell
# divides the same rows again.
df_combined.loc[df_combined.source == 'pucci-proteins-appendixtable1.xlsx', 'ddG'] = df_combined.ddG / PUCCI_K

In [None]:
plot_hist(df_combined[df_combined.source == 'pucci-proteins-appendixtable1.xlsx'])

In [None]:
df_combined.groupby('source').apply(lambda df: (df.ddG / df.dT).median())

In [None]:
plot_hist(df_combined)

In [None]:
## Download AlphaFold2 structures
1. Produce the query to retrieve AlphaFold2 IDs by checksums of sequences
2. Run the resulting query in BigQuery

In [None]:
# Checksum every string sequence; rows whose sequence is not a str get None.
# gen_af2_bigquery places these values in the query's in (...) list.
# NOTE(review): `crc64iso` is not imported in any cell visible here, so this
# cell raises NameError on a fresh kernel unless it is imported elsewhere —
# presumably `from crc64iso import crc64iso`; confirm.
df_combined['sequenceChecksum'] = df_combined.sequence.apply(lambda s: crc64iso.crc64(s) if type(s) is str else None)

sql = gen_af2_bigquery(df_combined)
print(sql)

In [None]:
# download_cif reads the `af2id` column of this CSV.
AF2_FILE = 'tmp/af.csv'  # this is your BigQuery result

# When the file is absent the download is skipped without any message.
if os.path.exists(AF2_FILE):
    download_cif(AF2_FILE)

In [None]:
def download_pdbs(df, timeout=60):
    """Download every 4-character PDB id in ``df.PDB`` that is not yet in ``data/pdbs``.

    Args:
        df: frame with a ``PDB`` column; only values of length 4 are considered.
        timeout: seconds to wait for each HTTP request before giving up.

    A failed download (network error or non-200 status) is reported with
    ``print`` and skipped, so one bad id does not stop the rest.
    """
    os.makedirs('data/pdbs', exist_ok=True)
    existing_pdbs = {os.path.basename(f)[:4] for f in glob.glob('data/pdbs/*.pdb')}
    to_download = set(df[df.PDB.str.len() == 4].PDB.unique()) - existing_pdbs
    # Sorted only to make the download order reproducible.
    with tqdm(sorted(to_download)) as prog:
        for pdb in prog:
            prog.set_description(f'Downloading {pdb}')
            fname = f'data/pdbs/{pdb}.pdb'
            url = f'https://files.rcsb.org/download/{pdb}.pdb'
            try:
                # Without a timeout a stalled connection blocks the loop forever.
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print(url, 'request failed:', exc)
                continue
            if response.status_code != 200:
                print(url, 'status code', response.status_code)
                continue
            # Write the payload byte-for-byte; no decode/encode round trip.
            with open(fname, 'wb') as f:
                f.write(response.content)


download_pdbs(df_combined)

In [None]:
!ls data/pdbs | wc -l

In [None]:
def _chain_record(chain_name, chain):
    """Describe one chain: its name, one-letter sequence and residue numbers."""
    residues = list(chain)
    return {
        'PDB_chain': chain_name,
        'sequence': seq1(''.join(residue.resname for residue in residues)),
        'pdb_ids': tuple(residue.get_id()[1] for residue in residues),
    }


def _save_chain_if_missing(writer, chain, chain_name):
    """Write ``chain`` to data/chains/<chain_name>.pdb unless that file already exists."""
    target = f'data/chains/{chain_name}.pdb'
    if not os.path.exists(target):
        writer.set_structure(chain)
        writer.save(target)


def _model_sequence(model):
    """Return the one-letter sequence of all residues of ``model``."""
    return seq1(''.join(residue.resname for residue in model.get_residues()))


def pdbs_to_chains():
    """Split every downloaded structure into chains and tabulate them.

    Reads ``data/cifs/*.cif`` (first model only) and ``data/pdbs/*.pdb`` (all
    models), writes each chain to ``data/chains/<id><chain>.pdb`` if that file
    does not exist yet, and returns a de-duplicated DataFrame with columns
    ``PDB_chain``, ``sequence`` (one-letter) and ``pdb_ids`` (tuple of residue
    numbers).

    Raises:
        AssertionError: if two consecutive models of a PDB file differ in
            residue sequence.
    """
    # Imported here because the file-level imports only bring in PDBParser and PDBIO.
    from Bio.PDB import MMCIFParser

    os.makedirs('data/chains', exist_ok=True)
    chain_seq = []
    writer = PDBIO()

    # CIF
    cif_parser = MMCIFParser()
    for cif_path in tqdm(glob.glob('data/cifs/*.cif'), 'CIF file chains'):
        # File name minus its last '-'-separated part, e.g. 'X-Y-model_v3.cif' -> 'X-Y'.
        structure_id = '-'.join(os.path.basename(cif_path).split('-')[:-1])
        structure = cif_parser.get_structure(structure_id, cif_path)
        first_model = next(iter(structure.get_models()))
        for chain in first_model.get_chains():
            chain_name = f'{structure_id}{chain.get_id()}'
            chain_seq.append(_chain_record(chain_name, chain))
            _save_chain_if_missing(writer, chain, chain_name)

    # PDBs
    pdb_parser = PDBParser()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with tqdm(glob.glob('data/pdbs/*.pdb'), desc='PDBs') as prog:
            for pdb_path in prog:
                structure_id = os.path.basename(pdb_path)[:4]
                prog.set_description(structure_id)
                structure = pdb_parser.get_structure(structure_id, pdb_path)

                # validate: consecutive models must have the same residue sequence
                model_sequences = [_model_sequence(model) for model in structure.get_models()]
                for s1, s2 in zip(model_sequences, model_sequences[1:]):
                    assert s1 == s2, "Invalid models"

                # Chains of every model are visited; repeats across models are
                # removed by drop_duplicates() below.
                for chain in structure.get_chains():
                    chain_name = f'{structure.get_id()}{chain.get_id()}'
                    chain_seq.append(_chain_record(chain_name, chain))
                    _save_chain_if_missing(writer, chain, chain_name)

    return pd.DataFrame(chain_seq).drop_duplicates()


df_chains = pdbs_to_chains()

In [None]:
df_chains

In [None]:
# Attach chain information to every mutation whose PDB_chain has a structure file.
df_ds_pdb = pd.merge(df_combined, df_chains, on='PDB_chain', how='inner', suffixes=('', '_PDB'))

# Where both the dataset and the structure supply a sequence, they must agree.
assert not (
    df_ds_pdb.sequence.notna()
    & df_ds_pdb.sequence_PDB.notna()
    & (df_ds_pdb.sequence != df_ds_pdb.sequence_PDB)
).any(), 'Invalid pdb sequences'

# Prefer the dataset's sequence, fall back to the structure's, then drop the helper column.
df_ds_pdb = df_ds_pdb.assign(
    sequence=df_ds_pdb.sequence.combine_first(df_ds_pdb.sequence_PDB)
).drop(columns=['sequence_PDB'])
df_ds_pdb

In [None]:
# Second matching route: join mutations to chains by identical sequence.
# The dataset's own PDB_chain is dropped first, so the PDB_chain in the result
# is the one from df_chains.
df_ds_seq = pd.merge(df_combined.drop(columns=['PDB_chain']), df_chains, on='sequence', how='inner').reset_index(drop=True)
df_ds_seq

In [None]:
# Union of the PDB_chain-matched and sequence-matched rows.
# NOTE(review): a mutation matched by both routes appears twice here, and the
# original index labels are kept, so labels repeat.
df_ds = pd.concat([df_ds_pdb, df_ds_seq], axis=0)

In [None]:
def position_error_stats(df):
    """Summarise how many rows of ``df`` have a position consistent with the chain.

    Returns a Series with:
      * ``valid_pdb_position`` - fraction of rows accepted by ``valid_pdb``;
      * ``pdb_position_count`` - number of rows with a non-missing ``pdb_position``;
      * ``total_count`` - number of rows.
    """
    def valid_pdb(r):
        # A missing position cannot be checked and is counted as valid.
        if np.isnan(r.pdb_position):
            return True
        if r.pdb_position not in r.pdb_ids:
            return False
        idx = r.pdb_ids.index(r.pdb_position)
        return idx < len(r.sequence) and r.sequence[idx] == r.wildtype

    def valid_seq(r):
        # Only referenced by the commented-out statistics below.
        if np.isnan(r.seq_position):
            return True
        within_sequence = r.seq_position < len(r.sequence)
        return bool(within_sequence and r.sequence[int(r.seq_position)] == r.wildtype)

    stats = {
        'valid_pdb_position': df.apply(valid_pdb, axis=1).mean(),
        'pdb_position_count': df.pdb_position.notna().sum(),

        # 'valid_seq_position': df.apply(valid_seq, axis=1).mean(),
        # 'seq_position_count': (~df.seq_position.isna()).sum(),
        'total_count': len(df),
    }
    return pd.Series(stats)


df_ds.groupby('source').apply(position_error_stats)

In [None]:
def valid_pos(r):
    """Return True when ``r.pdb_position`` is a residue number of the chain and
    the residue at that place in ``r.sequence`` equals ``r.wildtype``."""
    if r.pdb_position not in r.pdb_ids:
        return False
    residue_index = r.pdb_ids.index(r.pdb_position)
    if r.sequence[residue_index] == r.wildtype:
        return True
    return False

print('Invalid positions:', (~df_ds.apply(valid_pos, axis=1)).sum())

In [None]:
# Keep only the rows accepted by valid_pos, then renumber the index from 0.
df_ds = df_ds[df_ds.apply(valid_pos, axis=1)].reset_index(drop=True)

In [None]:
# seq_position is the 0-based place of pdb_position in the chain's residue-number tuple.
# NOTE(review): the int cast below fails on missing values, so this cell relies
# on df_ds having already been filtered with valid_pos.
df_ds['seq_position'] = df_ds.apply(lambda r: r.pdb_ids.index(r.pdb_position) if r.pdb_position in r.pdb_ids else None, axis=1)
df_ds = df_ds.astype({'pdb_position': int, 'seq_position': int})

In [None]:
# Expected path of each mutant's relaxed structure, replaced by None when the file is absent.
# pdb_position is formatted into the file name as it is stored in the frame.
df_ds['mutant_PDB'] = df_ds.apply(lambda r: f'data/relaxed_chains/{r.PDB_chain}/{r.PDB_chain}_{r.wildtype}{r.pdb_position}{r.mutant}_relaxed.pdb', axis=1)
df_ds['mutant_PDB'] = df_ds.apply(lambda r: r.mutant_PDB if os.path.exists(r.mutant_PDB) else None, axis=1)
# Same two steps for the relaxed wildtype structure of the chain.
df_ds['wildtype_PDB'] = df_ds.apply(lambda r: f'data/relaxed_chains/{r.PDB_chain}/{r.PDB_chain}_relaxed.pdb', axis=1)
df_ds['wildtype_PDB'] = df_ds.apply(lambda r: r.wildtype_PDB if os.path.exists(r.wildtype_PDB) else None, axis=1)
df_ds

In [None]:
# Collapse repeated measurements of the same mutation into one row.
group_keys = ['sequence', 'wildtype', 'pdb_position', 'seq_position', 'mutant']

# numeric_only=True: df_ds also holds text and tuple columns, and recent pandas
# raises on them instead of silently leaving them out of the mean.
df_clean = df_ds.groupby(group_keys).mean(numeric_only=True)
# Every distinct source that reported this mutation.
df_clean['source'] = df_ds.groupby(group_keys).source.apply(lambda v: v.unique().tolist())
# Sorting by mutant_PDB puts rows that have a relaxed mutant file ahead of rows
# without one, so first() picks the chain of such a row when there is one.
df_clean['PDB_chain'] = df_ds.sort_values('mutant_PDB').groupby(group_keys, sort=False).PDB_chain.first()
# df_clean['mutant_PDB'] = df_ds.sort_values('mutant_PDB').groupby(['sequence', 'wildtype', 'pdb_position', 'seq_position', 'mutant'], sort=False).mutant_PDB.apply(lambda v: v.unique().tolist())
df_clean = df_clean.reset_index()
df_clean