In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file it contains.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/novozymes-enzyme-stability-prediction/sample_submission.csv
/kaggle/input/novozymes-enzyme-stability-prediction/wildtype_structure_prediction_af2.pdb
/kaggle/input/novozymes-enzyme-stability-prediction/train.csv
/kaggle/input/novozymes-enzyme-stability-prediction/test.csv
/kaggle/input/novozymes-enzyme-stability-prediction/train_updates_20220929.csv


The goal of this notebook is to use the 3D structure of the enzyme variants, create a voxel representation, and train a 3D CNN architecture. The entire process can be divided into three steps. 

# 1. Generate accurate 3D structures of mutant protein from the wildtype.

In [10]:
import math
import multiprocessing
import os
import sys

import Levenshtein
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import tensorflow as tf
from keras import layers, callbacks
from keras import models
from keras import optimizers
from biopandas.pdb import PandasPdb
import pandas_profiling
from keras.saving.save import load_model
from sklearn.model_selection import GroupKFold
from tqdm import tqdm

from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

import plotly.express as px

In [8]:
%%capture
# Install biopandas quietly (%%capture hides pip's output).
# NOTE(review): this cell is positioned below the import cell that already does
# `from biopandas.pdb import PandasPdb`; on a fresh kernel it has to run before
# that import, so a top-to-bottom "Run All" depends on biopandas being present already.
!pip install biopandas

In [4]:
# Read the three competition tables in one pass: training set, test set and
# the sample submission template.
df_train, df_test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/novozymes-enzyme-stability-prediction/train.csv',
        "../input/novozymes-enzyme-stability-prediction/test.csv",
        '../input/novozymes-enzyme-stability-prediction/sample_submission.csv',
    )
)

# Quick sanity check on the dimensions of each table.
print(f"train_shape:{df_train.shape},test_shape:{df_test.shape},Sample_shape:{sample.shape}")

train_shape:(31390, 5),test_shape:(2413, 4),Sample_shape:(2413, 2)


In [5]:
# Apply the published corrections to the training data:
#   - drop the rows whose features are all marked as NaN in the update file (2409 rows);
#   - fix the rows where pH and tm were transposed
#     (25 rows; the corrected values are provided in the update file).

import pandas as pd

# Load the corrections file keyed by seq_id so its rows can be matched to the training rows.
df_train_updates = pd.read_csv("../input/novozymes-enzyme-stability-prediction/train_updates_20220929.csv", index_col="seq_id")

# Update rows in which every feature is NaN identify training rows that must be removed.
all_features_nan = df_train_updates.isnull().all(axis="columns")

# Match on the seq_id *column* of df_train instead of its row index: df_train was
# read without index_col, so label-based drop/.loc only works while the default
# RangeIndex happens to equal seq_id.  Mask-based filtering is also safe to re-run
# (a second label-based drop would raise KeyError because the rows are already gone).
drop_indices = df_train_updates[all_features_nan].index
df_train = df_train[~df_train["seq_id"].isin(drop_indices)].copy()

# The remaining update rows carry the corrected pH / tm values.
swap_ph_tm_indices = df_train_updates[~all_features_nan].index
needs_swap = df_train["seq_id"].isin(swap_ph_tm_indices)
# Look the corrections up by each row's seq_id so they line up with the masked
# rows one-to-one, then assign positionally via to_numpy().
df_train.loc[needs_swap, ["pH", "tm"]] = df_train_updates.loc[
    df_train.loc[needs_swap, "seq_id"], ["pH", "tm"]
].to_numpy()

In [6]:
wild_type = 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'
def gen_mutations(name, df,wild = wild_type):
    """Annotate each sequence in ``df`` with its single edit relative to ``wild``.

    Parameters
    ----------
    name : value stored in the new ``name`` column of every row.
    df : DataFrame with a ``protein_sequence`` column.
    wild : reference sequence the rows are compared against
        (defaults to the module-level ``wild_type``).

    Returns
    -------
    A new DataFrame: the columns of ``df`` followed by
    ``op``     - 'replace', 'delete' or 'same',
    ``idx``    - 1-based position of the edit in ``wild`` (0 for 'same'),
    ``wild``   - residue of ``wild`` at that position ('' for 'same'),
    ``mutant`` - replacing residue, '-' for a deletion, '' for 'same',
    ``mut``    - ``wild`` + ``idx`` + ``mutant`` joined as text (e.g. 'L17E'),
    ``name``   - the ``name`` argument.

    Raises
    ------
    AssertionError
        If a sequence differs from ``wild`` by more than one edit operation,
        or by an operation other than a replacement or a deletion.
    """
    result = []
    for sequence in df['protein_sequence']:
        ops = Levenshtein.editops(wild, sequence)
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        if len(ops) > 1:
            raise AssertionError(f"expected at most one edit operation, got {len(ops)}")
        if not ops:
            result.append(['same', 0, '', ''])
            continue

        # First field is the operation name, second the 0-based position in `wild`.
        op, pos = ops[0][0], ops[0][1]
        if op == 'replace':
            result.append([op, pos + 1, wild[pos], sequence[pos]])
        elif op == 'delete':
            result.append(['delete', pos + 1, wild[pos], '-'])
        else:
            # Insertions (or anything unexpected) are not supported.
            raise AssertionError("Ups")

    # Build the annotation frame on df's own index: concat(axis=1) aligns on index
    # labels, so a fresh RangeIndex would misalign (and NaN-pad) any input whose
    # index is not exactly 0..n-1, e.g. a frame with rows dropped.
    mutations = pd.DataFrame(data=result, columns=['op', 'idx', 'wild', 'mutant'], index=df.index)
    df = pd.concat([df, mutations], axis=1)
    df['mut'] = df['wild'] + df['idx'].astype(str) + df['mutant']
    df['name'] = name
    return df


# Annotate every test sequence with its edit relative to the wild type.
# NOTE(review): this rebinds df_test to the augmented frame, so re-running the cell
# passes the already-augmented frame back into gen_mutations, which appends the
# op/idx/wild/mutant columns a second time.
df_test = gen_mutations('wildtypeA', df_test)
# Bare last expression so the notebook renders the resulting frame.
df_test

Unnamed: 0,seq_id,protein_sequence,pH,data_source,op,idx,wild,mutant,mut,name
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,17,L,E,L17E,wildtypeA
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,17,L,K,L17K,wildtypeA
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,delete,17,L,-,L17-,wildtypeA
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,18,K,C,K18C,wildtypeA
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,18,K,F,K18F,wildtypeA
...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,16,A,I,A16I,wildtypeA
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,16,A,L,A16L,wildtypeA
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,16,A,N,A16N,wildtypeA
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,replace,16,A,P,A16P,wildtypeA


In [11]:
# Parse the wild-type structure file, then list the record-type tables
# that the parser exposes.
pdb_reader = PandasPdb()
df_pdb = pdb_reader.read_pdb('/kaggle/input/novozymes-enzyme-stability-prediction/wildtype_structure_prediction_af2.pdb')
available_tables = df_pdb.df.keys()
print(available_tables)

dict_keys(['ATOM', 'HETATM', 'ANISOU', 'OTHERS'])


In [12]:
def sep():
    """Print a horizontal rule made of 50 dashes to separate blocks of output."""
    rule = "-" * 50
    print(rule)
# Split the parsed PDB into one table per record type.
df_atom, df_hetatm, df_anisou, df_others = (
    df_pdb.df[record_type] for record_type in ('ATOM', 'HETATM', 'ANISOU', 'OTHERS')
)

# Report the shape of each table, with a separator rule between consecutive reports.
shape_reports = (
    ("df_atom shape: ", df_atom),
    ("df_hetatm shape: ", df_hetatm),
    ("df_anisou shape: ", df_anisou),
    ("df_others shape: ", df_others),
)
for position, (label, table) in enumerate(shape_reports):
    if position > 0:
        sep()
    print(label, table.shape)

df_atom shape:  (3317, 21)
--------------------------------------------------
df_hetatm shape:  (0, 21)
--------------------------------------------------
df_anisou shape:  (0, 21)
--------------------------------------------------
df_others shape:  (2, 3)
