# Resistance plant gene prediction - EDA
---

## Libraries

In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data preparation
from sklearn.model_selection import train_test_split

# Modeling library
import tensorflow as tf
from tensorflow.keras.models import Model

# Modeling layers
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer

# Classification metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Speed up the model's training
from tensorflow.keras import mixed_precision

2024-04-26 21:02:39.330565: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-26 21:02:39.349413: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 21:02:39.349423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 21:02:39.349922: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 21:02:39.353328: I tensorflow/core/platform/cpu_feature_guar

## Data loading

In [2]:
proteins = pd.read_csv('Data/train.csv')

In [3]:
proteins.head()

Unnamed: 0,ID,Sequence,Label
0,0,MDSRMDQYEVMEQIGRGAFGAAILVNHKTEKKKYVLKKIRLARQTE...,0
1,4,MAPKAEKKPAEKKPAEEKAGEKAPAAGKKPKAEKRLPASKGEKGGE...,0
2,5,MLELHFEFIDLNQPKMYKFVVCLLTLSFLLLSGLSNTALARVHHES...,0
3,6,MAEPELVNGGVKENKLWKGVFAVSGIMSTLVIYGVLQEKIMRVPYG...,0
4,7,MTSCGCLVLEKVEDHGGEAAGRGRGRLAQGGGGGGGGCGSCAGEWR...,0


In [4]:
proteins.describe()

Unnamed: 0,ID,Label
count,14161.0,14161.0
mean,9469.114116,0.089965
std,5461.636271,0.286142
min,0.0,0.0
25%,4732.0,0.0
50%,9471.0,0.0
75%,14222.0,0.0
max,18881.0,1.0


In [5]:
proteins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14161 entries, 0 to 14160
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        14161 non-null  int64 
 1   Sequence  14161 non-null  object
 2   Label     14161 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 332.0+ KB


In [6]:
proteins[proteins.duplicated()].shape

(0, 3)

##  Preparation

### Tokenization

In [7]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(proteins['Sequence'])
sequences = tokenizer.texts_to_sequences(proteins['Sequence'])

In [8]:
sequences = pd.DataFrame(sequences).fillna(0)
sequences

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027
0,17,9,2,8,17,9,15,16,6,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17,3,12,7,3,6,7,7,12,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,17,1,6,1,18,14,6,14,10,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17,3,6,12,6,1,5,13,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,11,2,19,4,19,1,5,1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14156,17,4,1,7,6,6,14,6,6,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14157,17,17,17,17,4,6,4,5,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14158,17,16,13,13,13,15,16,2,14,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14159,17,3,3,8,19,2,10,3,14,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
labels = pd.DataFrame(proteins['Label'])
labels.head()

Unnamed: 0,Label
0,0
1,0
2,0
3,0
4,0


### Split in train and test

In [10]:
sequences_train, sequences_test, labels_train, labels_test = train_test_split(sequences, labels, test_size=0.3, random_state=33)

In [11]:
print('Training shapes: ', sequences_train.shape, labels_train.shape)
print('Test shapes: ', sequences_test.shape, labels_test.shape)

Training shapes:  (9912, 2028) (9912, 1)
Test shapes:  (4249, 2028) (4249, 1)


### Mixed precision

* Set up mix precision before modeling to speed up the training

In [12]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3060 (UUID: GPU-ab9b9fbd-4e42-e44a-c243-d1a9ea6686fe)


In [13]:
tf.config.list_physical_devices()

2024-04-26 21:02:42.183369: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.195627: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.195664: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [14]:
mixed_precision.set_global_policy(policy='mixed_float16')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3060, compute capability 8.6


2024-04-26 21:02:42.201937: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [15]:
mixed_precision.global_policy()

<Policy "mixed_float16">

## Modeling

In [16]:
max_len = sequences_train.shape[1]

In [17]:
def class_report(model, predictions):
    print(classification_report(y_true=labels_test, y_pred=predictions, target_names=['Negative', 'Positive'], zero_division=0))

def conf_matrix(model, predictions):
    cm = confusion_matrix(labels_test, predictions)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
    
    # Plot the confusion matrix
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

### LSTM (Baseline)

In [19]:
inputs = Input(shape=(max_len,), name='Input')
embeddings = Embedding(input_dim=len(tokenizer.word_index), output_dim=700, input_length=max_len, name='Embeddings')(inputs)

normalization = LayerNormalization()(embeddings)

lstms = LSTM(5, name='LSTM_layer_1')(normalization)

dense = Dense(3, activation='relu', name='Dense_1')(lstms)

outputs = Dense(1, activation='sigmoid', name='Output')(dense)

2024-04-26 21:02:42.239840: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.239904: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.239935: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.355793: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-26 21:02:42.355852: I external/local_xla/xla/stream_executor

In [20]:
model = Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 2028)]            0         
                                                                 
 Embeddings (Embedding)      (None, 2028, 700)         16100     
                                                                 
 layer_normalization (Layer  (None, 2028, 700)         1400      
 Normalization)                                                  
                                                                 
 LSTM_layer_1 (LSTM)         (None, 5)                 14120     
                                                                 
 Dense_1 (Dense)             (None, 3)                 18        
                                                                 
 Output (Dense)              (None, 1)                 4         
                                                             

In [26]:
model.compile(optimizer=tf.keras.optimizers.Adamax(), loss=tf.keras.losses.binary_crossentropy, metrics=['accuracy'])

In [22]:
history = model.fit(sequences_train, labels_train, epochs=1)

2024-04-26 21:02:44.059184: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-04-26 21:02:44.529577: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f7bc0b77c50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-04-26 21:02:44.529605: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-04-26 21:02:44.532364: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1714154564.581748   16502 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [24]:
# predictions = (model.predict(sequences_test) > 0.5).astype("int32")
model.evaluate(sequences_test, labels_test)

2024-04-26 21:03:28.062006: W tensorflow/core/grappler/utils/graph_view.cc:849] No registered '_MklLayerNorm' OpKernel for GPU devices compatible with node {{node model/layer_normalization/add}}
	.  Registered:  device='CPU'; T in [DT_BFLOAT16]
  device='CPU'; T in [DT_FLOAT]



InvalidArgumentError: Graph execution error:

Detected at node model/layer_normalization/add defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_16391/2824582786.py", line 2, in <module>

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 2296, in evaluate

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 4108, in run_step

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 2066, in test_function

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 2049, in step_function

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 2037, in run_step

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1917, in test_step

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/home/kostasmpouros/.local/lib/python3.10/site-packages/keras/src/layers/normalization/layer_normalization.py", line 343, in call

Value for attr 'T' of half is not in the list of allowed values: float, bfloat16
	; NodeDef: {{node model/layer_normalization/add}}; Op<name=_MklLayerNorm; signature=x:T, scale:T, offset:T -> y:T; attr=T:type,allowed=[DT_FLOAT, DT_BFLOAT16]; attr=epsilon:float,default=0.001>
	 [[model/layer_normalization/add]] [Op:__inference_test_function_6770]

In [None]:
class_report(model, predictions)

In [None]:
conf_matrix(model, predictions)