In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score

from matplotlib import pyplot as plt

from data_handler import TrainDataHandler

TRAIN_VARIANTS_PATH = "../data/training_variants/training_variants"
TRAIN_TEXT_PATH = "../data/training_text/training_text"


# Reflect changes in the modules immediately.
%load_ext autoreload
%autoreload 2

In [2]:
train_data_handler = TrainDataHandler(TRAIN_VARIANTS_PATH, TRAIN_TEXT_PATH)

2022-12-17 09:20:20 |[36m DEBUG    [0m| data_handler         | Loading the train variants data..
2022-12-17 09:20:20 |[36m DEBUG    [0m| data_handler         | Loading the train text data..
2022-12-17 09:20:22 |[36m DEBUG    [0m| data_handler         | Preparing train data ..
2022-12-17 09:20:22 |[32m INFO     [0m| data_handler         | Train data is loaded. (Data size: (3316, 5))


In [3]:
train_data = train_data_handler.train_data.copy()
train_data.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
def my_precision_score(y_true, y_pred):
    return precision_score(y_true, y_pred, average="weighted")


def my_recall_score(y_true, y_pred):
    return recall_score(y_true, y_pred, average="weighted")

In [5]:
EVAL_METRIC_DICT = {
    "accuracy_score": accuracy_score,
    "balanced_accuracy_score": balanced_accuracy_score,
    "precision_score": my_precision_score,
    "recall_score": my_recall_score,
}

In [13]:
# use NLP to classify the text deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tf.keras.metrics.binary_accuracy

# ensure that code uses GPU
print(tf.test.gpu_device_name())

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(train_data['Text'], train_data['Class'], test_size=0.2,
                                                    random_state=0)

# vectorize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# train the model
embedding_dim = 50

model = keras.Sequential()
model.add(layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=maxlen)
)
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy', # 'sparse_categorical_crossentropy',
    metrics=['accuracy']
)
print("Model summary: ", model.summary())

history = model.fit(
    X_train, y_train,
    epochs=200,
    verbose=False,
    validation_data=(X_test, y_test)
)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f} | Loss: {:.4f}".format(accuracy, loss))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Test Accuracy: {:.4f} | Loss: {:.4f}".format(accuracy, loss))

# predict the test data
pred = model.predict(X_test)
pred = np.argmax(pred, axis=1)

print("Accuracy score for classifier: ", accuracy_score(y_test, pred))

/device:GPU:0
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 50)           8276050   
                                                                 
 global_max_pooling1d_7 (Glo  (None, 50)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_15 (Dense)            (None, 10)                510       
                                                                 
 dense_16 (Dense)            (None, 10)                110       
                                                                 
Total params: 8,276,670
Trainable params: 8,276,670
Non-trainable params: 0
_________________________________________________________________
Model summary:  None


ValueError: in user code:

    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\losses.py", line 1787, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\ibrah\anaconda3\envs\MyDeepLearning\lib\site-packages\keras\backend.py", line 5119, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 10) are incompatible


In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)

# plot confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [154]:
# deep learning NLP to predict the class

# import keras
from keras.preprocessing.text import Tokenizer



Unnamed: 0,ID,Gene,Variation,Class,Text
3297,3297,RUNX1,R177*,4,THE AML1 GENE IS KNOWN as the most frequent ta...
3298,3298,RUNX1,Y113*,4,Introduction Myelodysplastic syndromes (MDS) ...
3299,3299,RUNX1,R139G,4,The BCR-ABL fusion protein generated by t(9;22...
3300,3300,RUNX1,K83N,4,The most frequent mutations associated with le...
3301,3301,RUNX1,R177Q,4,The most frequent mutations associated with le...
3302,3302,RUNX1,R166Q,4,Familial platelet disorder with predisposition...
3303,3303,RUNX1,P173S,4,Introduction Myelodysplastic syndromes (MDS) ...
3304,3304,RUNX1,R201Q,4,Familial platelet disorder with predisposition...
3305,3305,RUNX1,S70fsX93,4,Introduction Myelodysplastic syndromes (MDS) ...
3306,3306,RUNX1,W279*,1,Here we report two new RUNX1 mutations in one ...
