<a href="https://colab.research.google.com/github/khalidelmoutaouakil/colabtools/blob/master/Cancer_Classification_CNN_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from __future__ import division, print_function, absolute_import
''' import libraries to disable warnings'''
import warnings
import logging, os

warnings.filterwarnings('ignore')
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

''' import all required libraries for data cleansing, preprocessing, training and testing'''
from openpyxl import load_workbook
import numpy as np
import keras
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.layers import *
from keras.models import Model
import keras.backend as K

In [0]:
''' Function to get pass the excel sheet location, read every attribute/column,'''
def fetch_info(dataset_loc):
    def if_int(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def ref_change(Ref):
        temp = ''
        bag_ref = {'A': '1', 'C': '2', 'G': '3', 'T': '4'}
        if Ref in bag_ref:
            return bag_ref[Ref]
        elif Ref == "-":
            return '0'
        else:
            for letter in Ref:
                if temp == '':
                    temp = temp + str(bag_ref[letter])
                else:
                    temp = temp + ',' + str(bag_ref[letter])
            return temp


    workbook = load_workbook(dataset_loc,read_only=True)
    sheet_1 = workbook.get_sheet_names()[0]
    worksheet = workbook.get_sheet_by_name(sheet_1)
    FirstRow = True
    CancerType_dic = {}
    TumorID_dic = {}
    GeneName_dic = {}
    Chromosome_dic = {}
    VariantType_dic = {}
    data = []

    for row in worksheet.iter_rows():
        Row = []
        if FirstRow:
            FirstRow = False
            continue

        # CANCER_TYPE
        cancerType = str(row[0].value)
        if cancerType in CancerType_dic:
            Row.append(CancerType_dic[cancerType])
        else:
            nCancer = len(CancerType_dic)
            CancerType_dic[cancerType] = nCancer
            Row.append(CancerType_dic[cancerType])

        # TUMOR_SAMPLE_ID
        tumorID = str(row[1].value)
        if tumorID in TumorID_dic:
            Row.append(TumorID_dic[tumorID])
        else:
            nTumorId = len(TumorID_dic)
            TumorID_dic[tumorID] = nTumorId + 1
            Row.append(TumorID_dic[tumorID])


        # GENE_NAME
        geneName = str(row[2].value)
        if geneName in GeneName_dic:
            Row.append(GeneName_dic[geneName])
        else:
            nGene = len(GeneName_dic)
            GeneName_dic[geneName] = nGene + 1
            Row.append(GeneName_dic[geneName])


        #CHROMOSOME

        Chromosome = str(row[3].value)
        if if_int(Chromosome):
            Row.append(float(Chromosome))
        else:
            if Chromosome in Chromosome_dic:
                Row.append(Chromosome_dic[Chromosome])
            else:
                nChromo = len(Chromosome_dic)
                Chromosome_dic[Chromosome] = (nChromo + 1)*(-1)
                Row.append(Chromosome_dic[Chromosome])

        #START POSITION

        startPos = float(row[4].value)
        Row.append(startPos)

        #END_POSITION
        endPos = float(row[5].value)
        Row.append(endPos)

        #VARIANT TYPE
        variantType = str(row[6].value)
        if variantType in VariantType_dic:    #TUMOR_ALLELE

            Row.append(VariantType_dic[variantType])
        else:
            nVariant = len(VariantType_dic)
            VariantType_dic[variantType] = nVariant + 1
            Row.append(VariantType_dic[variantType])

        #REFERENCE_ALLELE
        referAllele = row[7].value
        Row.append(ref_change(referAllele))

        #TUMOR_ALLELE
        tumorAlle = row[8].value
        Row.append(ref_change(tumorAlle))

        data.append(Row)

    return np.array(data)

In [0]:

def dataset_process(data_loc):
    def data_scale(data):
        min_point = np.min(data)
        max_point = np.max(data)

        data_norm = (data-min_point)/(max_point - min_point)

        return data_norm

    def data_seq_padd(feature, maxlen):
        features_str = []
        n = feature.shape[0]

        for i in range(0, n):
            x = feature[i]
            temp = x.split(",")
            features_str.append(np.array(temp).astype(int))

        int_features = np.array(features_str)
        return pad_sequences(int_features, padding='post', maxlen=maxlen)

    def featureGen(data):
        # generate features

        Data = data[:, 0:data.shape[1]-2]
        Data = np.array(Data).astype(float)
        for i in range(1, data.shape[1] - 2):
            Data[:, i] = data_scale(Data[:, i])

        # print Data
        # features_dict = data[:, 1:6]
        Reference_Allele = data[:, 7]
        Reference_Allele = data_seq_padd(Reference_Allele, maxlen=96)

        Tumor_Allele = data[:, 8]
        Tumor_Allele = data_seq_padd(Tumor_Allele, maxlen=82)

        y = keras.utils.to_categorical(np.array(Data[:, 0]).astype(int))
        x = np.reshape(Data[:, 1:data.shape[1]-2], (-1, 6, 1))
        return y, x, Reference_Allele, Tumor_Allele

    data = np.load(data_loc)

    #SHUFFLE DATA
    data = np.take(data, np.random.permutation(data.shape[0]), axis=0, out=data)

    # print data

    test_values = data.shape[0] - train_values - val_values

    train_data = data[0:train_values, :]
    valid_data = data[train_values:train_values+val_values, :]
    test_data = data[train_values+val_values:, :]

    #
    [y_train, x_train_feature, x_train_ref, x_train_tumor] = featureGen(train_data)
    [y_valid, x_valid_feature, x_valid_ref, x_valid_tumor] = featureGen(valid_data)
    [y_test, x_test_feature, x_test_ref, x_test_tumor] = featureGen(test_data)

    return (y_train, x_train_feature, x_train_ref, x_train_tumor,
            y_valid, x_valid_feature, x_valid_ref, x_valid_tumor,
            y_test, x_test_feature, x_test_ref, x_test_tumor)

In [0]:
def model_generate(shapeFeat, shapeRef, shapeTumor, num_class):
    # ref LSTM
    input_ref = Input(shapeRef, name='ref')

    with tf.name_scope('Embedding-Conv1D-LSTM-1'):
        ref = Embedding(5, 64, input_length=shapeRef[0])(input_ref)

        ref = Conv1D(128, 4, activation='relu')(ref)
        ref = MaxPool1D(2)(ref)

        ref = LSTM(100)(ref)
        ref = Dropout(0.35)(ref)

    # tumor LSTM
    input_tumor = Input(shapeTumor, name='tumor')

    with tf.name_scope('Embedding-Conv1D-LSTM-2'):
        tumor = Embedding(5, 64, input_length=shapeTumor[0])(input_tumor)

        tumor = Conv1D(128, 4, activation='relu')(tumor)
        tumor = MaxPool1D(2)(tumor)

        tumor = LSTM(100)(tumor)
        tumor = Dropout(0.35)(tumor)

    # feature CNN
    input_feature = Input(shapeFeat, name='feature')

    with tf.name_scope('Conv1D-Dense-3'):
        feature = Conv1D(128, 4, activation='relu')(input_feature)
        feature = MaxPool1D(2)(feature)

        feature = Flatten()(feature)

        feature = Dense(100, activation='relu')(feature)
        feature = Dropout(0.35)(feature)

    # concatenate
    cat = concatenate([feature, ref, tumor])

    with tf.name_scope('Dense'):
        cat = Dense(150, activation='relu')(cat)
        cat = Dropout(0.3)(cat)

        cat = Dense(50, activation='relu')(cat)
        cat = Dropout(0.2)(cat)

    outputs = Dense(num_class, activation='softmax', name='logits')(cat)

    model = Model(inputs=[input_feature, input_ref, input_tumor],
                  outputs=[outputs])

    return model

In [0]:
dataset_loc = '/content/drive/My Drive/Colab Notebooks/data/TCGA_Cancer_Dataset_6.xlsx'

# Hyper-parameters
learning_rate = 1e-4
batch_size = 1000
epochs = 15
train_values = 33000
val_values = 1000
shapeFeat = (6, 1)
shapeRef = (96,)
shapeTumor = (82,)
total_classes = 6

In [0]:
data = fetch_info(dataset_loc)
np.save('/content/drive/My Drive/Colab Notebooks/data.npy', data)



In [0]:

# Load data
file_path = '/content/drive/My Drive/Colab Notebooks/data.npy'
y_train, x_train_feature, x_train_ref, x_train_tumor,\
y_valid, x_valid_feature, x_valid_ref, x_valid_tumor,\
y_test, x_test_feature, x_test_ref, x_test_tumor = dataset_process(file_path)


In [0]:
# Create model
model = model_generate(shapeFeat, shapeRef, shapeTumor, total_classes)

model = model_generate(shapeFeat, shapeRef, shapeTumor, total_classes)

# Compile model with Adam optimizer
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(learning_rate),
              metrics=['accuracy'])

# call back function
cb_ckpt = keras.callbacks.ModelCheckpoint('/content/drive/My Drive/Colab Notebooks/checkpoint/weights.{epoch:05d}-{val_acc:.5f}.h5', monitor='val_acc', verbose=1,
                                          save_best_only=True, save_weights_only=False,
                                          mode='auto', period=1)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




In [0]:
# train model
model.fit({'feature': x_train_feature, 'ref': x_train_ref, 'tumor': x_train_tumor},
          {'logits': y_train},
          shuffle=True, epochs=epochs, batch_size=epochs, callbacks=[cb_ckpt],
          validation_data=({'feature': x_valid_feature, 'ref': x_valid_ref, 'tumor': x_valid_tumor},
                           {'logits': y_valid}))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 33000 samples, validate on 1000 samples
Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.82300, saving model to /content/drive/My Drive/Colab Notebooks/checkpoint/weights.00001-0.82300.h5
Epoch 2/15

Epoch 00002: val_acc improved from 0.82300 to 0.84600, saving model to /content/drive/My Drive/Colab Notebooks/checkpoint/weights.00002-0.84600.h5
Epoch 3/15

Epoch 00003: val_acc improved from 0.84600 to 0.86500, saving model to /content/drive/My Drive/Colab Notebooks/checkpoint/weights.00003-0.86500.h5
Epoch 4/15

Epoch 00004: val_acc improved from 0.86500 to 0.89300, saving model to /content/drive/My Drive/Colab Notebooks/checkpoint/weights.00004-0.89300.h5
Epoch 5/15

Epoch 00005: val_acc improved from 0.89300 to 0.92000, saving model to /content/drive/My Drive/Colab Notebooks/checkpoint/weights.00005-0.92000.h5
Epoch 6/15

Epoch 00006: val_acc improved from 0.92000 to 0.92300

<keras.callbacks.History at 0x7fd0879cecf8>

In [0]:
results = model.evaluate({'feature': x_train_feature, 'ref': x_train_ref, 'tumor': x_train_tumor},
               {'logits': y_train},
               verbose=1, batch_size=batch_size)

print('Loss: ' + str(results[0]))
print('Accuracy: ' + str(results[1]))

Loss: 0.11064044160373283
Accuracy: 0.9617272687680793
