In [1]:
import pandas as pd
import numpy as np
import multiprocessing
from random import shuffle


In [2]:
X_train_variant_data = pd.read_csv('training_variants')
X_train_variant_data.head()


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [3]:
print X_train_variant_data.shape[0]

3321


In [4]:
X_test_variant_data = pd.read_csv('test_variants')
X_test_variant_data.head()

Unnamed: 0,ID,Gene,Variation
0,0,ACSL4,R570S
1,1,NAGLU,P521L
2,2,PAH,L333F
3,3,ING1,A148D
4,4,TMEM216,G77A


In [5]:
X_train_text = pd.read_table('training_text', sep='\|\|', engine='python', names=['ID', 'Text'], skiprows=[0])
X_test_text = pd.read_table('test_text', sep='\|\|', engine='python', names=['ID', 'Text'], skiprows=[0])
X_test_text.head()

Unnamed: 0,ID,Text
0,0,2. This mutation resulted in a myeloproliferat...
1,1,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,Vascular endothelial growth factor receptor (V...
3,3,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,Abstract Retinoblastoma is a pediatric retina...


In [6]:
print len(np.unique(X_train_variant_data['Variation']))

2996


In [7]:
"""
    Remove the class target label from the training variant data set and add it to an independent target label array
"""
y_train = X_train_variant_data['Class']
X_train_variant_data.drop('Class', axis=1, inplace=True)
y_train.head()
#print y_train.shape

0    1
1    2
2    2
3    3
4    4
Name: Class, dtype: int64

In [8]:
"""
    One hot encode the target label array
"""

y_train = pd.get_dummies(y_train, prefix='class', prefix_sep='')
y_train.head()


Unnamed: 0,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0


In [9]:
"""
    Hash vectorize gene, variation columns
"""
from sklearn.feature_extraction.text import HashingVectorizer


X_train_gene = X_train_variant_data['Gene']
gene_hash_vectorizer = HashingVectorizer(n_features=500)
gene_vector = gene_hash_vectorizer.transform(X_train_gene)

X_train_variation = X_train_variant_data['Variation']
variation_hash_vectorizer = HashingVectorizer(n_features=5000)
variation_vector = variation_hash_vectorizer.transform(X_train_variation)

X_test_gene = X_test_variant_data['Gene']
gene_test_hash_vectorizer = HashingVectorizer(n_features=500)
gene_test_vector = gene_test_hash_vectorizer.transform(X_test_gene)

X_test_variation = X_test_variant_data['Variation']
variation_test_hash_vectorizer = HashingVectorizer(n_features=5000)
variation_test_vector = variation_test_hash_vectorizer.transform(X_test_variation)

print gene_vector.shape


(3321, 500)


In [10]:
"""
    Download nltk stopwords corpus
"""
import nltk
nltk.download('popular')

[nltk_data] Downloading collection u'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/paperspace/nltk_data...
[nltk_data]    |   Package movie

True

In [11]:
"""
    Process clinical text data by tokenizing words, removing stop words, stemming words etc.
"""

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from gensim import utils
def process_clinical_text(clinical_text):
    stop_words = set(stopwords.words('english'))
    tokenized_text = word_tokenize(utils.to_unicode(clinical_text))
    stemmer = SnowballStemmer('english')
    processed_text = []
    for word in tokenized_text:
        if word not in stop_words:
            processed_text.append(stemmer.stem(word))
    return processed_text

print process_clinical_text('I am of the opinion that there is no guarantee any where in life')    

[u'i', u'opinion', u'guarante', u'life']


In [12]:
"""
    Apply CountVectorizer model on the clinical text training and test data
"""
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()

processed_clinical_text = X_train_text['Text']
count_vector.fit(processed_clinical_text)
clinical_text_train = count_vector.transform(processed_clinical_text).toarray()
processed_clinical_test_text = X_test_text['Text']
clinical_text_test = count_vector.transform(processed_clinical_test_text).toarray()
print clinical_text_test[0]



[0 3 0 ..., 0 0 0]


In [13]:
print clinical_text_train.shape
print clinical_text_test.shape

(3321, 155732)
(5668, 155732)


In [14]:
"""
    Combine gene vector, variation vector and clinical text vector into a resultant training set
"""

X_train = np.hstack((gene_vector.toarray(), variation_vector.toarray(), clinical_text_train))
X_test = np.hstack((gene_test_vector.toarray(), variation_test_vector.toarray(), clinical_text_test))

print X_train.shape
print X_test.shape
print y_train.shape

(3321, 161232)
(5668, 161232)
(3321, 9)


In [15]:
"""
    Expand dimensions of data to fit into the CNN model
"""

X_train_dim_data = np.expand_dims(X_train, axis=2)
#print X_train_dim_data.shape
y_train_dim_data = np.array(y_train)
#print y_train_dim_data.shape

X_test_dim_data = np.expand_dims(X_test, axis=2)
print X_test_dim_data.shape

(5668, 161232, 1)


In [16]:
"""
    Define and compile the CNN model
"""

from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D
from keras.layers import Dense, Dropout, Flatten
from keras.models import Sequential

def create_cnn_model(optimizer='rmsprop'):
    num_classes = 9
    cnn_model = Sequential()
    cnn_model.add(Conv1D(filters=16, kernel_size=2, padding='same', activation='relu', input_shape=X_train_dim_data.shape[1:]))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Dropout(0.2))
    cnn_model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Dropout(0.2))
    cnn_model.add(Conv1D(filters=64, kernel_size=2, padding='same', activation='relu'))
    cnn_model.add(MaxPooling1D(pool_size=2))
    cnn_model.add(Dropout(0.2))
    cnn_model.add(GlobalAveragePooling1D())
    cnn_model.add(Dense(num_classes, activation='softmax'))

    cnn_model.summary()
    cnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return cnn_model


Using TensorFlow backend.


In [17]:
"""
    Reshape the training inputs for KFold cross validation inputs
"""
X_train_2d = X_train_dim_data.reshape(X_train_dim_data.shape[0], X_train_dim_data.shape[1])
y_train_2d = y_train_dim_data.reshape(y_train_dim_data.shape[0], y_train_dim_data.shape[1])
print y_train_2d.shape
print y_train_2d[0]

(3321, 9)
[1 0 0 0 0 0 0 0 0]


In [18]:
"""
    Train the CNN model
"""

from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import KFold

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.from_scratch.hdf5', 
                               verbose=1, save_best_only=True)
cv_fold = KFold(n_splits=10)
cnn_model = create_cnn_model()

for train_index, test_index in cv_fold.split(X_train_2d):
    X_fold_train, X_fold_test = X_train_2d[train_index], X_train_2d[test_index]
    y_fold_train, y_fold_test = y_train_2d[train_index], y_train_2d[test_index]
    X_fold_train = np.expand_dims(X_fold_train, axis=2)
    X_fold_test = np.expand_dims(X_fold_test, axis=2)    
    cnn_model.fit(X_fold_train, y_fold_train, validation_data=(X_fold_test, y_fold_test), epochs=3, batch_size=20, 
              callbacks=[checkpointer], verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 161232, 16)        48        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 80616, 16)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 80616, 16)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 80616, 32)         1056      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 40308, 32)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 40308, 32)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 40308, 64)         4160      
__________

Epoch 3/3
Train on 2989 samples, validate on 332 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 2989 samples, validate on 332 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
"""
    Load the best weights into the model
"""
cnn_model.load_weights('saved_models/weights.best.from_scratch.hdf5')

In [20]:
"""
    Make predictions from test data
"""

model_predictions = cnn_model.predict(X_test_dim_data)

print model_predictions[0]


[ 0.18649754  0.12723508  0.02971044  0.21590438  0.08643806  0.08711641
  0.24601415  0.00724449  0.01383942]


In [21]:
y_pred = []
for i in range(len(model_predictions)):
    max_value = model_predictions[i][np.argmax(model_predictions[i])]
    y_pred.append(model_predictions[i] // max_value)



In [22]:
print y_pred[2]

[ 0.  0.  0.  0.  0.  0.  1.  0.  0.]


In [27]:
"""
    Kaggle competition test data set for target class labels are incomplete and as such our test prediction 
    and true values need to be reconciled appropriately
"""
y_true_data = pd.read_csv('stage1_solution_filtered.csv')
#print y_true_data['ID']
y_final_pred = [y_pred[d] for d in y_true_data['ID']]
model_predictions_final = [model_predictions[d] for d in y_true_data['ID']]
print model_predictions_final[0]
y_final_pred_labels = [ np.argmax(pred) + 1 for pred in y_final_pred]
#print y_final_pred_labels[0]

y_true_data.drop('ID', axis=1, inplace=True)
print np.array(y_true_data)
y_true_data_labels = [np.argmax(data) + 1 for data in np.array(y_true_data)]
print y_true_data_labels

print len(y_final_pred_labels)
print len(y_true_data_labels)

[ 0.18592404  0.12709844  0.02955492  0.21704428  0.08652719  0.08736788
  0.24605358  0.00699793  0.01343174]
[[1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
[1, 2, 2, 4, 4, 4, 9, 7, 7, 7, 2, 1, 4, 1, 1, 1, 2, 3, 2, 7, 2, 1, 7, 7, 3, 7, 9, 7, 1, 4, 1, 6, 4, 9, 6, 4, 1, 5, 2, 1, 3, 2, 7, 7, 5, 7, 4, 6, 4, 1, 1, 4, 7, 1, 6, 7, 1, 1, 7, 2, 7, 2, 1, 7, 4, 9, 7, 4, 4, 1, 4, 5, 1, 1, 1, 4, 2, 2, 1, 7, 1, 5, 6, 7, 6, 6, 1, 7, 5, 2, 1, 4, 7, 7, 4, 1, 1, 4, 7, 1, 1, 2, 3, 7, 2, 2, 7, 4, 1, 4, 4, 7, 4, 7, 7, 7, 6, 3, 7, 7, 4, 4, 4, 2, 4, 1, 4, 1, 4, 1, 2, 4, 7, 7, 4, 7, 7, 7, 4, 1, 2, 7, 4, 1, 1, 7, 2, 7, 7, 7, 2, 2, 1, 4, 1, 1, 7, 7, 1, 5, 4, 4, 7, 2, 1, 2, 4, 6, 4, 7, 1, 1, 1, 1, 7, 7, 4, 5, 4, 1, 4, 2, 6, 7, 4, 5, 7, 3, 1, 7, 6, 2, 1, 8, 7, 7, 5, 7, 5, 5, 1, 1, 1, 7, 7, 2, 1, 1, 7, 1, 2, 1, 4, 9, 1, 2, 7, 7, 1, 2, 4, 1, 4, 7, 7, 1, 4, 8, 4, 5, 4, 4, 5, 1, 4, 1, 2, 7, 2, 4, 1, 6, 6, 5, 7, 7, 6, 2, 7, 9, 1, 1, 2, 6

In [32]:
from sklearn.metrics import log_loss
#print model_predictions_final

y_true_data_final = np.array(y_true_data)
print y_true_data_final.shape
print np.array(model_predictions_final).shape
#print log_loss(y_true_data_labels, model_predictions_final)
print log_loss(y_true_data_final, np.array(model_predictions_final))

(368, 9)
(368, 9)
1.78656935668
