In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics.classification import accuracy_score, log_loss
#from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

In [2]:

data = pd.read_csv('training_variants')
data_text =pd.read_csv("training_text",sep="\|\|",engine="python",names=["ID","TEXT"],skiprows=1)

# loading stop words from nltk library
stop_words = set(stopwords.words('english'))



In [3]:

def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        # replace every special char with space
        total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text))
        # replace multiple spaces with single space
        total_text = re.sub('\s+',' ', total_text)
        # converting all the chars into lower-case.
        total_text = total_text.lower()
        
        for word in total_text.split():
        # if the word is a not a stop word then retain that word from the data
            if not word in stop_words:
                string += word + " "
        
        data_text[column][index] = string


In [4]:

#text processing stage.
start_time = time.clock()
for index, row in data_text.iterrows():#iterate over row in dataframe pandas
    nlp_preprocessing(row['TEXT'], index, 'TEXT')

#merging both gene_variations and text data based on ID
result = pd.merge(data, data_text,on='ID', how='left')

y_true = result['Class'].values
result.Gene      = result.Gene.str.replace('\s+', '_')
result.Variation = result.Variation.str.replace('\s+', '_')

# split the data into test and train by maintaining same distribution of output varaible 'y_true' [stratify=y_true]
X_train, test_df, y_train, y_test = train_test_split(result, y_true, stratify=y_true, test_size=0.2)
# split the train data into train and cross validation by maintaining same distribution of output varaible 'y_train' [stratify=y_train]
train_df, cv_df, y_train, y_cv = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2)



In [5]:

# one-hot encoding of Gene feature.
gene_vectorizer = CountVectorizer()
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])


# one-hot encoding of variation feature.
variation_vectorizer = CountVectorizer()
train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])


from collections import defaultdict
def extract_dictionary_paddle(cls_text):
    dictionary = defaultdict(int)
    for index, row in cls_text.iterrows():
        for word in row['TEXT'].split():
            dictionary[word] +=1
    return dictionary

In [6]:


# building a CountVectorizer with all the words that occured minimum 3 times in train data
text_vectorizer = CountVectorizer(min_df=3)
train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
# getting all the feature names (words)
train_text_features= text_vectorizer.get_feature_names()

# train_text_feature_onehotCoding.sum(axis=0).A1 will sum every row and returns (1*number of features) vector
train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1

# zip(list(text_features),text_fea_counts) will zip a word with its number of times it occured
text_fea_dict = dict(zip(list(train_text_features),train_text_fea_counts))



dict_list = []
# dict_list =[] contains 9 dictoinaries each corresponds to a class
for i in range(1,10):
    cls_text = train_df[train_df['Class']==i]
    # build a word dict based on the words in that class
    dict_list.append(extract_dictionary_paddle(cls_text))
    # append it to dict_list

In [7]:
# dict_list[i] is build on i'th  class text data
# total_dict is buid on whole training text data
total_dict = extract_dictionary_paddle(train_df)


confuse_array = []
for i in train_text_features:
    ratios = []
    max_val = -1
    for j in range(0,9):
        ratios.append((dict_list[j][i]+10 )/(total_dict[i]+90))
    confuse_array.append(ratios)
confuse_array = np.array(confuse_array)

# don't forget to normalize every feature
train_text_feature_onehotCoding = normalize(train_text_feature_onehotCoding, axis=0)

# we use the same vectorizer that was trained on train data
test_text_feature_onehotCoding = text_vectorizer.transform(test_df['TEXT'])
# don't forget to normalize every feature
test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)

# we use the same vectorizer that was trained on train data
cv_text_feature_onehotCoding = text_vectorizer.transform(cv_df['TEXT'])
# don't forget to normalize every feature
cv_text_feature_onehotCoding = normalize(cv_text_feature_onehotCoding, axis=0)

sorted_text_fea_dict = dict(sorted(text_fea_dict.items(), key=lambda x: x[1] , reverse=True))
sorted_text_occur = np.array(list(sorted_text_fea_dict.values()))


train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding,train_variation_feature_onehotCoding))
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding,cv_variation_feature_onehotCoding))

train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_onehotCoding)).tocsr()
train_y = np.array(list(train_df['Class']))

test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()
test_y = np.array(list(test_df['Class']))

cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_onehotCoding)).tocsr()
cv_y = np.array(list(cv_df['Class']))




In [8]:




from sklearn.calibration import CalibratedClassifierCV

alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
    print("Log Loss :",log_loss(cv_y, sig_clf_probs)) 

best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)

for alpha = 1e-05
Log Loss : 1.302486664191609
for alpha = 0.0001
Log Loss : 1.29769961445265
for alpha = 0.001
Log Loss : 1.2949829691796635
for alpha = 0.1
Log Loss : 1.269458471337792
for alpha = 1
Log Loss : 1.2970400277907674
for alpha = 10
Log Loss : 1.372703945845657
for alpha = 100
Log Loss : 1.3673831660690778
for alpha = 1000
Log Loss : 1.3069679345363816


CalibratedClassifierCV(base_estimator=MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True),
            cv=3, method='sigmoid')

In [16]:

sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
# to avoid rounding error while multiplying probabilites we use log-probability estimates
#print("Log Loss :",log_loss(cv_y, sig_clf_probs))
#print("Number of missclassified point :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding)- cv_y))/cv_y.shape[0])
#plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))

g=pd.Series('FAM58A')
v=pd.Series('Truncating Mutations')
st=pd.Series('hi')
'''
g=pd.Series('TERT')
v=pd.Series('C228T')
st=pd.Series(result.iloc[28][4])'''

test_gene_feature_onehotCoding = gene_vectorizer.transform(g)
test_variation_feature_onehotCoding = variation_vectorizer.transform(v)
test_text_feature_onehotCoding = text_vectorizer.transform(st)
test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()

no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[0])
if predicted_cls[0]==7:
    print(predicted_cls,'1')
else:
    print(0)
print("Predicted Class :", predicted_cls[0])
l=np.round(sig_clf.predict_proba(test_x_onehotCoding[0]),4)
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[0]),4))
print(l.max())

0
Predicted Class : 1
Predicted Class Probabilities: [[0.3725 0.1299 0.02   0.1764 0.056  0.054  0.1805 0.0061 0.0046]]
0.3725


In [11]:
import pickle
naive_pickle=open('naive_pickle_file.pkl','wb')
pickle.dump(clf,naive_pickle)
naive_pickle.close()
G_Vectorizer_pickle=open('gene_vec_pickle_file.pkl','wb')
pickle.dump(gene_vectorizer,G_Vectorizer_pickle)
G_Vectorizer_pickle.close()
V_Vectorizer_pickle=open('var_vec_pickle_file.pkl','wb')
pickle.dump(variation_vectorizer,V_Vectorizer_pickle)
V_Vectorizer_pickle.close()
T_Vectorizer_pickle=open('report_vec_pickle_file.pkl','wb')
pickle.dump(text_vectorizer,T_Vectorizer_pickle)
T_Vectorizer_pickle.close()


In [12]:
sig_clf_pickle=open('sig_clf_pickle_file.pkl','wb')
pickle.dump(sig_clf,sig_clf_pickle)
T_Vectorizer_pickle.close()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import hstack
import pickle

g=pd.Series('BRCA1')
v=pd.Series('p53')
st=pd.Series('hii')

test_gene_feature_onehotCoding = gene_vectorizer.transform(g)
test_variation_feature_onehotCoding = variation_vectorizer.transform(v)
test_text_feature_onehotCoding = text_vectorizer.transform(st)
test_text_feature_onehotCoding = normalize(test_text_feature_onehotCoding, axis=0)
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_onehotCoding)).tocsr()

no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[0])
print("Predicted Class :", predicted_cls[0])
l=np.round(sig_clf.predict_proba(test_x_onehotCoding[0]),4)
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[0]),4))
print(l.max())