#Tempus Biomarkers Notebook
##Justin Collins

In [48]:
#import needed packages
import json
import pandas as pd
import numpy as np

In [49]:
#import data
biomarkers = pd.DataFrame(pd.read_csv("biomarkers.csv"))
targets = pd.DataFrame(pd.read_csv("targets.csv"))
patients = open("patient_profiles.json")

In [None]:
#check proper import
print(type(biomarkers))
print(type(targets))
print(type(patients))

In [None]:
#fix input problem of json importing as a string
patients_str = patients.read()
patients_json = json.loads(patients_str)[0]
type(patients_json)

In [52]:
#unfold nested dictionaries into dataframe
patients = patients_json
patients = pd.DataFrame(patients)
patients = pd.json_normalize(patients["patient_profiles"])

In [None]:
#create master dataframe
bioTarget = pd.merge(biomarkers, targets, on='biomarker_id')
fullFile = pd.merge(bioTarget, patients, on="patient_id")
fullFile.head()

In [None]:
#check for NaN
fullFile.isna().sum()

In [None]:
#most of cohort does not have comorbidty index
del fullFile["status.comorbidity_index"]
#fix NaN
fullFile['demographics.race'].fillna("unknown")

In [None]:
#one hot encoding
smoking_onehot = pd.get_dummies(fullFile["status.smoking_status"])
race_onehot = pd.get_dummies(fullFile['demographics.race'])
subtype_onehot = pd.get_dummies(fullFile["status.disease_sub_type"])
gender_onehot = pd.get_dummies(fullFile["demographics.gender"])
cohort_onehot = pd.get_dummies(fullFile["status.cohort_qualifier"])
patients_onehot = smoking_onehot.join(race_onehot)
patients_onehot1 = patients_onehot.join(subtype_onehot)
patients_onehot2 = patients_onehot1.join(gender_onehot)
patients_onehot3 = patients_onehot2.join(cohort_onehot)
patients_onehot3.head()

In [None]:
#merge one hot encoding to full dataframe
full_onehot = patients_onehot3.join(fullFile)
full_onehot.head()

In [58]:
#remove unneeded columns
del full_onehot["demographics.gender"]
del full_onehot["demographics.age"]
del full_onehot["status.cohort_qualifier"]
del full_onehot["demographics.race"]
del full_onehot["patient_id"]
del full_onehot["biomarker_id"]
del full_onehot["status.smoking_status"]
del full_onehot["status.disease_sub_type"]

In [None]:
#create a test vs train random split of data
X = full_onehot
Y = full_onehot["target_label"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)
data_test = X_test
data_test["target_label"] = y_test

In [60]:
#remove target from predictor data
del X_train["target_label"]
del X_test["target_label"]

In [None]:
#build classification tree model (CART)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

In [62]:
#evaluate the classification tree model
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[51  8]
 [ 4  7]]
              precision    recall  f1-score   support

           0       0.93      0.86      0.89        59
           1       0.47      0.64      0.54        11

    accuracy                           0.83        70
   macro avg       0.70      0.75      0.72        70
weighted avg       0.85      0.83      0.84        70



CART accuracy score : 0.83

In [63]:
#import
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
#create object of the classifier
neigh = KNeighborsClassifier(n_neighbors=3)
#Train
neigh.fit(X_train, y_train)
#predict
pred = neigh.predict(data_test)
#evaluate
print ("KNeighbors accuracy score : ",accuracy_score(y_test, pred))

KNeighbors accuracy score :  0.7714285714285715


In [64]:
#import
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
#create an object of type LinearSVC
svc_model = LinearSVC(random_state=0)
#train the algorithm on training data and predict using the testing data
pred = svc_model.fit(X_train, y_train).predict(data_test)
#print the accuracy score of the model
print("LinearSVC accuracy : ",accuracy_score(y_test, pred, normalize = True))

LinearSVC accuracy :  0.8428571428571429


In [65]:
#import
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
#create an object of the type GaussianNB
gnb = GaussianNB()
#train the algorithm on training data and predict using the testing data
pred = gnb.fit(X_train, y_train).predict(data_test)
#print the accuracy score of the model
print("Naive-Bayes accuracy : ",accuracy_score(y_test, pred, normalize = True))

Naive-Bayes accuracy :  0.8428571428571429


When comparing these models, the SVC and Naive Bayes both slighly outperform CART but their accuracy is in question based on the model assumptions.

CART is the model I reccomend for use on this classification process