# Uses AutoML to train a precision medicine document classifier on ClinicalTrials data

In [2]:
import pandas as pd
studies = pd.read_csv("20180712processedGoldStandardCT.tsv", sep = '\t', encoding='utf8')
studies.fillna("", inplace=True)
studies.head()

Unnamed: 0.1,Unnamed: 0,trec_topic_number,trec_doc_id,pm_rel_desc,disease_desc,gene1_annotation_desc,gene1_name,gene2_annotation_desc,gene2_name,gene3_annotation_desc,...,other_interventions,inclusion_criteria,mesh_terms_conditions,mesh_terms_interventions,trec_topic_disease,trec_topic_age,trec_topic_sex,trec_topic_other1,trec_topic_other2,trec_topic_other3
0,0,1,NCT00001188,Not PM,,,,,,,...,radiation therapy following surgery,Patients must have biopsy-proven soft tissue s...,Sarcoma,,Liposarcoma,38-year-old,male,GERD,,
1,1,1,NCT00001189,Not PM,,,,,,,...,radiotherapy,DISEASE CHARACTERISTICS: Biopsy-proven ...,Sarcoma;Soft Tissue Neoplasms,,Liposarcoma,38-year-old,male,GERD,,
2,2,1,NCT00002185,Not PM,,,,,,,...,,Patients must have: - HIV-po...,"HIV Infections;Sarcoma;Sarcoma, Kaposi",Nelfinavir,Liposarcoma,38-year-old,male,GERD,,
3,3,1,NCT00002466,Not PM,,,,,,,...,conventional surgery;radiation therapy,DISEASE CHARACTERISTICS: Diagnosis of peripher...,"Sarcoma;Kidney Neoplasms;Sarcoma, Ewing;Neuroe...",Cyclophosphamide;Ifosfamide;Isophosphamide mus...,Liposarcoma,38-year-old,male,GERD,,
4,4,1,NCT00002641,Not PM,,,,,,,...,filgrastim;adjuvant therapy;conventional surge...,DISEASE CHARACTERISTICS: - Histologi...,Sarcoma;Endometrial Neoplasms;Kidney Neoplasms...,Doxorubicin;Liposomal doxorubicin;Isophosphami...,Liposarcoma,38-year-old,male,GERD,,


In [3]:
feature_names = ["official_title",
                "brief_summary",
                "detailed_description",
                "study_design_info",
                "outcomes",
                "conditions",
                "arm_groups",
                "drug_interventions",
                "other_interventions",
                "inclusion_criteria",
                "mesh_terms_conditions",
                "mesh_terms_interventions"]

In [4]:
studies["conditions"]= studies["conditions"].str.replace(";", " ")
studies["mesh_terms_conditions"]= studies["mesh_terms_conditions"].str.replace(";", " ")
studies["mesh_terms_interventions"]= studies["mesh_terms_interventions"].str.replace(";", " ")
studies["drug_interventions"]= studies["drug_interventions"].str.replace(";", " ")
studies["other_interventions"]= studies["other_interventions"].str.replace(";", " ")

In [5]:
for i in range(1):
    for f in feature_names:
        print(f + ": " + str(studies.loc[i+1, f]), end="\n\n")

official_title: The Treatment of Grade I Sarcomas and Benign, Non-Metastasizing Highly Invasive Soft Tissue Tumors

brief_summary: Patients with Grade I soft tissue sarcomas or benign, non-metastasizing invasive soft tissue      tumors will receive wide local excision and be prospectively randomized as to either receive      or not receive radiation therapy.

detailed_description: This is a randomized study. Patients undergo surgical excision of all gross disease and then      are randomized to Arm I or Arm II.      Arm I: Radiotherapy. Involved-field irradiation.      Arm II: No further treatment.

study_design_info: Treatment

outcomes: Characterize the natural and clinical histories of inherited urologic malignant disorders.    Ongoing;Determine the genetic etiology of hereditary urologic malignant disorders in which the gene defect is unknown, by linkage analysis, positional cloning and evaluation of candidate genes.    Ongoing;Correlate specific mutations and their associated prot

In [6]:
studies['X'] = studies.apply(lambda r: ' '.join(r[feature] for feature in feature_names), axis=1)

In [7]:
studies["X"][1]

"The Treatment of Grade I Sarcomas and Benign, Non-Metastasizing Highly Invasive Soft Tissue Tumors Patients with Grade I soft tissue sarcomas or benign, non-metastasizing invasive soft tissue      tumors will receive wide local excision and be prospectively randomized as to either receive      or not receive radiation therapy. This is a randomized study. Patients undergo surgical excision of all gross disease and then      are randomized to Arm I or Arm II.      Arm I: Radiotherapy. Involved-field irradiation.      Arm II: No further treatment. Treatment Characterize the natural and clinical histories of inherited urologic malignant disorders.    Ongoing;Determine the genetic etiology of hereditary urologic malignant disorders in which the gene defect is unknown, by linkage analysis, positional cloning and evaluation of candidate genes.    Ongoing;Correlate specific mutations and their associated protein domains with disease phenotypic expression based on parameters including presenti

In [8]:
import numpy as np
np.unique(studies['pm_rel_desc'])

studies["pm"] = 0
studies["pm"][studies["pm_rel_desc"] == "Human PM"] = 1
studies["pm"][studies["pm_rel_desc"] == "Animal PM"] = 1
studies["pm"][:10]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: pm, dtype: int64

In [9]:
print(sum(studies.pm == 0))
print(sum(studies.pm == 1))

9480
3961


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(studies['X'], studies['pm'], test_size=0.25, random_state=33)
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (10080,)
y_train shape:  (10080,)
X_test shape:  (3361,)
y_test shape:  (3361,)


In [11]:
print("x_train instance: ", X_train[1])
print("y_train instance: ", y_train[1])

x_train instance:  The Treatment of Grade I Sarcomas and Benign, Non-Metastasizing Highly Invasive Soft Tissue Tumors Patients with Grade I soft tissue sarcomas or benign, non-metastasizing invasive soft tissue      tumors will receive wide local excision and be prospectively randomized as to either receive      or not receive radiation therapy. This is a randomized study. Patients undergo surgical excision of all gross disease and then      are randomized to Arm I or Arm II.      Arm I: Radiotherapy. Involved-field irradiation.      Arm II: No further treatment. Treatment Characterize the natural and clinical histories of inherited urologic malignant disorders.    Ongoing;Determine the genetic etiology of hereditary urologic malignant disorders in which the gene defect is unknown, by linkage analysis, positional cloning and evaluation of candidate genes.    Ongoing;Correlate specific mutations and their associated protein domains with disease phenotypic expression based on parameters 

In [12]:
import sklearn
from sklearn import preprocessing

#Encode from string to numbers
enc = preprocessing.LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.transform(y_test)

In [13]:
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

In [14]:
n_words = 10000
vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer,
                             stop_words=stopwords.words('english') + list(string.punctuation),lowercase = True, max_features = n_words)
tfidf = vectorizer.fit(X_train.values.astype('U'))

In [15]:
X_test = tfidf.transform(X_test.values.astype('U'))
X_train = tfidf.transform(X_train.values.astype('U'))

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (10080, 10000)
X_test shape:  (3361, 10000)


In [16]:
print(X_train[0])

  (0, 9895)	0.07640168464229442
  (0, 9866)	0.04877070868669573
  (0, 9861)	0.045401458785041926
  (0, 9656)	0.11695867860820577
  (0, 9580)	0.10587472379258316
  (0, 9537)	0.012837907129987473
  (0, 9467)	0.04763827782955141
  (0, 9365)	0.06359613962668724
  (0, 9307)	0.01611855311596489
  (0, 9290)	0.045406711966273966
  (0, 9215)	0.01880541809365138
  (0, 9086)	0.022006317895254486
  (0, 9043)	0.016304586580080414
  (0, 8859)	0.09223762659451608
  (0, 8854)	0.02256466564801664
  (0, 8761)	0.0312824710122498
  (0, 8737)	0.0361943721464876
  (0, 8695)	0.10769292970923393
  (0, 8673)	0.03755916071942743
  (0, 8541)	0.3956701599978626
  (0, 8540)	0.48537750524091133
  (0, 8391)	0.04717268938837295
  (0, 8015)	0.025690731254771736
  (0, 7982)	0.02379527058028806
  (0, 7967)	0.04315040222061069
  :	:
  (0, 2451)	0.04291117082412941
  (0, 2280)	0.021723226376840335
  (0, 2260)	0.15364505158882066
  (0, 2156)	0.02226319464940933
  (0, 2095)	0.11413265396999447
  (0, 2068)	0.1186980614301624

In [17]:
import autosklearn.classification
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=10800#1200#86400
)
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)
print("Final Models:", automl.show_models())

Final Models: [(0.200000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'categorical_encoding:__choice__': 'no_encoding', 'classifier:__choice__': 'adaboost', 'imputation:strategy': 'most_frequent', 'preprocessor:__choice__': 'liblinear_svc_preprocessor', 'rescaling:__choice__': 'standardize', 'classifier:adaboost:algorithm': 'SAMME', 'classifier:adaboost:learning_rate': 0.4305533777913843, 'classifier:adaboost:max_depth': 4, 'classifier:adaboost:n_estimators': 450, 'preprocessor:liblinear_svc_preprocessor:C': 1.7691037850299067, 'preprocessor:liblinear_svc_preprocessor:dual': 'False', 'preprocessor:liblinear_svc_preprocessor:fit_intercept': 'True', 'preprocessor:liblinear_svc_preprocessor:intercept_scaling': 1, 'preprocessor:liblinear_svc_preprocessor:loss': 'squared_hinge', 'preprocessor:liblinear_svc_preprocessor:multi_class': 'ovr', 'preprocessor:liblinear_svc_preprocessor:penalty': 'l1', 'preprocessor:liblinear_svc_preprocessor:tol': 0.0004769684828846608},
data

In [24]:
print("Confusion Matrix: \n", sklearn.metrics.confusion_matrix(y_test, y_hat, labels=[1,0]))
print("Precision: ", sklearn.metrics.precision_score(y_test, y_hat))
print("Recall: ", sklearn.metrics.recall_score(y_test, y_hat))
print("Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_hat))

Confusion Matrix: 
 [[ 568  412]
 [ 351 2030]]
Precision:  0.6180631120783461
Recall:  0.5795918367346938
Accuracy:  0.7729842308836655


In [25]:
import pickle
VECTORIZER_NAME = "tfidfmodel_studies.sav"
pickle.dump(tfidf, open(VECTORIZER_NAME, 'wb'))

In [20]:
MODELNAME = "trec_model_studies.sav"
pickle.dump(automl, open(MODELNAME, 'wb'))
 
# load the model from disk
loaded_model = pickle.load(open(MODELNAME, 'rb'))
y_hat2 = loaded_model.predict(X_test)
print("Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_hat2))

Accuracy:  0.7729842308836655
