In [1]:
# Run this cell once to install the libraries this notebook depends on.
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-manylinux2010_x86_64.whl (148.9 MB)
[K     |████████████████████████████████| 148.9 MB 95 kB/s s eta 0:00:01  |                                | 358 kB 8.2 MB/s eta 0:00:19     |▏                               | 952 kB 8.2 MB/s eta 0:00:19     |▍                               | 1.6 MB 8.2 MB/s eta 0:00:18     |▌                               | 2.3 MB 8.2 MB/s eta 0:00:18     |▋                               | 2.9 MB 8.2 MB/s eta 0:00:18     |██▍                             | 11.1 MB 8.2 MB/s eta 0:00:17     |███████▉                        | 36.4 MB 15.3 MB/s eta 0:00:08     |█████████████████████████▊      | 119.9 MB 14.7 

# Foundation Medicine Adult Cancer Clinical Dataset (FM-AD)

### Objective
For each organ, a diagnosis has two aspects: the biospecimen data and the clinical data. We seek to predict the primary diagnosis using data from the biospecimen.

In short: classification of cancer based on biospecimen results.

In [2]:
"""
Libraries to import
"""
from os import listdir
from os.path import isfile, join, isdir
import pandas as pd
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [3]:
"""
The data is scattered across several folders inside the source directory.
It is compiled here into a manageable data structure containing organ-wise biospecimen and clinical records.
"""
def read_data(path_to_dataset):
    """Collect the clinical and biospecimen tables stored under ``path_to_dataset``.

    Every immediate sub-directory of ``path_to_dataset`` is inspected and the
    first file that ``listdir`` reports inside it is read as a tab-separated
    table. The grouping key is the second-to-last dot-separated component of
    the file name (for a name shaped like ``Clinical.X.Thyroid.tsv`` that is
    ``"Thyroid"``); the table kind is decided by whether the file name
    contains ``"Clinical"`` or ``"Biospecimen"``.

    Parameters
    ----------
    path_to_dataset : str
        Directory whose sub-directories each hold one data file.

    Returns
    -------
    dict
        ``{key: {"clinical": DataFrame, "biospecimen": DataFrame}}``. A table
        kind for which no file was found is an empty ``DataFrame``.
    """
    from os import listdir
    from os.path import basename, isdir, isfile, join
    import pandas as pd

    subdirs = [join(path_to_dataset, entry)
               for entry in listdir(path_to_dataset)
               if isdir(join(path_to_dataset, entry))]
    files_per_dir = [[join(subdir, entry) for entry in listdir(subdir) if isfile(join(subdir, entry))]
                     for subdir in subdirs]

    collected = {}
    for files in files_per_dir:
        if not files:
            # An empty sub-directory carries no table; skip it instead of failing on files[0].
            continue
        file_path = files[0]
        # basename() strips the directories with the platform's own separator, so the
        # key and the table kind depend on the file name only, never on the folders
        # that happen to lead to it.
        file_name = basename(file_path)
        name_parts = file_name.split(".")
        key = name_parts[-2] if len(name_parts) >= 2 else name_parts[0]
        tables = collected.setdefault(key, {"clinical": [], "biospecimen": []})
        if "Clinical" in file_name:
            tables["clinical"].append(pd.read_csv(file_path, sep="\t"))
        elif "Biospecimen" in file_name:
            tables["biospecimen"].append(pd.read_csv(file_path, sep="\t"))

    # Concatenate once per table: DataFrame.append no longer exists in pandas >= 2.0
    # and repeated appends copy the accumulated frame every time.
    return {
        key: {kind: (pd.concat(frames) if frames else pd.DataFrame())
              for kind, frames in tables.items()}
        for key, tables in collected.items()
    }

In [4]:
"""
Utility function to assign consistent labels to diagnosed cancer
"""
# Output classes for this notebook: the diagnosis names that can be extracted
# from the comma-separated "diagnoses.primary_diagnosis" text.
cancer = ['Papillary carcinoma', 'Carcinoma', 'Follicular carcinoma',
       'Medullary carcinoma', 'Astrocytoma', 'Glioblastoma',
       'Oligodendroglioma', 'Glioma', 'Meningioma', 'Ependymoma',
       'Not Reported', 'Medulloblastoma', 'Squamous cell carcinoma',
       'Adenocarcinoma', 'Melanoma', 'Neuroendocrine carcinoma',
       'Urothelial carcinoma', 'Adenoid cystic carcinoma',
       'Sarcomatoid carcinoma', 'Germ Cell Tumor', 'Mesothelioma',
       'Clear cell carcinoma', 'Renal cell carcinoma',
       'Collecting duct carcinoma', 'Papillary renal cell carcinoma',
       'Serous carcinoma', 'Endometrioid adenocarcinoma',
       'Carcinosarcoma', 'Clear cell adenocarcinoma',
       'Papillary serous adenocarcinoma', 'Infiltrating duct carcinoma',
       'Lobular carcinoma', 'Ductal carcinoma in situ',
       'Metaplastic carcinoma', 'Inflammatory carcinoma',
       'Cholangiocarcinoma', 'Hepatocellular carcinoma',
       'Adrenal cortical carcinoma', 'Pheochromocytoma', 'Neuroblastoma',
       'Esthesioneuroblastoma', 'Thymoma', 'Mucoepidermoid carcinoma',
       'Spindle cell carcinoma', 'Pituitary adenoma',
       'Granulosa cell tumor', 'Sex cord tumor', 'Mucinous carcinoma',
       'Non-small cell carcinoma', 'Small cell carcinoma',
       'Atypical carcinoid tumor', 'Adenosquamous carcinoma',
       'Large cell neuroendocrine carcinoma', 'Large cell carcinoma',
       'Carcinoid tumor', 'Merkel cell carcinoma', 'Basal cell carcinoma',
       'Adnexal carcinoma', 'Myoepithelial carcinoma', 'Duct carcinoma',
       'Acinar cell tumor', 'Gastrointestinal stromal tumor',
       'Basaloid carcinoma', 'Acinar adenocarcinoma',
       'Duct adenocarcinoma', 'Chordoma', 'Paraganglioma',
       'Acinar cell carcinoma', 'Solid pseudopapillary tumor']

# Qualifiers looked up under "cancer_type_1". Entries must not carry surrounding
# whitespace: get_class() strips every candidate label before the lookup, so a
# padded entry such as ' diffuse type' could never match.
cancer_type = ['anaplastic', 'malignant', 'diffuse type', 'intestinal type', 'undifferentiated']

# Qualifiers looked up under "cancer_type_2".
spec_type = ['NOS']

# Maps each derived column name to the list of labels accepted for it.
reference = {"cancer": cancer, "cancer_type_1": cancer_type, "cancer_type_2": spec_type}

def get_class(item, key):
    """Return the first comma-separated part of ``item`` that is a known label.

    Parameters
    ----------
    item : str
        Comma-separated diagnosis text, e.g. ``"Carcinoma, anaplastic, NOS"``.
        Anything that is not a string (for instance a missing value read as
        NaN) yields ``None``.
    key : str
        Which label list of ``reference`` to search: ``"cancer"``,
        ``"cancer_type_1"`` or ``"cancer_type_2"``.

    Returns
    -------
    str or None
        The matching label with surrounding whitespace removed, or ``None``
        when no part of ``item`` is in ``reference[key]``.

    Raises
    ------
    KeyError
        If ``item`` is a string and ``key`` is not a key of ``reference``.
    """
    if not isinstance(item, str):
        # Missing diagnoses arrive as float NaN, which has no .split().
        return None
    accepted = reference[key]
    for part in item.split(","):
        candidate = part.strip()
        if candidate in accepted:
            return candidate
    return None

In [5]:
"""
1. Merges biospecimen and clinical data
2. Removes duplicates
3. Fills missing values.
"""
def clean_data(data):
    """Merge, stack and label the per-key tables produced by ``read_data``.

    For every key the de-duplicated biospecimen and clinical tables are joined
    on the shared case/project columns, reduced to the feature columns used by
    this notebook and stacked into one frame. Three label columns (``cancer``,
    ``cancer_type_1``, ``cancer_type_2``) are then derived from
    ``diagnoses.primary_diagnosis`` with ``get_class`` and every remaining
    missing value is replaced by the string ``"NA"``.

    Parameters
    ----------
    data : dict
        ``{key: {"biospecimen": DataFrame, "clinical": DataFrame}}``.

    Returns
    -------
    pandas.DataFrame
        The stacked feature table with the three derived label columns. The
        row index of each per-key merge is kept as-is, so index values may
        repeat across keys. An empty ``data`` yields an empty frame that
        still has all the columns.
    """
    join_keys = ['case_id', 'cases.submitter_id', 'project_id', 'project.name',
                 'cases.disease_type', 'cases.primary_site']
    features = ["cases.disease_type", "cases.primary_site", "samples.tumor_descriptor",
                "aliquots.analyte_type", "slides.percent_tumor_nuclei",
                "diagnoses.primary_diagnosis", "diagnoses.classification_of_tumor",
                "diagnoses.age_at_diagnosis"]

    # Join biospecimen and clinical tables per key, removing duplicate rows first.
    per_key_frames = []
    for key in data:
        merged = pd.merge(
            data[key]["biospecimen"].drop_duplicates(),
            data[key]["clinical"].drop_duplicates(),
            on=join_keys,
            # With both suffixes falsy pandas refuses to merge frames that share
            # non-key column names instead of silently renaming them.
            suffixes=[False, False],
        )
        per_key_frames.append(merged[features])

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    if per_key_frames:
        full_data = pd.concat(per_key_frames)
    else:
        full_data = pd.DataFrame(columns=features)

    diagnosis_text = full_data["diagnoses.primary_diagnosis"]
    for label_column in ("cancer", "cancer_type_1", "cancer_type_2"):
        full_data[label_column] = diagnosis_text.apply(
            lambda text, label_column=label_column: get_class(text, label_column)
        )

    # Replace None/NaN with "NA"
    full_data = full_data.fillna("NA")

    return full_data

In [6]:
"""
Encode all strings to numbers
"""
# Column name -> LabelEncoder fitted by the most recent encode_data() call.
encoder = {}
def encode_data(full_data):
    """Turn the cleaned table into an all-numeric frame.

    The categorical columns are label-encoded, each with its own
    ``LabelEncoder`` that is stored in the module-level ``encoder`` dict; the
    two numeric columns are copied over and their ``"NA"`` placeholders are
    replaced by ``0``.

    Note that every call fits fresh encoders and overwrites the entries of
    ``encoder``, so the stored encoders always describe the data passed to
    the latest call.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Table holding all columns named in ``encode_cols`` and ``normal_cols``.

    Returns
    -------
    pandas.DataFrame
        One column per encoded/numeric column on a fresh ``0..n-1`` index.
    """
    encode_cols = ["cases.disease_type", "cases.primary_site", "samples.tumor_descriptor", "aliquots.analyte_type",
                   "cancer", "cancer_type_1", "cancer_type_2"]
    normal_cols = ["diagnoses.age_at_diagnosis", "slides.percent_tumor_nuclei"]

    # Gather all columns first and build the frame once; concatenating one
    # column at a time copies the growing frame on every step.
    columns = {}
    for col in encode_cols:
        encoder[col] = preprocessing.LabelEncoder()
        columns[col] = encoder[col].fit_transform(full_data[col])

    for col in normal_cols:
        # .values drops full_data's (possibly repeating) index so rows line up by position.
        columns[col] = full_data[col].values

    encoded_data = pd.DataFrame(columns)

    # infer_objects() makes sure a column that held "NA" strings ends up with a
    # numeric dtype; newer pandas no longer downcasts inside replace().
    encoded_data = encoded_data.replace("NA", 0).infer_objects()

    return encoded_data

In [7]:
"""
Separate input features from output features
"""
def get_input_output(encoded_data):
    """Split the encoded table into model inputs and prediction targets.

    A column counts as a target when its name contains the substring
    ``"cancer"``; every other column is an input feature. Column order is
    preserved within each group.

    Returns
    -------
    tuple
        ``(input_data, output_data)``, both selections of ``encoded_data``.
    """
    def is_target(column_name):
        return "cancer" in column_name

    target_columns = [name for name in encoded_data if is_target(name)]
    feature_columns = [name for name in encoded_data if not is_target(name)]

    return encoded_data[feature_columns], encoded_data[target_columns]

### Classification Model

In [8]:
# Location of the extracted FM-AD dataset (the folder whose sub-directories hold
# the clinical/biospecimen files). Change this single constant to point at your copy.
DATASET_DIR = "/home/jupyter-user/fm-ad-dataset/gdc-fm-ad-phs001179-2-open/datasets"

# Load -> clean -> encode -> split into model inputs and targets.
data = read_data(DATASET_DIR)
cleaned_data = clean_data(data)
encoded_data = encode_data(cleaned_data)
input_data, output_data = get_input_output(encoded_data)

In [9]:
"""
Create and train classification model
"""
# Target column the classifier is trained to predict.
output_label = "cancer"

# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the target so it is repeatable and keeps the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    input_data.values,
    output_data[output_label].values,
    test_size=0.2,
    random_state=2020,
    stratify=output_data[output_label].values,
)

def create_classifier(X_train, y_train):
    """Fit an XGBoost multi-class classifier on the given training data.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, one row per sample.
    y_train : array-like
        Integer class label for each row of ``X_train``.

    Returns
    -------
    XGBClassifier
        The fitted model.
    """
    # fit() hands back the estimator itself, so build and train in one expression.
    return XGBClassifier(objective="multi:softmax").fit(X_train, y_train)


model = create_classifier(X_train, y_train)

In [10]:
# Evaluate the trained model on the held-out test split.
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# zero_division=0 reports precision/recall/F1 as 0 for classes that were never
# predicted (the same numbers as before) without emitting the
# undefined-metric warning for each of them.
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.00      0.00      0.00         2
           2       1.00      1.00      1.00         6
           3       0.94      0.95      0.95      1274
           4       0.55      0.72      0.62        32
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         4
           7       0.82      0.82      0.82        11
           8       0.35      0.32      0.33        38
           9       0.08      0.20      0.12         5
          10       1.00      1.00      1.00         4
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         2
          13       0.98      0.98      0.98       439
          14       0.94      0.94      0.94        16
          15       0.64      0.81      0.72        54
          16       0.67      0.80      0.73         5
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
"""
One function to do it all. Returns a dataframe with the predicted diagnosis appended to the respective rows.
"""
def classify(model, path_to_dataset):
    """Predict the primary diagnosis for every record found under ``path_to_dataset``.

    Runs the whole pipeline (read, clean, encode, predict) and returns the
    cleaned table with a ``prediction`` column holding the predicted
    diagnosis name.

    ``encode_data`` fits fresh label encoders on whatever data it is given and
    overwrites the module-level ``encoder``. The encoders that were stored
    before this call (presumably those fitted when the training matrix was
    built) are therefore used to encode the categorical inputs and to decode
    the predictions, and are put back afterwards, so the integer codes mean
    the same thing to the model as they did during training. If no encoders
    were stored beforehand, the freshly fitted ones are used.

    Parameters
    ----------
    model : fitted classifier
        Object exposing ``predict`` that returns one class code per row.
    path_to_dataset : str
        Dataset directory, in the layout expected by ``read_data``.

    Returns
    -------
    pandas.DataFrame
        Output of ``clean_data`` plus the ``prediction`` column.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when an input column contains a
        category that the stored encoder has never seen.
    """
    data = read_data(path_to_dataset)
    cleaned_data = clean_data(data)

    # Snapshot before encode_data() replaces the stored encoders.
    trained_encoders = dict(encoder)
    encoded_data = encode_data(cleaned_data)
    input_data, _ = get_input_output(encoded_data)

    if trained_encoders:
        # Re-encode the categorical inputs with the pre-existing mapping.
        input_data = input_data.copy()
        for col in input_data.columns:
            if col in trained_encoders:
                input_data[col] = trained_encoders[col].transform(cleaned_data[col])
        # Restore the stored encoders so later calls keep using the same mapping.
        encoder.clear()
        encoder.update(trained_encoders)

    y_pred = model.predict(input_data.values)
    predictions = [round(value) for value in y_pred]

    cleaned_data["prediction"] = encoder["cancer"].inverse_transform(predictions)
    return cleaned_data

In [13]:
# Run the full pipeline on the dataset directory; the returned table (cleaned
# records plus the "prediction" column) is displayed as the cell output.
classify(model, "/home/jupyter-user/fm-ad-dataset/gdc-fm-ad-phs001179-2-open/datasets")

Unnamed: 0,cases.disease_type,cases.primary_site,samples.tumor_descriptor,aliquots.analyte_type,slides.percent_tumor_nuclei,diagnoses.primary_diagnosis,diagnoses.classification_of_tumor,diagnoses.age_at_diagnosis,cancer,cancer_type_1,cancer_type_2,prediction
0,Squamous Cell Neoplasms,Thyroid Gland,Metastatic,DNA,70,"Papillary carcinoma, NOS",metastasis,19047,Papillary carcinoma,,NOS,Papillary carcinoma
1,Squamous Cell Neoplasms,Thyroid Gland,Metastatic,DNA,80,"Papillary carcinoma, NOS",metastasis,32065,Papillary carcinoma,,NOS,Papillary carcinoma
2,"Epithelial Neoplasms, NOS",Thyroid Gland,Metastatic,DNA,90,"Carcinoma, anaplastic, NOS",metastasis,28417,Carcinoma,anaplastic,NOS,Carcinoma
3,Squamous Cell Neoplasms,Thyroid Gland,Metastatic,DNA,80,"Papillary carcinoma, NOS",metastasis,22157,Papillary carcinoma,,NOS,Papillary carcinoma
4,Adenomas and Adenocarcinomas,Thyroid Gland,Metastatic,DNA,80,"Follicular carcinoma, NOS",metastasis,15648,Follicular carcinoma,,NOS,Follicular carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...
724,Ductal and Lobular Neoplasms,Pancreas,Primary,DNA,10,"Duct adenocarcinoma, NOS",primary,21343,Duct adenocarcinoma,,NOS,Duct adenocarcinoma
725,"Epithelial Neoplasms, NOS",Pancreas,Metastatic,DNA,40,"Carcinoma, NOS",metastasis,20533,Carcinoma,,NOS,Carcinoma
726,"Epithelial Neoplasms, NOS",Pancreas,Metastatic,DNA,30,"Carcinoma, NOS",metastasis,21454,Carcinoma,,NOS,Carcinoma
727,Ductal and Lobular Neoplasms,Pancreas,Metastatic,DNA,30,"Duct adenocarcinoma, NOS",metastasis,21734,Duct adenocarcinoma,,NOS,Duct adenocarcinoma
