### Begin by loading in Ccrcc data set from CPTAC

In [1]:
import os
os.chdir('/Users/kevinliu/desktop/qbio_490/QBIO_490_kevinliu/analysis_data')


In [2]:
import cptac
cptac.list_datasets()




Unnamed: 0_level_0,Description,Data reuse status,Publication link
Dataset name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brca,breast cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33212010/
Ccrcc,clear cell renal cell carcinoma (kidney),no restrictions,https://pubmed.ncbi.nlm.nih.gov/31675502/
Colon,colorectal cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/31031003/
Endometrial,endometrial carcinoma (uterine),no restrictions,https://pubmed.ncbi.nlm.nih.gov/32059776/
Gbm,glioblastoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33577785/
Hnscc,head and neck squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33417831/
Lscc,lung squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34358469/
Luad,lung adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/32649874/
Ovarian,high grade serous ovarian cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/27372738/
Pdac,pancreatic ductal adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34534465/


In [3]:
cptac.download(dataset = "Ccrcc")
ccrcc = cptac.Ccrcc()
ccrcc.list_data()

Below are the dataframes contained in this dataset and their dimensions:

clinical
	194 rows
	171 columns
CNV
	110 rows
	19285 columns
followup
	352 rows
	27 columns
medical_history
	370 rows
	4 columns
methylation
	107 rows
	15885 columns
phosphoproteomics
	194 rows
	81550 columns
phosphoproteomics_gene
	194 rows
	6127 columns
proteomics
	194 rows
	11710 columns
somatic_mutation
	8350 rows
	3 columns
transcriptomics
	185 rows
	19275 columns


In [4]:
#Loading in appropriate datasets within Ccrcc
clinical_data_og = ccrcc.get_clinical()
protein_data = ccrcc.get_proteomics()
protein_data.columns = protein_data.columns.get_level_values(0) 
rna_data = ccrcc.get_transcriptomics()

### Preprocessing data for analysis by machine learning algorithms later

In [6]:
import pandas as pd
import numpy as np

#masking to only have Stage I, II, or III tumors
tumor_mask_stage = np.where(clinical_data_og.tumor_stage_pathological != "Stage IV", True, False)
clinical_data_inter = clinical_data_og[tumor_mask_stage]
clinical_data = clinical_data_inter[clinical_data_inter.tumor_stage_pathological.notna()]
#clinical_data.tumor_stage_pathological.unique()
#clinical_data

In [7]:
#Masking RNA and protein data to reflect data availible in clinical data
rna_mask = rna_data.index.isin(clinical_data.index)
rna_data = rna_data[rna_mask]
rna_data = np.log2(rna_data) 
rna_data = rna_data.dropna(axis=1)
rna_data = rna_data.T.loc[~rna_data.T.index.duplicated(), :].T

protein_mask = protein_data.index.isin(clinical_data.index)
protein_data = protein_data[protein_mask]
protein_data = protein_data.dropna(axis=1)
protein_data = protein_data.T.loc[~protein_data.T.index.duplicated(), :].T


  result = func(self.values, **kwargs)


In [8]:
rna_mean = rna_data.mean()
protein_mean = protein_data.mean()

print(np.absolute(rna_mean).sort_values(ascending=False))
print(np.absolute(protein_mean).sort_values(ascending=False))

#Top 5 differentially expressed RNA genes : NUP98, NMT1, RAB8A, TNPO3, ZNF664
#Top 5 differentially expressed Proteins: MT1H, MT1G, MT1F, MT1E, MT1B


Name
MMP8            inf
SLC6A11         inf
SLC5A11         inf
IGFL2           inf
SLC5A5          inf
             ...   
GPR153     0.003480
ACSM1      0.002700
PXDNL      0.001824
GCSAM      0.001776
RGS7BP     0.000914
Length: 19275, dtype: float64
Name
MT1H      2.844733
MT1G      2.590020
MT1F      2.432565
MT1E      2.283102
MT1B      2.145197
            ...   
LAP3      0.000167
AKAP1     0.000129
SCAMP4    0.000120
QSOX1     0.000115
USP33     0.000018
Length: 6606, dtype: float64


In [9]:
#Combining genes and proteins of interest into one dataframe
feature_df = pd.DataFrame()

feature_df["NUP98"] = rna_data.loc[:, "NUP98"]
feature_df["NMT1"] = rna_data.loc[:, "NMT1"]     
feature_df = pd.merge(feature_df, rna_data.loc[:, "RAB8A"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, rna_data.loc[:, "TNPO3"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, rna_data.loc[:, "ZNF664"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, protein_data.loc[:, "MT1H"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, protein_data.loc[:, "MT1G"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, protein_data.loc[:, "MT1F"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, protein_data.loc[:, "MT1E"], left_index=True, right_index=True)
feature_df = pd.merge(feature_df, protein_data.loc[:, "MT1B"], left_index=True, right_index=True)

patient_stages = clinical_data.tumor_stage_pathological

patient_stages = pd.DataFrame(patient_stages)

In [10]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()
unencoded_columns = patient_stages
encoded_columns = encoder.fit_transform(unencoded_columns)
patient_stages = encoded_columns


In [11]:
from sklearn.preprocessing import StandardScaler
from umap import UMAP


scaler = StandardScaler()

embedding_unscaled = UMAP().fit_transform(feature_df)

scaled_data = scaler.fit_transform(feature_df)
embedding_scaled = UMAP().fit_transform(scaled_data)

  from .autonotebook import tqdm as notebook_tqdm


### Testing the different machine learning algorithms for best fit

In [12]:
from sklearn.neighbors import KNeighborsClassifier # default number of neighbors looked at is 5
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [13]:
#single run

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

patient_stages = np.array(patient_stages)

classifier = KNeighborsClassifier() # your choice of which classifier to use

X_train, X_test, y_train, y_test = train_test_split(scaled_data, patient_stages, train_size=0.7)

classifier.fit(X_train, y_train.ravel())

y_pred = classifier.predict(X_test)

In [14]:
#Loop of 10 runs for each classifier

regressors_names = [
    'KNeighborsClassifier', 
    'DecisionTreeClassifier', 
    'MLPClassifier', 
    'GaussianNB'
]

regressors = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    MLPClassifier(),
    GaussianNB()
]

regressors_perf = {
    0: [],
    1: [],
    2: [],
    3: []
}

for n in range(10):
    X_train, X_test, y_train, y_test = train_test_split(scaled_data, patient_stages, train_size=0.7)
    for i in range(len(regressors)):
        model = regressors[i]
        model.fit(X_train, y_train.ravel())
        y_pred = model.predict(X_test)

        difference = y_pred - y_test
        regressors_perf[i].append(abs(np.mean(difference)))

print('\nAfter 10 simulations, the average error for each regressor is as follows:')
for i in regressors_perf:
    print(f'\t{regressors_names[i]} : {np.mean(regressors_perf[i])}')






After 10 simulations, the average error for each regressor is as follows:
	KNeighborsClassifier : 0.2566666666666667
	DecisionTreeClassifier : 0.15666666666666668
	MLPClassifier : 0.18
	GaussianNB : 0.24333333333333335




### After 10 simulations, the average error for each regressor is as follows:
	- KNeighborsClassifier : 0.3068333333333333
	- DecisionTreeClassifier : 0.2131333333333333
	- MLPClassifier : 0.2469666666666667
	- GaussianNB : 0.22826666666666667

After running the multiple trials of each classifier, it seems like DecisionTreeClassifier is the best fit. Overall, when the training set was compared to the test set, it had the least average error. DecisionTreeClassifier had an average error around 0.213 with the next lowest classifier GaussianNB at 0.228.