# Import Data / Tool Downloads

In [1]:
import cptac
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
os.chdir("/Users/mi/Desktop/QBIO/qbio_490_jeanneR/analysis_data")

In [3]:
cptac.download('CCRCC')

                                          

True

In [4]:
ccrcc = cptac.Ccrcc()

clinical = ccrcc.get_clinical() # Clinical Data
rna = ccrcc.get_transcriptomics() # Transcriptomic Data
protein = ccrcc.get_proteomics() # Proteomic Data
protein.columns = protein.columns.get_level_values(0) # Fix protein data columns


                                          

# pre-processing

In [20]:
#removal of NAs in tumor stage
na_mask = clinical.loc[:, 'tumor_stage_pathological'].isna()
masked_clinical = clinical.loc[~na_mask, :]
masked_rna = rna.loc[~na_mask, :]
masked_protein = protein.loc[~na_mask, :]

#log scaling of RNA data/replacing infinity with NA
masked_rna = masked_rna.apply(np.log2)
masked_rna = masked_rna.replace(-np.inf, np.nan)
masked_rna = masked_rna.replace(np.inf, np.nan)

#removal of NAs in protein / rna counts
gene_na_mask = masked_rna.isna().sum() == 0
masked_rna = masked_rna.loc[:, gene_na_mask]

protein_na_mask = masked_protein.isna().sum() == 0
masked_protein = masked_protein.loc[:, protein_na_mask]

# Step 1: most differentially expressed proteins and RNA

In [6]:
#create masks separating stage I and stage III
one_mask = masked_clinical.loc[:, 'tumor_stage_pathological'] == 'Stage I'
three_mask = masked_clinical.loc[:, 'tumor_stage_pathological'] == 'Stage III'

#top 5 most differentially expressed in protein
Pdiff = abs(masked_protein.loc[three_mask, :].mean() - masked_protein.loc[one_mask, :].mean())
Pshared_mask = Pdiff.index.isin(masked_rna.columns)
top5_protein = Pdiff.sort_values(ascending=False)[:5].index

top5_protein

Index(['FTL', 'HBZ', 'HBA2', 'CMA1', 'HBB'], dtype='object', name='Name')

In [7]:
#top 5 most differentially expressed in RNA
Rdiff = abs(masked_rna.loc[one_mask, :].mean() - masked_rna.loc[three_mask, :].mean())
Rshared_mask = Rdiff.index.isin(masked_protein.columns)
top5_rna = Rdiff[Rshared_mask].sort_values(ascending=False)[:5].index

top5_rna

Index(['DPEP1', 'FABP7', 'HP', 'IGF2', 'LGALS4'], dtype='object', name='Name')

# Step 2: making the new DF (our X data)

In [165]:
top5_protein

Index(['FTL', 'HBZ', 'HBA2', 'CMA1', 'HBB'], dtype='object', name='Name')

In [166]:
top5_rna

Index(['DPEP1', 'FABP7', 'HP', 'IGF2', 'LGALS4'], dtype='object', name='Name')

In [8]:
analysis_df = pd.DataFrame(
    columns=['FTL', 'HBZ', 'HBA2', 'CMA1', 'HBB','DPEP1', 'FABP7', 'HP', 'IGF2', 'LGALS4'], 
    index=masked_clinical.index)

#data filled in from proteins
analysis_df.loc[:, 'FTL'] = masked_protein.loc[:, 'FTL']
analysis_df.loc[:, 'HBZ'] = masked_protein.loc[:, 'HBZ']
analysis_df.loc[:, 'HBA2'] = masked_protein.loc[:, 'HBA2']
analysis_df.loc[:, 'CMA1'] = masked_protein.loc[:, 'CMA1']
analysis_df.loc[:, 'HBB'] = masked_protein.loc[:, 'HBB']

#data filled in from RNA
analysis_df.loc[:, 'DPEP1'] = masked_rna.loc[:, 'DPEP1']
analysis_df.loc[:, 'FABP7'] = masked_rna.loc[:, 'FABP7']
analysis_df.loc[:, 'HP'] = masked_rna.loc[:, 'HP']
analysis_df.loc[:, 'IGF2'] = masked_rna.loc[:, 'IGF2']
analysis_df.loc[:, 'LGALS4'] = masked_rna.loc[:, 'LGALS4']

analysis_df

Unnamed: 0_level_0,FTL,HBZ,HBA2,CMA1,HBB,DPEP1,FABP7,HP,IGF2,LGALS4
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C3L-00004,-1.724339,-0.370098,-0.786307,-0.918273,-0.762287,-1.239811,3.389993,-0.638359,-2.716588,-0.010823
C3L-00010,-0.363228,0.240576,-0.081263,0.74786,-0.034944,4.515644,1.642512,-1.587265,-0.046563,-0.826224
C3L-00011,-0.977364,-0.087641,-0.418663,-0.255054,-0.371128,-1.955016,3.150152,2.383658,0.242121,-2.455629
C3L-00026,1.30193,0.98193,0.780756,0.049086,0.855097,4.63543,-0.695299,-2.332735,-1.3459,-0.53051
C3L-00079,-1.496648,-0.441854,-1.014088,-0.620829,-0.939448,0.376214,3.146078,-0.307823,-3.112058,1.066195
...,...,...,...,...,...,...,...,...,...,...
C3N-01646,-1.81712,-0.724017,-1.180566,-0.000699,-1.127855,-0.946572,-3.295649,2.290917,-1.356234,0.687761
C3N-01648,0.459593,-0.396638,-0.672383,-0.611409,-0.605164,2.734819,-3.385171,-0.768065,2.005039,-1.333174
C3N-01649,-1.066512,-0.10963,-0.584516,0.322161,-0.527777,3.354027,0.200644,0.89752,1.843279,-1.020377
C3N-01651,-1.895398,-0.096922,-0.649448,-0.302394,-0.571897,-0.554359,8.011803,-1.366501,-0.953134,8.433732


# Step 3: making the list of tumor definitions

In [9]:
stage_list = masked_clinical['tumor_stage_pathological']
stage_list

Patient_ID
C3L-00004    Stage III
C3L-00010      Stage I
C3L-00011     Stage IV
C3L-00026      Stage I
C3L-00079    Stage III
               ...    
C3N-01646    Stage III
C3N-01648     Stage II
C3N-01649    Stage III
C3N-01651     Stage II
C3N-01808      Stage I
Name: tumor_stage_pathological, Length: 110, dtype: object

# Step 4: scaling and encoding

In [33]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()
unencoded_columns = masked_clinical.loc[:, ['tumor_stage_pathological']]
encoded_columns = encoder.fit_transform(unencoded_columns)
masked_clinical.loc[:, ['tumor_stage_pathological']] = encoded_columns #names the columns again

stage_list = masked_clinical['tumor_stage_pathological']
stage_list


Patient_ID
C3L-00004    2.0
C3L-00010    0.0
C3L-00011    3.0
C3L-00026    0.0
C3L-00079    2.0
            ... 
C3N-01646    2.0
C3N-01648    1.0
C3N-01649    2.0
C3N-01651    1.0
C3N-01808    0.0
Name: tumor_stage_pathological, Length: 110, dtype: object

In [24]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = scaler.fit_transform(analysis_df)

In [59]:
pd.DataFrame(scaled_data, index=analysis_df.index, columns=analysis_df.columns)


Unnamed: 0_level_0,FTL,HBZ,HBA2,CMA1,HBB,DPEP1,FABP7,HP,IGF2,LGALS4
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C3L-00004,-1.023771,-0.901621,-1.072661,-1.017216,-1.153689,-1.172213,0.377417,-0.191959,-1.001276,-0.321008
C3L-00010,0.004761,-0.150770,-0.296003,1.494525,-0.300597,1.322653,-0.084850,-0.583287,0.296542,-0.616494
C3L-00011,-0.459315,-0.554328,-0.667674,-0.017395,-0.694904,-1.482239,0.313971,1.054318,0.436862,-1.206959
C3L-00026,1.263046,0.760760,0.653575,0.441104,0.743321,1.374578,-0.703278,-0.890718,-0.335026,-0.509333
C3L-00079,-0.851714,-0.989849,-1.323579,-0.568812,-1.361479,-0.471701,0.312893,-0.055646,-1.193502,0.069282
...,...,...,...,...,...,...,...,...,...,...
C3N-01646,-1.093880,-1.336781,-1.506967,0.366051,-1.582459,-1.045100,-1.391157,1.016071,-0.340050,-0.067855
C3N-01648,0.626530,-0.934254,-0.947165,-0.554611,-0.969402,0.550704,-1.414838,-0.245450,1.293763,-0.800202
C3N-01649,-0.526680,-0.581364,-0.850373,0.852772,-0.878635,0.819117,-0.466272,0.441436,1.215137,-0.686851
C3N-01651,-1.153032,-0.565739,-0.921901,-0.088762,-0.930383,-0.875084,1.600038,-0.492244,-0.144115,2.739135


# Step 5: creating the train test split

In [67]:
from sklearn.model_selection import train_test_split


stage_list = np.array(stage_list)
stage_list=stage_list.astype('int')

X_train, X_test, y_train, y_test = train_test_split(scaled_data, stage_list, train_size=0.7)

np.array(X_test)

print(X_train.shape) # dataset split by train size
print(X_test.shape) # dataset not included in above dataset
print(y_train.shape) # labels for corresponding X_train
print(y_test.shape) # labels for corresponding X_test


(77, 10)
(33, 10)
(77,)
(33,)


# Step 6: classification and accuracy

In [74]:
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

classifiers_names = [
    'Nearest Neighbors', 
    'Decision Tree', 
    'MLPC',
    'GaussianNB'
]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    MLPClassifier(),
    GaussianNB()
]

classifiers_perf = {
    0: [],
    1: [],
    2: [],
    3: [],
}

for n in range(10):
    X_train, X_test, y_train, y_test = train_test_split(analysis_df, stage_list, train_size=0.7)
    np.array(X_test)
    for i in range(len(classifiers)):
        model = classifiers[i]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = sum(y_pred == y_test) / len(y_test)
        classifiers_perf[i].append(abs(np.mean(accuracy)))


        
print('\nAfter 10 simulations, the average accuracy for each regressor is as follows:')
for i in classifiers_perf:
    print(f'\t{classifiers_names[i]} : {np.mean(classifiers_perf[i])}')


After 10 simulations, the average accuracy for each regressor is as follows:
	Nearest Neighbors : 0.5363636363636364
	Decision Tree : 0.4242424242424242
	MLPC : 0.5575757575757576
	GaussianNB : 0.5303030303030303


# Step 7: comparison

According to the average accuracy, MLPC performs the best. However, the difference in performance between MLPC, KNeighbors, and GaussianNB is not that large. All 3 models get ~50-55% accuracy, which is still not very accurate.