<a href="https://colab.research.google.com/github/jhphan/ML-Notebooks/blob/main/tcga-ov-ml-therapy-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate to gcloud: runs the gcloud CLI's application-default login
# as a shell command (the leading "!" hands the line to the shell).
!gcloud auth application-default login

In [2]:
# Update these variables before running the notebook.
# client_project: project ID handed to bigquery.Client when the client is created.
client_project = 'cgc-05-0051'
# cancer_type: TCGA project short name of the cohort (presumably the value
# matched against `project_short_name` in the clinical table -- TODO confirm).
cancer_type = 'TCGA-OV'
# project: presumably the BigQuery project hosting the ISB-CGC tables -- TODO confirm.
project = 'isb-cgc'

In [3]:
# Load dependencies
from google.cloud import bigquery
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [4]:
# BigQuery client shared by the queries in this notebook, bound to the
# project configured in `client_project`.
client = bigquery.Client(project=client_project)



Get gene expression data from the BigQuery table


In [26]:
# Fetch per-case expression of the signature genes together with a binary
# therapy-response label (1 = complete response, 0 = partial / progressive /
# stable disease). The result is in long format: one row per
# (sample, label, gene_name) with the max log(FPKM-UQ + 1) across samples.
#
# The cohort is bound as a query parameter from `cancer_type` (set in the
# configuration cell) instead of being hard-coded in the SQL text.
ge_query = """
  SELECT
    ge.case_barcode AS sample,
    labels.response_label AS label,
    ge.gene_name AS gene_name,
    -- Multiple samples may exist per case, take the max value
    MAX(LOG(ge.HTSeq__FPKM_UQ+1)) AS gene_expression
  FROM `isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression` AS ge
  INNER JOIN (
    SELECT
      *
    FROM (
      SELECT
        case_barcode,
        primary_therapy_outcome_success,
        CASE
          -- Complete Reponse    --> label as 1
          -- All other responses --> label as 0
          WHEN primary_therapy_outcome_success = 'Complete Remission/Response' THEN 1
          WHEN (
            primary_therapy_outcome_success IN (
              'Partial Remission/Response','Progressive Disease','Stable Disease'
            )
          ) THEN 0
        END AS response_label
        FROM `isb-cgc.TCGA_bioclin_v0.Clinical`
        WHERE
          project_short_name = @cancer_type -- Only the cohort selected via cancer_type
          AND primary_therapy_outcome_success IS NOT NULL
    )
  ) labels
  ON labels.case_barcode = ge.case_barcode
  WHERE gene_name IN ( -- 33 Gene signature, leave out PRSS2 (aka TRYP2)
    'RHOT1','MYO7A','ZBTB10','MATK','ST18','RPS23','GCNT1','DROSHA','NUAK1','CCPG1',
    'PDGFD','KLRAP1','MTAP','RNF13','THBS1','MLX','FAP','TIMP3','PRSS1','SLC7A11',
    'OLFML3','RPS20','MCM5','POLE','STEAP4','LRRC8D','WBP1L','ENTPD5','SYNE1','DPT',
    'COPZ2','TRIO','PDPR'
  )
  GROUP BY sample, label, gene_name
"""

# Bind the cohort as a typed parameter rather than interpolating it into SQL.
ge_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter('cancer_type', 'STRING', cancer_type),
    ]
)

ge_data = client.query(ge_query, job_config=ge_job_config).result().to_dataframe()
ge_data


Unnamed: 0,sample,label,gene_name,gene_expression
0,TCGA-29-2427,1,ZBTB10,12.332616
1,TCGA-25-2399,0,TRIO,12.260439
2,TCGA-24-1563,1,ENTPD5,11.090034
3,TCGA-25-2404,1,MATK,9.166014
4,TCGA-04-1530,1,TIMP3,11.379099
5,TCGA-10-0933,1,PDPR,11.596913
6,TCGA-25-1630,1,RHOT1,11.33276
7,TCGA-24-2289,1,DROSHA,12.715656
8,TCGA-23-1113,0,MATK,9.802521
9,TCGA-59-A5PD,1,LRRC8D,12.250964


In [36]:
# Reshape the long (sample, label, gene_name, gene_expression) table into one
# row per sample with one column per gene; `sample` and `label` are turned
# back into regular columns by reset_index().
ge_data_pivot = ge_data.pivot(
    index=['sample', 'label'],
    columns='gene_name',
    values='gene_expression',
).reset_index()

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
ge_data_pivot.info()

# Show only the first rows instead of dumping the whole frame.
ge_data_pivot.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 35 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   sample   264 non-null    object 
 1   label    264 non-null    int64  
 2   CCPG1    264 non-null    float64
 3   COPZ2    264 non-null    float64
 4   DPT      264 non-null    float64
 5   DROSHA   264 non-null    float64
 6   ENTPD5   264 non-null    float64
 7   FAP      264 non-null    float64
 8   GCNT1    264 non-null    float64
 9   KLRAP1   264 non-null    float64
 10  LRRC8D   264 non-null    float64
 11  MATK     264 non-null    float64
 12  MCM5     264 non-null    float64
 13  MLX      264 non-null    float64
 14  MTAP     264 non-null    float64
 15  MYO7A    264 non-null    float64
 16  NUAK1    264 non-null    float64
 17  OLFML3   264 non-null    float64
 18  PDGFD    264 non-null    float64
 19  PDPR     264 non-null    float64
 20  POLE     264 non-null    float64
 21  PRSS1    264 non

gene_name,sample,label,CCPG1,COPZ2,DPT,DROSHA,ENTPD5,FAP,GCNT1,KLRAP1,LRRC8D,MATK,MCM5,MLX,MTAP,MYO7A,NUAK1,OLFML3,PDGFD,PDPR,POLE,PRSS1,RHOT1,RNF13,RPS20,RPS23,SLC7A11,ST18,STEAP4,SYNE1,THBS1,TIMP3,TRIO,WBP1L,ZBTB10
0,TCGA-04-1331,1,10.539111,11.143231,10.009571,12.334575,10.781688,10.283354,12.588672,11.374252,12.638973,8.502774,12.673201,11.781499,11.173863,8.263564,11.334484,12.914012,11.122056,11.406349,11.924447,12.367228,11.273918,12.549913,16.004095,13.83526,10.512128,5.029529,9.41056,9.049248,12.284884,10.274116,11.829727,12.660331,11.903866
1,TCGA-04-1341,1,10.569199,11.980806,10.222321,12.484973,10.353219,9.944509,9.108526,10.554263,12.092996,10.115035,12.3016,12.945691,10.39911,11.439968,10.828685,12.535803,9.648688,10.857763,10.974823,12.379495,11.421387,12.033471,17.707094,16.086429,8.733232,0.0,8.155717,8.475682,12.389445,9.221587,11.501418,12.669757,11.696892
2,TCGA-04-1343,0,10.890308,12.631115,11.120903,11.822304,10.407624,10.748226,9.787761,9.737273,12.284605,11.183509,12.551777,12.246615,10.259087,9.632067,11.303465,13.258534,11.581108,11.22433,10.978964,11.755699,11.201657,12.219116,16.928277,16.167832,10.38576,6.017366,10.086793,8.861457,13.138837,11.055373,11.312023,13.080233,11.475221
3,TCGA-04-1347,1,10.718091,11.138805,9.646658,12.224155,9.966276,9.036678,10.764306,10.461862,13.021845,7.921795,12.622306,12.529089,10.573285,8.868582,10.401551,14.063014,11.287625,11.778924,11.173281,12.510965,11.8329,12.487581,17.593193,15.812298,8.321389,4.83755,7.716122,8.529101,12.319825,10.372511,11.251312,13.079971,11.069757
4,TCGA-04-1350,0,10.232808,10.439182,6.812439,12.370328,11.517169,0.0,10.621041,9.824635,12.264505,8.167888,11.824207,11.722005,10.575625,9.134887,9.395592,11.901866,9.983738,11.548242,10.484034,11.710467,11.743296,12.280039,17.672137,16.220792,11.480417,5.018684,5.264262,7.519461,10.966667,10.150531,11.526722,13.643295,12.604795
5,TCGA-04-1356,0,11.105739,11.186329,10.46077,12.133691,10.305486,10.215275,9.876985,11.57013,12.182387,9.474869,12.615892,12.121093,10.731033,10.463049,10.052049,11.668067,9.759041,10.080539,10.975385,13.031346,11.140847,12.582605,16.513217,14.535177,10.309926,5.481416,8.200149,8.466287,12.378987,9.015502,11.670936,12.835863,12.584512
6,TCGA-04-1361,0,10.612369,10.933169,8.088687,12.182003,10.872513,7.415769,11.524966,9.293005,12.092919,10.482661,12.323442,11.815045,10.419118,9.89664,10.192579,13.656734,10.363386,11.422839,10.699376,10.482222,11.377121,12.681935,16.30145,14.245391,9.567712,5.94341,9.301417,9.069132,13.521503,10.274618,11.276957,13.140158,11.415166
7,TCGA-04-1364,0,10.165883,10.359229,8.017052,12.153836,10.71543,6.902682,9.887658,10.102168,12.057047,7.370549,13.219701,12.357547,10.981638,9.011422,10.180239,11.94234,10.193979,11.619984,10.875693,12.078099,11.551157,12.359608,16.425143,15.52813,7.69899,4.002283,7.545581,8.259097,11.713764,10.202413,11.303401,12.330303,12.348752
8,TCGA-04-1365,1,10.417514,10.545824,8.422281,12.734106,10.691195,8.842034,10.726867,10.476481,12.571115,9.210443,12.759112,12.258327,11.234237,9.814794,10.121529,12.239835,9.905958,11.464068,11.690091,11.84248,11.229934,12.35004,16.420414,14.195807,10.615813,5.881127,8.779495,8.453133,12.058706,8.774769,11.488067,12.792773,12.315494
9,TCGA-04-1514,1,9.873516,10.533711,7.365841,12.621936,11.028765,8.460977,10.07817,10.928764,12.423648,7.52809,11.927324,11.985848,11.295664,11.062341,11.383891,12.535974,11.223519,12.736526,11.492937,11.516413,11.964629,11.722918,15.164296,15.123933,8.879413,3.997486,7.188567,8.608052,12.147,10.335974,12.8363,12.662514,11.56213


In [111]:
# Build the train/test matrices used by every model below.
# Result: `data` dict with keys 'train_x', 'train_y', 'test_x', 'test_y'.

# Sample barcodes are identifiers, not features: drop them.
ge_data_pivot_nosample = ge_data_pivot.drop(labels='sample', axis=1)

# 50/50 split with a fixed seed so the partition is reproducible; the test
# set is every row that was not drawn into the training sample.
train_data = ge_data_pivot_nosample.sample(frac=0.5, random_state=1).sort_index()
test_data = ge_data_pivot_nosample.drop(train_data.index)
assert len(train_data) + len(test_data) == len(ge_data_pivot_nosample)

# pop() removes the label column, leaving only gene-expression features in
# train_data / test_data.
data = dict()
data['train_y'] = train_data.pop('label')
data['test_y'] = test_data.pop('label')

# The scaler must be instantiated here: without it `scaler` is undefined on a
# fresh kernel. It is fitted on the training split only and then applied to
# the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
data['train_x'] = scaler.fit_transform(train_data)
data['test_x'] = scaler.transform(test_data)

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Logistic regression: rank test samples by the signed distance to the
# decision boundary and compute the ROC curve / AUC from those scores.
lr = LogisticRegression(max_iter=1000)
lr.fit(data['train_x'], data['train_y'])
pred = lr.decision_function(data['test_x'])
fpr, tpr, thresholds = metrics.roc_curve(data['test_y'], pred)
auc = metrics.auc(fpr, tpr)
print('lr auc:', auc, 'fpr:', fpr, 'tpr:', tpr, 'thresh:', thresholds)

# Random forest: a ROC curve needs continuous scores. Hard labels from
# predict() collapse the curve to a single operating point, so use the
# predicted probability of the positive class (column 1, label == 1).
# random_state pins the forest so the reported AUC is reproducible.
rf = RandomForestClassifier(random_state=1)
rf.fit(data['train_x'], data['train_y'])
pred = rf.predict_proba(data['test_x'])[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(data['test_y'], pred)
auc = metrics.auc(fpr, tpr)
print('rf auc:', auc, 'fpr:', fpr, 'tpr:', tpr, 'thresh:', thresholds)
      

auc: 0.741892254087376
auc: 0.741892254087376 fpr: [0.         0.         0.         0.02439024 0.02439024 0.04878049
 0.04878049 0.07317073 0.07317073 0.09756098 0.09756098 0.12195122
 0.12195122 0.14634146 0.14634146 0.17073171 0.17073171 0.19512195
 0.19512195 0.2195122  0.2195122  0.24390244 0.24390244 0.26829268
 0.26829268 0.29268293 0.29268293 0.34146341 0.34146341 0.36585366
 0.36585366 0.3902439  0.3902439  0.41463415 0.41463415 0.43902439
 0.43902439 0.48780488 0.48780488 0.53658537 0.53658537 0.58536585
 0.58536585 0.6097561  0.6097561  0.65853659 0.65853659 0.70731707
 0.70731707 0.73170732 0.73170732 0.95121951 0.95121951 0.97560976
 0.97560976 1.        ] tpr: [0.         0.01098901 0.17582418 0.17582418 0.25274725 0.25274725
 0.3956044  0.3956044  0.41758242 0.41758242 0.42857143 0.42857143
 0.43956044 0.43956044 0.48351648 0.48351648 0.49450549 0.49450549
 0.51648352 0.51648352 0.53846154 0.53846154 0.61538462 0.61538462
 0.62637363 0.62637363 0.63736264 0.63736264 0.65

In [115]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Support vector classifier (SVC defaults apart from C and gamma), trained on
# the scaled training split.
sv = svm.SVC(C=100, gamma=0.001)
sv.fit(X=data['train_x'], y=data['train_y'])

# Continuous decision scores for the test split; printed for inspection and
# then used as the ranking for the ROC curve.
pred = sv.decision_function(data['test_x'])
print(pred)

fpr, tpr, thresholds = metrics.roc_curve(y_true=data['test_y'], y_score=pred)
auc = metrics.auc(x=fpr, y=tpr)
print(
    'auc:', auc,
    'fpr:', fpr,
    'tpr:', tpr,
    'thresh:', thresholds,
)


[-0.32458632 -2.25811669 -0.53890999 -0.09372597  0.29517058  1.64919899
 -0.04412576  0.92508798 -0.62389982 -0.44696897  2.32819176  2.7515359
  2.15020502  1.25747519  0.37994827  1.45750523 -1.03151382  1.54356784
  1.970623    0.46554394  0.40344879 -0.38878075  0.25826014  1.97900549
  0.15736329  0.11156121 -1.34952207  1.7965553   0.72278105  0.10058439
  1.20205524  0.15042829  0.82575615  3.38889774  1.702792   -0.20991142
  0.62101711  1.86964192  1.87268517  2.00842552 -0.39962761  0.92440571
 -1.27520536  2.14979576  1.69527717  1.06624378  1.37536708  1.49912412
 -0.48296274 -0.19881182 -0.45136152  0.64240058  1.89685762  0.22409761
  1.1783178   1.80010785 -0.2099434   1.29974703 -0.67821068  2.51449502
  1.03083066  1.02654118  0.620451   -0.03420452  0.40138476 -0.26879511
  0.14769651 -0.68460086  1.39902904  1.33888282  0.12831301  0.98755342
  0.63141426 -1.20705895  2.5390692  -1.01900279  0.25116914  0.52493528
  0.18561978  0.87326979  0.93744399  0.64699071  0.

In [None]:
# Show the held-out labels next to the most recent scores in `pred`.
# `data` has no 'val_y' entry; the held-out labels are stored under 'test_y'.
data['test_y'], pred

In [None]:
# Build the DNN model: a fully connected binary classifier over the scaled
# gene-expression features (Keras functional API).

from keras.layers import Input, Dense, Dropout
from keras.models import Model

# One input unit per feature column of the training matrix.
input_features = data['train_x'].shape[1]

# Hidden stack: 64 -> 32 -> 16 -> 8 -> 4 ReLU units, with L2 weight
# regularisation on the two widest layers and dropout after the first.
inputs = Input(shape=(input_features,), name='input')
x = Dense(64, activation='relu', name='hidden1', kernel_regularizer='l2')(inputs)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu', name='hidden2', kernel_regularizer='l2')(x)
x = Dense(16, activation='relu', name='hidden3')(x)
x = Dense(8, activation='relu', name='hidden4')(x)
x = Dense(4, activation='relu', name='hidden5')(x)

# The sigmoid output must consume the last hidden layer (`x`). Wiring it to
# `inputs` would bypass every hidden layer and leave them out of the model.
prediction = Dense(1, activation='sigmoid', name='final')(x)

model = Model(inputs=inputs, outputs=prediction)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Train the network. `data` only holds train/test splits (it has no 'val_x' /
# 'val_y' entries), so the held-out test split is passed as validation data to
# monitor loss and accuracy after each epoch.
model.fit(
    x=data['train_x'],
    y=data['train_y'],
    batch_size=32,
    epochs=300,
    verbose=1,
    validation_data=(data['test_x'], data['test_y']),
)