<a href="https://colab.research.google.com/github/jhphan/ML-Notebooks/blob/main/tcga-ov-ml-therapy-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate to Google Cloud by running the gcloud CLI's
# application-default login flow (shell command, executed via the `!` escape).
# NOTE(review): presumably these are the credentials the BigQuery client
# created later in this notebook picks up -- confirm for your environment.
!gcloud auth application-default login

In [2]:
# Update these variables before running the notebook.
# Project handed to bigquery.Client when the client is created.
client_project = 'cgc-05-0051'
# Same value as the project_short_name the gene-expression query filters on.
cancer_type = 'TCGA-OV'
# Same value as the project prefix of the BigQuery tables the query reads.
project = 'isb-cgc'

In [3]:
# Load dependencies
from google.cloud import bigquery
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [4]:
# BigQuery client bound to the project configured above; later cells use it to run queries
client = bigquery.Client(project=client_project)



Get Gene Expression Data from BigQuery Table


In [None]:
# Fetch one row per (case, gene): log-scaled expression for the signature
# genes, joined to a binary therapy-response label from the clinical table.
# The cohort comes from `cancer_type` (config cell) through a query parameter
# rather than a literal embedded in the SQL.
ge_query = """
  SELECT
    ge.case_barcode AS sample,
    labels.response_label AS label,
    ge.gene_name AS gene_name,
    -- Multiple samples may exist per case, take the max value
    MAX(LOG(ge.HTSeq__FPKM_UQ+1)) AS gene_expression
  FROM `isb-cgc.TCGA_hg38_data_v0.RNAseq_Gene_Expression` AS ge
  INNER JOIN (
    SELECT
      *
    FROM (
      SELECT
        case_barcode,
        primary_therapy_outcome_success,
        CASE
          -- Complete Response                       --> label as 1
          -- Partial / Progressive / Stable Disease  --> label as 0
          -- Any other value                         --> NULL (no ELSE branch)
          WHEN primary_therapy_outcome_success = 'Complete Remission/Response' THEN 1
          WHEN (
            primary_therapy_outcome_success IN (
              'Partial Remission/Response','Progressive Disease','Stable Disease'
            )
          ) THEN 0
        END AS response_label
        FROM `isb-cgc.TCGA_bioclin_v0.Clinical`
        WHERE
          project_short_name = @cancer_type -- Only the configured cohort
          AND primary_therapy_outcome_success IS NOT NULL
    )
    -- Drop cases whose outcome matched none of the categories above, so no
    -- NULL labels reach the classifiers
    WHERE response_label IS NOT NULL
  ) labels
  ON labels.case_barcode = ge.case_barcode
  WHERE gene_name IN ( -- 33 Gene signature, leave out PRSS2 (aka TRYP2)
    'RHOT1','MYO7A','ZBTB10','MATK','ST18','RPS23','GCNT1','DROSHA','NUAK1','CCPG1',
    'PDGFD','KLRAP1','MTAP','RNF13','THBS1','MLX','FAP','TIMP3','PRSS1','SLC7A11',
    'OLFML3','RPS20','MCM5','POLE','STEAP4','LRRC8D','WBP1L','ENTPD5','SYNE1','DPT',
    'COPZ2','TRIO','PDPR'
  )
  GROUP BY sample, label, gene_name
"""

# Bind the cohort name as a named parameter (@cancer_type) instead of
# formatting it into the SQL string.
ge_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter('cancer_type', 'STRING', cancer_type),
    ]
)

# Run the query, wait for it to finish, and load the rows into a DataFrame
ge_data = client.query(ge_query, job_config=ge_job_config).result().to_dataframe()
ge_data


In [None]:
# Reshape from long format (one row per sample/gene pair) to wide format
# (one row per sample, one column per gene); sample and label are moved back
# out of the index so they are ordinary columns.
ge_data_pivot = (
    ge_data
    .pivot(index=['sample', 'label'], columns='gene_name', values='gene_expression')
    .reset_index(level=['sample', 'label'])
)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
ge_data_pivot.info()
ge_data_pivot

In [111]:
# Drop the sample identifier so only the label and the gene columns remain
ge_data_pivot_nosample = ge_data_pivot.drop(labels='sample', axis=1)

# Split rows 50/50 into train and test sets; the fixed random_state keeps the
# split identical across runs. Test rows are whatever was not sampled.
train_data = ge_data_pivot_nosample.sample(frac=0.5, random_state=1).sort_index()
test_data = ge_data_pivot_nosample.drop(train_data.index)
assert len(train_data) + len(test_data) == len(ge_data_pivot_nosample)

data = dict()
# pop() removes the label column in place, leaving only features in each frame
data['train_y'] = train_data.pop('label')
data['test_y'] = test_data.pop('label')

# Standardize the features. The scaler must be created here (it is not defined
# anywhere else); it is fit on the training rows only and then applied
# unchanged to the test rows.
scaler = StandardScaler()
data['train_x'] = scaler.fit_transform(train_data)
data['test_x'] = scaler.transform(test_data)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# --- Logistic regression ---
# Score the test rows with decision_function and compute the ROC curve / AUC
# from those scores.
lr = LogisticRegression(max_iter=1000)
lr.fit(data['train_x'], data['train_y'])
pred = lr.decision_function(data['test_x'])
fpr, tpr, thresholds = metrics.roc_curve(data['test_y'], pred)
auc = metrics.auc(fpr, tpr)
print('auc:', auc, 'fpr:', fpr, 'tpr:', tpr, 'thresh:', thresholds)

# --- Random forest ---
# random_state is fixed so the fitted forest (and the AUC below) is the same
# on every run; without it each execution gives different numbers.
rf = RandomForestClassifier(random_state=1)
rf.fit(data['train_x'], data['train_y'])
pred = rf.predict(data['test_x'])
pred_proba = rf.predict_proba(data['test_x'])
print('pred:', pred)
print('pred proba:', pred_proba)
# Column 1 of predict_proba is used as the score for the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(data['test_y'], pred_proba[:,1])
auc = metrics.auc(fpr, tpr)
print('auc:', auc, 'fpr:', fpr, 'tpr:', tpr, 'thresh:', thresholds)


In [115]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Support vector classifier: construct and fit on the training split in one
# expression (fit() returns the estimator itself).
sv = svm.SVC(C=100, gamma=0.001).fit(data['train_x'], data['train_y'])

# Decision-function scores for the test split, printed in full
pred = sv.decision_function(data['test_x'])
print(pred)

# ROC curve and AUC from those scores; `metrics` is the sklearn module
# imported by the preceding cell.
fpr, tpr, thresholds = metrics.roc_curve(data['test_y'], pred)
auc = metrics.auc(fpr, tpr)
print('auc:', auc, 'fpr:', fpr, 'tpr:', tpr, 'thresh:', thresholds)


[-0.32458632 -2.25811669 -0.53890999 -0.09372597  0.29517058  1.64919899
 -0.04412576  0.92508798 -0.62389982 -0.44696897  2.32819176  2.7515359
  2.15020502  1.25747519  0.37994827  1.45750523 -1.03151382  1.54356784
  1.970623    0.46554394  0.40344879 -0.38878075  0.25826014  1.97900549
  0.15736329  0.11156121 -1.34952207  1.7965553   0.72278105  0.10058439
  1.20205524  0.15042829  0.82575615  3.38889774  1.702792   -0.20991142
  0.62101711  1.86964192  1.87268517  2.00842552 -0.39962761  0.92440571
 -1.27520536  2.14979576  1.69527717  1.06624378  1.37536708  1.49912412
 -0.48296274 -0.19881182 -0.45136152  0.64240058  1.89685762  0.22409761
  1.1783178   1.80010785 -0.2099434   1.29974703 -0.67821068  2.51449502
  1.03083066  1.02654118  0.620451   -0.03420452  0.40138476 -0.26879511
  0.14769651 -0.68460086  1.39902904  1.33888282  0.12831301  0.98755342
  0.63141426 -1.20705895  2.5390692  -1.01900279  0.25116914  0.52493528
  0.18561978  0.87326979  0.93744399  0.64699071  0.

In [None]:
# Show the held-out labels next to the most recently computed `pred`.
# The data dict holds only train_*/test_* entries, so the test labels are used
# (there is no 'val_y' key).
data['test_y'], pred

In [None]:
# build DNN model

from keras.layers import Input, Dense, Dropout
from keras.models import Model

# One input unit per feature column of the scaled training matrix
input_features = data['train_x'].shape[1]

# build the network: a funnel of dense ReLU layers (64 -> 32 -> 16 -> 8 -> 4)
# with L2 weight regularization on the two widest layers and dropout after
# the first one
inputs = Input(shape=(input_features,), name='input')
x = Dense(64, activation='relu', name='hidden1', kernel_regularizer='l2')(inputs)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu', name='hidden2', kernel_regularizer='l2')(x)
x = Dense(16, activation='relu', name='hidden3')(x)
x = Dense(8, activation='relu', name='hidden4')(x)
x = Dense(4, activation='relu', name='hidden5')(x)
# The sigmoid output must consume the last hidden layer `x`; wiring it to
# `inputs` would leave every hidden layer out of the model graph.
prediction = Dense(1, activation='sigmoid', name='final')(x)
model = Model(inputs=inputs, outputs=prediction)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Train the network on the training split. The data dict only contains
# train_*/test_* entries, so the test split is what gets passed as
# validation_data.
# NOTE(review): the test split is therefore used for per-epoch monitoring;
# carve out a separate validation split if it must stay untouched for the
# final evaluation.
model.fit(
    x=data['train_x'], y=data['train_y'],
    batch_size=32, epochs=300, verbose=1,
    validation_data=(data['test_x'], data['test_y']),
)