<a href="https://colab.research.google.com/github/jhphan/ML-Notebooks/blob/main/tcga-ov-ml-therapy-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate to Google Cloud. The leading `!` makes the notebook run the
# gcloud CLI as a shell command rather than as Python.
# NOTE(review): presumably this is what lets the BigQuery client created in a
# later cell find application-default credentials -- confirm for your runtime.
!gcloud auth application-default login

In [None]:
# Update these variables before running the rest of the notebook.
# client_project: Google Cloud project handed to the BigQuery client.
client_project = 'cgc-05-0051'
# cancer_type: TCGA project short name of the cohort to analyze.
cancer_type = 'TCGA-OV'
# project: presumably the BigQuery project that hosts the ISB-CGC tables -- confirm.
project = 'isb-cgc'

In [None]:
# Load dependencies
from google.cloud import bigquery
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [None]:
# Open a BigQuery client bound to the configured Google Cloud project;
# every query in this notebook is issued through it.
client = bigquery.Client(project=client_project)

In [None]:
# Fetch long-format protein expression (one row per protein measurement) for
# the selected cohort, joined to a binary therapy-response label:
#   1 = 'Complete Remission/Response'
#   0 = 'Partial Remission/Response', 'Progressive Disease', 'Stable Disease'
# Any other non-NULL outcome value falls through the CASE and yields a NULL
# label.
#
# The hosting project and the cohort are taken from the `project` and
# `cancer_type` notebook variables instead of being hard-coded, so editing
# those variables actually changes what is queried. Table names cannot be
# bound as query parameters, hence the f-string for `project`; the cohort is
# passed as a named query parameter (@cancer_type).
protein_query = f"""
    SELECT
      pe.case_barcode AS sample,
      labels.response_label AS label,
      pe.protein_name AS protein_name,
      pe.protein_expression AS protein_expression
    FROM `{project}.TCGA_hg38_Data_v0.Protein_Expression` AS pe
    INNER JOIN (
      SELECT
        case_barcode,
        primary_therapy_outcome_success,
        CASE
          WHEN primary_therapy_outcome_success = 'Complete Remission/Response' THEN 1
          WHEN (
            primary_therapy_outcome_success IN (
              'Partial Remission/Response', 'Progressive Disease', 'Stable Disease'
            )
          ) THEN 0
        END AS response_label
      FROM `{project}.TCGA_bioclin_v0.Clinical`
      WHERE
        project_short_name = @cancer_type
        AND primary_therapy_outcome_success IS NOT NULL
    ) labels
    ON labels.case_barcode = pe.case_barcode
"""

# `protein_data` is the query job returned by client.query(), not a DataFrame.
protein_data = client.query(
    protein_query,
    job_config=bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter('cancer_type', 'STRING', cancer_type),
        ]
    ),
)

In [None]:
# NOTE(review): `protein_expression` is not defined in the cells shown here.
# This cell assumes a wide table with 'sample' and 'label' columns followed by
# one column per protein starting at 'p_AR' -- confirm where it is built.
protein_columns = protein_expression.loc[:, 'p_AR':]

# keep samples that have at least one measured protein (drop all-missing rows)
samples_notnull = protein_columns.notnull().any(axis=1)

# keep proteins that are measured in every sample (no NULL values), and always
# keep the 'sample' and 'label' columns.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
proteins_notnull = pd.concat([
    pd.Series([True, True], index=['sample', 'label']),
    protein_columns.notnull().all(axis=0),
])

# create a new filtered DF; copy so later in-place edits (e.g. .pop) operate on
# an independent frame rather than on a slice of `protein_expression`
protein_expression_filtered = protein_expression.loc[samples_notnull, proteins_notnull].copy()
protein_expression_filtered

In [None]:
# Drop the sample identifiers in place; only the label and the protein
# columns are used from here on.
protein_expression_filtered.pop('sample')

# 80% of the rows go to training; the remaining 20% are split evenly into
# validation and test sets (fixed seeds keep the split reproducible).
train_data = protein_expression_filtered.sample(frac=0.8, random_state=1)
val_data = protein_expression_filtered.drop(index=train_data.index)
test_data = val_data.sample(frac=0.5, random_state=1)
val_data = val_data.drop(index=test_data.index)

# Pull the label column out of each split; .pop also removes it from the
# feature frames.
data = {
    'train_y': train_data.pop('label'),
    'val_y': val_data.pop('label'),
    'test_y': test_data.pop('label'),
}

# Rank proteins by the absolute difference between the mean expression of the
# label-1 and label-0 training samples. All proteins are kept (no top-N cut);
# only their order changes.
is_label_1 = data['train_y'] == 1
is_label_0 = data['train_y'] == 0
train_fold_change = (
    train_data.loc[is_label_1, 'p_AR':].mean(axis=0)
    - train_data.loc[is_label_0, 'p_AR':].mean(axis=0)
).abs()
top_train_fold_change = train_fold_change.sort_values(ascending=False)
ranked_proteins = top_train_fold_change.index

# Standardize features using statistics fitted on the training split only,
# then apply the same transform to the validation and test splits.
scaler = StandardScaler()
data['train_x'] = scaler.fit_transform(train_data.loc[:, ranked_proteins])
data['val_x'] = scaler.transform(val_data.loc[:, ranked_proteins])
data['test_x'] = scaler.transform(test_data.loc[:, ranked_proteins])

# keep the fitted scaler alongside the data it produced
data['scaler'] = scaler

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Linear SVM baseline: fit on the training split, then score the predictions
# on the validation split.
clf = svm.LinearSVC(max_iter=2000).fit(data['train_x'], data['train_y'])
pred = clf.predict(data['val_x'])
acc = accuracy_score(y_true=data['val_y'], y_pred=pred)

# display the validation accuracy together with the fitted classifier
acc, clf


In [None]:
# display the true validation labels next to the classifier's predictions
data['val_y'], pred

In [None]:
# build DNN model: a fully connected binary classifier over the scaled
# protein features

from keras.layers import Input, Dense, Dropout
from keras.models import Model

# one input unit per feature column in the training matrix
input_features = data['train_x'].shape[1]

# build the network: five shrinking hidden layers, with L2 weight
# regularization on the two widest ones and dropout after the first
inputs = Input(shape=(input_features,), name='input')
x = Dense(64, activation='relu', name='hidden1', kernel_regularizer='l2')(inputs)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu', name='hidden2', kernel_regularizer='l2')(x)
x = Dense(16, activation='relu', name='hidden3')(x)
x = Dense(8, activation='relu', name='hidden4')(x)
x = Dense(4, activation='relu', name='hidden5')(x)

# The sigmoid output must consume the last hidden layer. Wiring it to `inputs`
# would leave every hidden layer out of the model graph and reduce the network
# to a plain logistic regression.
prediction = Dense(1, activation='sigmoid', name='final')(x)

model = Model(inputs=inputs, outputs=prediction)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Train on the training split and report loss/accuracy on the validation
# split after every epoch.
model.fit(
    x=data['train_x'],
    y=data['train_y'],
    batch_size=32,
    epochs=300,
    verbose=1,
    validation_data=(data['val_x'], data['val_y']),
)