<a href="https://colab.research.google.com/github/jhphan/ML-Notebooks/blob/main/tcga-scikit-learn-logistic-regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# authenticate to gcloud
# The leading `!` runs the gcloud CLI as a shell command from the notebook.
# NOTE(review): presumably this provides the application-default credentials
# that the BigQuery client created later in the notebook relies on — confirm.
!gcloud auth application-default login

In [85]:
# Update these variables before running the rest of the notebook.

# Project prefix of the TCGA tables referenced in the SQL queries
project = "isb-cgc-bq"

# Value matched against proj__project_id when selecting clinical records
cancer_type = "TCGA-GBM"

# Project handed to bigquery.Client when the client is created
client_project = "cgc-05-0051"

In [3]:
# Load dependencies
from google.cloud import bigquery
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [None]:
# Instantiate the BigQuery client used by every query below,
# passing `client_project` as its project
client = bigquery.Client(project=client_project)

In [None]:
# Count the clinical records of the selected cancer type per recorded gender
gender_count_sql = ("""
    SELECT count(demo__gender) as number, demo__gender from `{}.TCGA.clinical_gdc_current` where proj__project_id = '{}' group by demo__gender
  """).format(project, cancer_type)
gender_count_job = client.query(gender_count_sql)
demo_gender = gender_count_job.result().to_dataframe()
demo_gender

In [None]:
# Build the column list for the PIVOT clause used in the next query: one string
# of the form ("p_<name>", "p_<name>", ...) containing every distinct protein
# name, with "-" replaced by "_" and a "p_" prefix added
protein_names_sql = ("""
    SELECT CONCAT(
      '("',
      STRING_AGG(
        DISTINCT CONCAT('p_', REPLACE(protein_name, "-", "_")), '", "'
      ),
      '")'
    ) AS protein_names 
    FROM `{}.TCGA.protein_expression_hg38_gdc_current`
  """).format(project)
protein_names_df = client.query(protein_names_sql).result().to_dataframe()
# the query returns a single row; keep only the aggregated string
protein_names = protein_names_df.loc[0, 'protein_names']
protein_names

In [None]:
# Build one row per case with a binary label (female = 1, male = 0) and one
# column per protein: protein expression rows are inner-joined to the clinical
# records of the selected cancer type (only 'male'/'female' records are kept),
# then pivoted on protein name, taking MAX(protein_expression) per cell
pivot_sql = ("""
    SELECT * FROM (
      SELECT 
        pe.case_barcode AS sample,
        labels.gender AS label,
        CONCAT('p_', REPLACE(pe.protein_name, "-", "_")) AS protein_name,
        pe.protein_expression AS protein_expression
      FROM `{}.TCGA.protein_expression_hg38_gdc_current` AS pe
      INNER JOIN (
        SELECT *
        FROM (
          SELECT
            submitter_id,
            demo__gender,
            CASE
              WHEN demo__gender = 'female' THEN 1
              WHEN demo__gender = 'male' THEN 0
            END AS gender
          FROM `{}.TCGA.clinical_gdc_current`
          WHERE proj__project_id = '{}'
        )
        WHERE demo__gender IN ('male', 'female')
      ) labels
      ON labels.submitter_id = pe.case_barcode
    )
    PIVOT (
      MAX(protein_expression) FOR protein_name IN {}
    )
    ORDER BY sample ASC
  """).format(project, project, cancer_type, protein_names)
pivot_job = client.query(pivot_sql)
protein_expression = pivot_job.result().to_dataframe()
protein_expression

In [None]:
# The pivot query returns the 'sample' and 'label' columns plus one column per
# protein. Select the protein columns by excluding the two metadata columns
# instead of slicing from 'p_AR': the pivot column order is not guaranteed, so
# a label-based slice could silently skip proteins.
meta_columns = ['sample', 'label']
protein_values = protein_expression.drop(columns=meta_columns)

# find and remove samples that have no valid protein values (all missing)
samples_notnull = protein_values.notnull().any(axis=1)

# find and remove proteins that have any NULL values. Completeness is checked
# on the retained samples only; otherwise a single all-missing sample would
# mark every protein as incomplete and leave no features at all.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
proteins_notnull = pd.concat([
  pd.Series(True, index=meta_columns),
  protein_values.loc[samples_notnull].notnull().all(axis=0)
])

# create a new filtered DF (the metadata columns are always kept)
protein_expression_filtered = protein_expression.loc[samples_notnull, proteins_notnull]
protein_expression_filtered

In [109]:
# Display (effectively) all rows of a table. The fully qualified option name is
# used because newer pandas versions reject the bare 'max_rows' pattern as
# ambiguous (it matches more than one option).
pd.set_option('display.max_rows', 99999)

# remove sample names from table. Rebinding with errors='ignore' keeps this
# cell re-runnable; an in-place pop raises KeyError once the column is gone.
protein_expression_filtered = protein_expression_filtered.drop(columns='sample', errors='ignore')

# split data into train (80%), val (10%), and test (10%) sets;
# fixed random_state values keep the split reproducible
train_data = protein_expression_filtered.sample(frac=0.8, random_state=1).sort_index()

val_data = protein_expression_filtered.drop(train_data.index)
test_data = val_data.sample(frac=0.5, random_state=1).sort_index()
val_data = val_data.drop(test_data.index)

# move the labels out of the feature tables (pop removes the 'label' column,
# leaving only protein columns in train_data / val_data / test_data)
data = dict()
data['train_y'] = train_data.pop('label')
data['val_y'] = val_data.pop('label')
data['test_y'] = test_data.pop('label')

# calculate fold change and get top 20 proteins: the score is the absolute
# difference between the label-1 and label-0 class means, computed on the
# training split only so that val/test data do not influence feature selection.
# All remaining columns are proteins, so no 'p_AR' slice is needed (that slice
# fails if p_AR was filtered out and skips columns if p_AR is not first).
train_fold_change = (
  train_data.loc[data['train_y']==1].mean(axis=0)
  - train_data.loc[data['train_y']==0].mean(axis=0)
).abs()
top_train_fold_change = train_fold_change.sort_values(ascending=False).head(20)

# standardize the selected proteins; the scaler is fitted on the training
# split only and then applied unchanged to the val and test splits
scaler = StandardScaler()
data['train_x'] = scaler.fit_transform(train_data.loc[:,top_train_fold_change.index])
data['val_x'] = scaler.transform(val_data.loc[:,top_train_fold_change.index])
data['test_x'] = scaler.transform(test_data.loc[:,top_train_fold_change.index])

In [None]:
# Show the (up to 20) highest-scoring proteins and their fold-change scores;
# these are the columns used as model features
print(top_train_fold_change)

In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression classifier (default hyperparameters) on the
# scaled training features; fit() returns the fitted estimator itself
lr = LogisticRegression().fit(data['train_x'], data['train_y'])

# Evaluate on the validation split only; the test split is left untouched here
pred = lr.predict(data['val_x'])
acc = accuracy_score(y_true=data['val_y'], y_pred=pred)
print('accuracy:', acc)



accuracy: 0.6875
