<a href="https://colab.research.google.com/github/jhphan/ML-Notebooks/blob/main/tcga-ov-ml-test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# authenticate to gcloud
# Runs the gcloud CLI through the notebook's shell escape (`!`). The command is
# interactive, so this cell must be run by hand before the BigQuery cells below.
# NOTE(review): presumably the bigquery.Client created later picks up these
# application-default credentials -- confirm for the target runtime.
!gcloud auth application-default login

In [3]:
# Update these variables before running the notebook.

# Project that hosts the TCGA tables referenced in the queries below
project = 'isb-cgc-bq'
# Value matched against proj__project_id when selecting clinical records
cancer_type = 'TCGA-OV'
# Project handed to bigquery.Client when the client is created
client_project = 'cgc-05-0051'

In [4]:
# Load dependencies
from google.cloud import bigquery
import pandas as pd

In [None]:
# BigQuery client used by every query in this notebook; it is bound to the
# configured client project
client = bigquery.Client(project=client_project)

In [13]:
# Build the column list for the pivot table: every distinct protein name,
# prefixed with "p_" and with "-" replaced by "_", rendered as one string of
# the form ("p_A", "p_B", ...)
protein_names_sql = ("""
    SELECT CONCAT(
      '("',
      STRING_AGG(
        DISTINCT CONCAT('p_', REPLACE(protein_name, "-", "_")), '", "'
      ),
      '")'
    ) AS protein_names 
    FROM `{}.TCGA.protein_expression_hg38_gdc_current`
  """).format(project)
protein_names_df = client.query(protein_names_sql).result().to_dataframe()
# the aggregate query yields a single row; keep its only value
protein_names = protein_names_df['protein_names'].iloc[0]
protein_names

'("p_AR", "p_PR", "p_Rb", "p_S6", "p_ATM", "p_Akt", "p_Bak", "p_Bax", "p_Bid", "p_Bim", "p_Lck", "p_NF2", "p_SCD", "p_SF2", "p_Src", "p_Syk", "p_TAZ", "p_YAP", "p_p21", "p_p27", "p_p53", "p_ACC1", "p_ASNS", "p_BRD4", "p_CD20", "p_CD26", "p_CD31", "p_CDK1", "p_COG3", "p_Chk1", "p_Chk2", "p_DJ_1", "p_Dvl3", "p_EGFR", "p_ERK2", "p_FASN", "p_G6PD", "p_GAB2", "p_HER2", "p_HER3", "p_IRS1", "p_JAB1", "p_JNK2", "p_Jak2", "p_Ku80", "p_LKB1", "p_MEK1", "p_MSH2", "p_MSH6", "p_PCNA", "p_PDK1", "p_PTEN", "p_Smac", "p_TFRC", "p_TSC1", "p_TTF1", "p_XBP1", "p_YB_1", "p_cIAP", "p_eEF2", "p_mTOR", "p_53BP1", "p_A_Raf", "p_ADAR1", "p_B_Raf", "p_BRCA2", "p_Bcl_2", "p_C_Raf", "p_CD49b", "p_DUSP4", "p_EPPK1", "p_ERCC1", "p_ERCC5", "p_ETS_1", "p_FoxM1", "p_GAPDH", "p_GATA3", "p_HSP70", "p_IRF_1", "p_MIG_6", "p_MYH11", "p_Mre11", "p_N_Ras", "p_PAI_1", "p_PARP1", "p_PDCD4", "p_PEA15", "p_PRDX1", "p_PREX1", "p_RBM15", "p_Rab11", "p_Rab25", "p_Rad50", "p_Rad51", "p_SETD2", "p_Smad1", "p_Smad3", "p_Smad4", "p_Sna

In [106]:
# Join clinical and protein expression data to create a labeled pivot table.
#
# Labels (inner CASE expression):
#   0 -> vital status 'Dead' with days_to_death below 5 * 365
#   1 -> vital status 'Dead' with days_to_death of at least 5 * 365, or
#        vital status 'Alive' with days_to_last_follow_up of at least 5 * 365
# Every other case gets a NULL label and is excluded by the
# `survival_label IS NOT NULL` filter before the join.
#
# The PIVOT produces one row per (sample, label) and one column per entry of
# `protein_names`; MAX() collapses multiple measurements of the same protein
# for the same case into a single value.
#
# Both tables are read from the configured `project`. Named placeholders keep
# the substitutions unambiguous; the expression table used to be hard-coded to
# one project while the clinical table followed the `project` variable.
protein_expression = client.query(
  ("""
    SELECT * FROM (
      SELECT
        pe.case_barcode AS sample,
        labels.survival_label AS label,
        CONCAT('p_', REPLACE(pe.protein_name, "-", "_")) AS protein_name,
        pe.protein_expression AS protein_expression
      FROM `{project}.TCGA.protein_expression_hg38_gdc_current` AS pe
      INNER JOIN (
        SELECT *
        FROM (
          SELECT
            submitter_id,
            demo__vital_status,
            demo__days_to_death,
            diag__days_to_last_follow_up,
            CASE
              WHEN demo__vital_status = 'Dead' AND demo__days_to_death < 365*5 THEN 0
              WHEN (
                (demo__vital_status = 'Dead' AND demo__days_to_death >= 365*5)
                OR (demo__vital_status = 'Alive' AND diag__days_to_last_follow_up >= 365*5)
              ) THEN 1
            END AS survival_label
          FROM `{project}.TCGA.clinical_gdc_current`
          WHERE proj__project_id = '{cancer_type}'
        )
        WHERE survival_label IS NOT NULL
      ) labels
      ON labels.submitter_id = pe.case_barcode
    )
    PIVOT (
      MAX(protein_expression) FOR protein_name IN {protein_names}
    )
    ORDER BY sample ASC
  """).format(
    project=project,
    cancer_type=cancer_type,
    protein_names=protein_names,
  )
).result().to_dataframe()
protein_expression

Unnamed: 0,sample,label,p_AR,p_PR,p_Rb,p_S6,p_ATM,p_Akt,p_Bak,p_Bax,p_Bid,p_Bim,p_Lck,p_NF2,p_SCD,p_SF2,p_Src,p_Syk,p_TAZ,p_YAP,p_p21,p_p27,p_p53,p_ACC1,p_ASNS,p_BRD4,p_CD20,p_CD26,p_CD31,p_CDK1,p_COG3,p_Chk1,p_Chk2,p_DJ_1,p_Dvl3,p_EGFR,p_ERK2,p_FASN,p_G6PD,p_GAB2,...,p_Axl,p_ENY2,p_GATA6,p_GCN5L2,p_SLC1A5,p_Myosin_IIa,p_Caspase_9,p_CK5,p_p63,p_E2F1,p_EZH2,p_Nrf2,p_CD274,p_CTLA4,p_KEAP1,p_LCN2a,p_MACC1,p_PD_L1,p_PDCD1,p_Napsin_A,p_PARP_Ab_3,p_Ret_pY905,p_Synaptophysin,p_Thymidilate_Synthase,p_Chromogranin_A_N_term,p_CA9,p_GYS,p_LDHA,p_LDHB,p_PKM2,p_PYGB,p_PYGL,p_PYGM,p_PYGB_AB2,p_GYS_pS641,p_HIF_1_alpha,p_Mitochondria,p_GYG_Glycogenin1,p_Complex_II_subunit30,p_Oxphos_complex_V_subunitb
0,TCGA-04-1335,0,-0.273046,0.655473,-0.059948,0.396466,0.471839,-0.360947,0.073951,-0.027833,0.220378,-0.169537,-0.207308,0.472585,-0.250574,0.069452,0.105560,0.807701,0.185348,-0.245572,0.169811,0.734501,0.378387,0.339863,-0.094780,-0.736922,0.105748,0.023813,0.179431,0.000000,-0.172312,0.107592,-0.122052,0.104710,-0.058025,0.088305,0.481404,-0.184973,0.465112,-0.618447,...,-0.246250,0.155728,-0.322177,-0.063724,-0.114690,0.161475,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,TCGA-04-1342,0,-0.454864,-0.279352,0.307657,-0.586464,-0.344297,-0.309244,0.014425,-0.084531,0.073717,-0.027273,0.163764,-0.312678,0.137515,-0.149619,0.007761,-0.293853,0.230295,-0.182857,0.437850,0.206107,-0.410178,0.327813,-0.274818,-0.576875,0.397169,0.234139,0.165179,0.206260,-0.237384,-0.097684,-0.011831,0.068046,0.017215,0.010933,-0.201780,-0.039214,0.312275,-0.164887,...,-0.052207,-0.323342,0.069502,0.210930,-0.478186,-0.261103,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,TCGA-04-1343,0,-0.812353,-0.115662,-0.186997,-0.017769,0.999080,-0.640257,-0.064718,0.214193,0.000000,-0.133464,-0.225404,0.015661,-0.094096,-0.276277,-0.049998,-0.261935,-0.113985,-0.120271,-0.151624,0.007576,-0.392061,0.390392,0.229451,-0.573896,-0.093023,0.076420,0.073038,-0.058808,-0.236441,-0.165233,0.401111,0.309949,0.059410,-0.057202,0.355117,-0.581117,0.066364,0.225440,...,0.060905,-0.353180,0.364148,-0.165845,-0.245247,0.051274,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,TCGA-04-1348,0,-0.204006,0.656353,0.153361,0.201853,0.001272,0.093862,0.021014,0.078214,0.274942,0.000205,0.603212,-0.001236,-0.220823,-0.170245,-0.247288,0.651018,-0.015465,-0.454123,0.122147,-0.365260,0.730890,0.111853,0.033802,0.557226,-0.171966,-0.162129,0.085110,0.630019,-0.044885,-0.127430,0.016262,0.098972,-0.213957,-0.390981,-0.216503,0.343462,-0.026268,-0.291124,...,0.189060,0.084445,-0.258488,0.257244,0.479030,-0.161060,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,TCGA-04-1349,0,-0.402298,-0.203032,-0.084841,-0.192992,0.127635,-0.014273,-0.009569,0.060733,-0.110762,0.599423,0.710612,0.269994,0.123238,-0.163776,-0.247338,0.152199,0.076972,0.088868,-0.186733,0.107346,0.742740,-0.254874,-0.103956,0.580279,0.000000,0.129442,-0.009636,-0.075983,-0.180238,-0.305958,-0.008633,0.382945,0.109137,-0.153002,-0.136994,-0.728293,-0.023969,-0.141958,...,0.086404,-0.156375,-0.139420,0.111413,0.069204,0.019398,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,TCGA-61-2612,0,-0.776812,0.087338,-0.070156,-0.134533,0.450552,0.000000,0.009048,0.099295,-0.008934,0.214266,0.391562,-0.087230,-0.207105,0.025773,0.145475,0.246677,0.006485,-0.386157,0.221902,-0.020325,0.177514,0.095346,-0.524317,-0.210354,-0.028488,-0.193579,-0.396098,0.307265,-0.118952,-0.051644,-0.241628,-0.198156,0.116604,0.362567,-0.054745,0.019763,-0.086951,0.305645,...,0.351722,0.180331,-0.600856,-0.185719,0.072589,0.421616,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
285,TCGA-61-2613,0,-0.306061,-0.481772,0.165202,-0.552728,-0.051152,0.067081,0.217886,0.351792,0.108980,-0.714210,0.131974,-0.340148,-2.065505,0.061596,0.185257,-0.634498,0.046911,-0.134043,0.173838,-0.001541,0.291845,-0.280591,-0.255930,-0.123881,0.292878,-0.087925,-0.895859,-0.287585,-0.248591,-0.121267,-0.486722,0.145079,-0.268257,0.029688,0.398041,0.030989,0.162011,-0.425031,...,0.346217,-0.889076,-1.534363,-1.132235,0.249797,-0.256018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
286,TCGA-61-2614,0,-0.677054,-0.621494,-0.154606,-0.757609,-0.125776,0.013161,0.001671,0.118831,-0.132100,-0.038402,0.285499,0.110970,-0.206647,0.013964,-0.002351,-0.281877,0.102228,-0.316937,0.399365,0.155778,-0.138287,0.058881,-0.393581,-0.492960,0.092365,-0.076735,-0.419810,0.137525,-0.085716,-0.099747,-0.359345,-0.044759,0.030451,0.113731,-0.189235,-0.023115,-0.106389,0.195218,...,0.328815,-0.302793,-0.261439,-0.046175,-0.485099,0.303292,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
287,TCGA-VG-A8LO,0,0.000393,-0.007970,0.020822,-0.284314,0.053684,-0.170763,0.296391,-0.188514,-0.265357,-0.450634,-0.040540,-0.033737,0.405014,0.357923,0.113846,-0.028972,-0.087254,0.268683,0.009163,-0.452158,0.821184,-0.079237,0.011494,-0.090661,-0.036131,-0.104876,0.344383,0.010923,0.017761,0.084584,0.254040,-0.088322,-0.039518,0.058969,0.160717,0.106766,0.221369,0.386031,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [126]:
# Drop unusable rows and columns from the pivot table:
#   * samples whose protein measurements are all missing
#   * proteins with a missing value in any remaining sample
# The identifier columns ('sample', 'label') are always kept.
id_columns = ['sample', 'label']
# Select the measurement columns by excluding the id columns, so the result
# does not depend on which protein column happens to come first
protein_values = protein_expression.drop(columns=id_columns)

# row mask: True for samples with at least one non-null protein value
samples_notnull = protein_values.notnull().any(axis=1)

# column mask: True for the id columns and for proteins that are complete.
# Completeness is judged on the retained samples only; otherwise a single
# all-missing sample would make every protein look incomplete.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
proteins_notnull = pd.concat([
  pd.Series(True, index=id_columns),
  protein_values.loc[samples_notnull].notnull().all(axis=0),
])

# create a new filtered DF
protein_expression_filtered = protein_expression.loc[samples_notnull, proteins_notnull]
protein_expression_filtered

Unnamed: 0,sample,label,p_AR,p_PR,p_Rb,p_S6,p_ATM,p_Akt,p_Bak,p_Bax,p_Bid,p_Bim,p_Lck,p_NF2,p_SCD,p_SF2,p_Src,p_Syk,p_TAZ,p_YAP,p_p21,p_p27,p_p53,p_ACC1,p_ASNS,p_BRD4,p_CD20,p_CD26,p_CD31,p_CDK1,p_COG3,p_Chk1,p_Chk2,p_DJ_1,p_Dvl3,p_EGFR,p_ERK2,p_FASN,p_G6PD,p_GAB2,...,p_HER2_pY1248,p_HER3_pY1289,p_NDRG1_pT346,p_PEA15_pS116,p_SHP_2_pY542,p_STAT3_pY705,p_STAT5_alpha,p_mTOR_pS2448,p_PARP_cleaved,p_PRAS40_pT246,p_beta_Catenin,p_c_Met_pY1235,p_p70S6K_pT389,p_Rb_pS807_S811,p_Rictor_pT1135,p_S6_pS235_S236,p_S6_pS240_S244,p_14_3_3_epsilon,p_ER_alpha_pS118,p_Tuberin_pT1462,p_p38_pT180_Y182,p_p62_LCK_ligand,p_4E_BP1_pT37_T46,p_GSK3_alpha_beta,p_JNK_pT183_pY185,p_MAPK_pT202_Y204,p_MEK1_pS217_S221,p_NF_kB_p65_pS536,p_PI3K_p110_alpha,p_PKC_alpha_pS657,p_PKC_delta_pS664,p_Transglutaminase,p_FOXO3a_pS318_S321,p_Myosin_IIa_pS1943,p_p90RSK_pT359_S363,p_IGF1R_pY1135_Y1136,p_PKC_pan_BetaII_pS660,p_Caspase_7_cleavedD198,p_Acetyl_a_Tubulin_Lys40,p_GSK3_alpha_beta_pS21_S9
0,TCGA-04-1335,0,-0.273046,0.655473,-0.059948,0.396466,0.471839,-0.360947,0.073951,-0.027833,0.220378,-0.169537,-0.207308,0.472585,-0.250574,0.069452,0.105560,0.807701,0.185348,-0.245572,0.169811,0.734501,0.378387,0.339863,-0.094780,-0.736922,0.105748,0.023813,0.179431,0.000000,-0.172312,0.107592,-0.122052,0.104710,-0.058025,0.088305,0.481404,-0.184973,0.465112,-0.618447,...,-0.339159,-0.178893,1.331814,0.017710,-0.557437,0.178060,0.028357,0.150470,-0.206254,-0.127033,-0.431882,0.115390,-0.160759,-0.804339,-0.252946,0.375944,0.236813,0.060983,0.058905,-0.396233,0.168464,0.028056,-0.048466,0.312429,0.281828,0.741087,0.082948,-1.407001,0.437694,0.825335,-0.096460,0.243821,0.322438,0.526129,-0.607325,-0.068938,-0.138731,1.653131,0.818933,-0.477782
1,TCGA-04-1342,0,-0.454864,-0.279352,0.307657,-0.586464,-0.344297,-0.309244,0.014425,-0.084531,0.073717,-0.027273,0.163764,-0.312678,0.137515,-0.149619,0.007761,-0.293853,0.230295,-0.182857,0.437850,0.206107,-0.410178,0.327813,-0.274818,-0.576875,0.397169,0.234139,0.165179,0.206260,-0.237384,-0.097684,-0.011831,0.068046,0.017215,0.010933,-0.201780,-0.039214,0.312275,-0.164887,...,-0.110993,-0.264136,0.158581,0.074474,0.001113,0.393934,0.270804,0.294532,-0.203075,-0.061406,-0.710550,-0.062630,-0.139000,0.224644,0.060161,-0.262808,-0.320358,-0.124606,-0.060838,-0.028149,0.498661,0.339931,-0.082461,-0.241279,0.508507,0.366694,0.087833,-0.210518,-0.061265,0.355921,0.054219,0.649424,0.465429,-0.087063,-0.344065,-0.032280,0.100753,0.087032,1.109947,0.154006
2,TCGA-04-1343,0,-0.812353,-0.115662,-0.186997,-0.017769,0.999080,-0.640257,-0.064718,0.214193,0.000000,-0.133464,-0.225404,0.015661,-0.094096,-0.276277,-0.049998,-0.261935,-0.113985,-0.120271,-0.151624,0.007576,-0.392061,0.390392,0.229451,-0.573896,-0.093023,0.076420,0.073038,-0.058808,-0.236441,-0.165233,0.401111,0.309949,0.059410,-0.057202,0.355117,-0.581117,0.066364,0.225440,...,-0.156587,-0.280530,0.143684,-0.146335,0.035849,0.205411,0.980761,0.262303,-0.190126,0.032810,0.167977,-0.254243,0.328657,0.358925,-0.098969,0.843283,0.482688,-0.281790,0.066026,-0.366648,0.156353,0.614416,0.315432,-0.061957,0.455016,1.226107,0.268519,-0.200132,0.006960,0.116279,-0.119450,0.105629,-0.029056,0.347015,0.178142,0.144897,0.441277,-0.219526,0.113292,0.422419
3,TCGA-04-1348,0,-0.204006,0.656353,0.153361,0.201853,0.001272,0.093862,0.021014,0.078214,0.274942,0.000205,0.603212,-0.001236,-0.220823,-0.170245,-0.247288,0.651018,-0.015465,-0.454123,0.122147,-0.365260,0.730890,0.111853,0.033802,0.557226,-0.171966,-0.162129,0.085110,0.630019,-0.044885,-0.127430,0.016262,0.098972,-0.213957,-0.390981,-0.216503,0.343462,-0.026268,-0.291124,...,-0.352401,-0.225012,-0.543618,-0.301129,-0.451915,0.374583,0.577296,0.160390,0.491646,-0.241347,-0.086655,0.062835,-0.045406,0.403121,-0.138935,0.251899,0.010971,-0.121825,-0.040124,-0.538325,-1.034615,1.207555,-0.033689,0.441053,-0.522041,-0.843406,-0.351060,-0.564693,0.277164,-0.319801,-0.130624,-0.157858,-0.154463,0.125667,-0.031132,-0.016402,-0.419550,2.326973,1.417407,-0.228444
4,TCGA-04-1349,0,-0.402298,-0.203032,-0.084841,-0.192992,0.127635,-0.014273,-0.009569,0.060733,-0.110762,0.599423,0.710612,0.269994,0.123238,-0.163776,-0.247338,0.152199,0.076972,0.088868,-0.186733,0.107346,0.742740,-0.254874,-0.103956,0.580279,0.000000,0.129442,-0.009636,-0.075983,-0.180238,-0.305958,-0.008633,0.382945,0.109137,-0.153002,-0.136994,-0.728293,-0.023969,-0.141958,...,-0.275401,-0.268980,0.233785,-0.041325,-0.082723,0.418192,0.323301,0.107734,-0.151178,-0.203139,0.250418,-0.137053,-0.095296,-0.254705,-0.015595,0.235535,0.145125,-0.118329,0.063703,-0.275653,-0.126398,-0.017040,1.010088,-0.450323,0.254229,0.132387,-0.058163,-0.177256,0.517278,0.171533,0.023870,-0.011295,0.091811,0.502718,-0.044020,0.064202,0.077196,0.794850,0.435950,-0.306676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,TCGA-61-2612,0,-0.776812,0.087338,-0.070156,-0.134533,0.450552,0.000000,0.009048,0.099295,-0.008934,0.214266,0.391562,-0.087230,-0.207105,0.025773,0.145475,0.246677,0.006485,-0.386157,0.221902,-0.020325,0.177514,0.095346,-0.524317,-0.210354,-0.028488,-0.193579,-0.396098,0.307265,-0.118952,-0.051644,-0.241628,-0.198156,0.116604,0.362567,-0.054745,0.019763,-0.086951,0.305645,...,0.149081,-0.272486,0.241186,-0.038942,0.076402,-0.320628,0.229824,-0.112301,-0.596632,0.151407,-0.097757,-0.250246,0.441870,0.535680,0.105017,0.545506,0.273425,-0.017625,-0.215968,0.182947,0.545647,-0.454598,0.456455,0.753104,0.875134,1.327171,0.711419,0.774714,-0.004425,-0.472439,-0.187888,0.675117,0.229382,0.534236,-0.006172,-0.136183,0.373718,0.795469,1.364479,0.994294
285,TCGA-61-2613,0,-0.306061,-0.481772,0.165202,-0.552728,-0.051152,0.067081,0.217886,0.351792,0.108980,-0.714210,0.131974,-0.340148,-2.065505,0.061596,0.185257,-0.634498,0.046911,-0.134043,0.173838,-0.001541,0.291845,-0.280591,-0.255930,-0.123881,0.292878,-0.087925,-0.895859,-0.287585,-0.248591,-0.121267,-0.486722,0.145079,-0.268257,0.029688,0.398041,0.030989,0.162011,-0.425031,...,0.034518,-0.073823,0.183414,0.025834,0.160589,-0.025260,0.109264,-0.738265,-0.793653,0.150077,0.468656,-0.335409,0.278050,-0.290875,0.249992,-0.044262,-0.344219,-0.067142,-0.224001,0.138434,0.819858,-0.583823,0.108646,-0.166542,0.332824,2.136209,0.884847,0.104166,0.071946,0.622356,-0.277270,1.131375,0.476895,-0.158713,0.165061,-0.012886,0.765851,0.454873,0.059135,0.786130
286,TCGA-61-2614,0,-0.677054,-0.621494,-0.154606,-0.757609,-0.125776,0.013161,0.001671,0.118831,-0.132100,-0.038402,0.285499,0.110970,-0.206647,0.013964,-0.002351,-0.281877,0.102228,-0.316937,0.399365,0.155778,-0.138287,0.058881,-0.393581,-0.492960,0.092365,-0.076735,-0.419810,0.137525,-0.085716,-0.099747,-0.359345,-0.044759,0.030451,0.113731,-0.189235,-0.023115,-0.106389,0.195218,...,0.014427,-0.102031,0.462043,-0.115999,0.325147,0.000000,1.018881,-0.141753,-0.517037,0.153797,-0.028260,-0.357954,0.567495,0.475102,0.398299,-0.094662,0.053356,-0.049961,-0.082308,0.609703,0.485053,0.296044,0.397314,0.035909,0.095578,1.222245,0.666592,0.656659,-0.155665,0.078717,-0.222370,0.683790,0.180387,0.542671,0.086257,-0.080777,0.407172,0.456994,0.878758,0.722858
287,TCGA-VG-A8LO,0,0.000393,-0.007970,0.020822,-0.284314,0.053684,-0.170763,0.296391,-0.188514,-0.265357,-0.450634,-0.040540,-0.033737,0.405014,0.357923,0.113846,-0.028972,-0.087254,0.268683,0.009163,-0.452158,0.821184,-0.079237,0.011494,-0.090661,-0.036131,-0.104876,0.344383,0.010923,0.017761,0.084584,0.254040,-0.088322,-0.039518,0.058969,0.160717,0.106766,0.221369,0.386031,...,-0.367535,-0.062990,0.847275,0.262164,0.338366,0.016521,-0.375254,0.359501,0.388872,0.126484,-0.096099,-0.069963,0.129274,-0.125613,-0.212500,0.963481,0.722070,0.075523,-0.184429,0.241986,-0.145313,0.476948,0.091112,-0.170223,0.088080,0.395261,-0.101836,-0.268788,0.053762,-0.119586,-0.239591,0.261412,-0.034532,0.118619,0.225501,0.100257,-0.001349,0.479691,-0.381227,-0.174953


In [127]:
# Randomly assign half of the filtered samples to the training set (fixed seed
# so the split is repeatable); all remaining samples form the test set
train_data = protein_expression_filtered.sample(frac=0.5, random_state=0)
in_train = protein_expression_filtered.index.isin(train_data.index)
test_data = protein_expression_filtered.loc[~in_train]

In [131]:
# Sum of the 0/1 labels in the test set, i.e. the number of test samples labeled 1
test_data['label'].sum()

51