In [19]:
import pandas as pd
import numpy as np

def getMatrix(met_data_pth:str, met_ref_pth:str, label_pth:str):
  
    # upload
    met_data = pd.read_csv(met_data_pth, sep='\t', index_col=0)
    met_ref_orig = pd.read_csv(met_ref_pth, sep = '\t')
    exp_orig = pd.read_hdf('/root/work/data/expression/sample_matrix_fc_gtex.h5',key='fc_sample_matrix')
    ppi_orig = pd.read_csv('/root/work/data/ppi_network', index_col=0)
    label_orig = pd.read_csv(label_pth)

    # split genes in met_ref
    met_ref_orig = met_ref_orig[['#id', 'gene']]
    met_ref_orig.columns = ['id', 'gene']
    met_ref_orig['gene'] = met_ref_orig['gene'].map(lambda x:x.split(','))
    met_ref_sp = np.dstack((np.repeat(met_ref_orig.id.values,list(map(len,met_ref_orig.gene.values))),np.concatenate(met_ref_orig.gene.values)))
    met_ref_sp = pd.DataFrame(data = met_ref_sp[0],columns = met_ref_orig.columns)

    # align genes
    glist1 = set(ppi_orig.columns.tolist())
    glist2 = set(met_ref_sp['gene'].tolist())
    glist3 = set(exp_orig.index.tolist())
    glist = glist1 & glist2 & glist3
    # refine
    met_ref = met_ref_sp.set_index('gene').T[glist].T
    exp = exp_orig.T[glist].T
    ppi = ppi_orig[glist].T[glist].sort_index().T.sort_index()

    # cut out patients
    met_data.columns = met_data.columns.map(lambda x :x.rsplit('-', 1)[0])
    exp.columns = exp.columns.map(lambda x :x.rsplit('-', 4)[0])

    # align patients
    plist1 = set(met_data.columns.tolist())
    plist2 = set(exp.columns.tolist())
    plist3 = set(label_orig['patient'].tolist())
    plist = plist1 & plist2 & plist3
    # refine
    met_data = met_data[plist].groupby(level=0, axis=1).mean()
    exp = exp[plist].groupby(level=0, axis=1).mean()
    label = label_orig.set_index('patient')[['BRCA_Subtype_PAM50']].T[plist].T.sort_index() #brac:'BRCA_Subtype_PAM50', stad:'Molecular.Subtype'

    # map genes to patients' data
    met_matrix = met_ref.reset_index().set_index('id').join(met_data, how='inner').groupby('gene').mean().sort_index(axis=1)
    exp_matrix = exp.T[glist].T

    return met_matrix, exp_matrix, ppi, label

In [17]:
import os
os.chdir('/root/work/data/BRCA')
os.listdir()

['TCGA-BRCA.htseq_fpkm-uq.tsv.gz',
 'gencode.v22.annotation.gene.probeMap',
 'brca.csv',
 'ppi_met_brca',
 'illuminaMethyl450_hg38_GDC',
 'TCGA-BRCA.methylation450.tsv.gz',
 'label_met_brca',
 'exp_matrix_brca']

In [20]:
met_data_pth = os.listdir()[5]
met_ref_pth = os.listdir()[4]
label_pth = os.listdir()[2]
met_matrix, exp_matrix, ppi, label = getMatrix(met_data_pth, met_ref_pth, label_pth)
## 2m30s

In [25]:
met_matrix.to_csv('/root/work/data/brca_met_matrix')
exp_matrix.to_csv('/root/work/data/brca_exp_matrix')
ppi.to_csv('/root/work/data/brca_ppi')
label.to_csv('/root/work/data/brca_label')

In [21]:
label

Unnamed: 0_level_0,BRCA_Subtype_PAM50
patient,Unnamed: 1_level_1
TCGA-3C-AAAU,LumA
TCGA-3C-AALI,Her2
TCGA-3C-AALJ,LumB
TCGA-3C-AALK,LumA
TCGA-4H-AAAK,LumA
...,...
TCGA-S3-AA17,LumB
TCGA-W8-A86G,LumA
TCGA-XX-A899,LumA
TCGA-XX-A89A,LumA


In [22]:
met_matrix

Unnamed: 0_level_0,TCGA-3C-AAAU,TCGA-3C-AALI,TCGA-3C-AALJ,TCGA-3C-AALK,TCGA-4H-AAAK,TCGA-5L-AAT0,TCGA-5L-AAT1,TCGA-5T-A9QA,TCGA-A1-A0SB,TCGA-A1-A0SE,...,TCGA-S3-AA10,TCGA-S3-AA11,TCGA-S3-AA12,TCGA-S3-AA14,TCGA-S3-AA15,TCGA-S3-AA17,TCGA-W8-A86G,TCGA-XX-A899,TCGA-XX-A89A,TCGA-Z7-A8R6
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.500193,0.648384,0.652031,0.628504,0.624439,0.489813,0.597930,0.676860,0.669689,0.654515,...,0.649354,0.653618,0.661610,0.601540,0.577192,0.644117,0.630100,0.602062,0.578375,0.576197
A2M,0.559654,0.607505,0.662360,0.727982,0.692364,0.678895,0.739322,0.782634,0.733085,0.763117,...,0.688911,0.786679,0.749555,0.689548,0.674185,0.753690,0.724407,0.775173,0.474026,0.591693
A4GALT,0.484800,0.550047,0.476107,0.556016,0.504870,0.520578,0.520201,0.497782,0.420809,0.543648,...,0.437017,0.559577,0.599268,0.519949,0.419363,0.468790,0.548482,0.511834,0.490493,0.471531
AAAS,0.218886,0.195403,0.220431,0.195497,0.137551,0.132440,0.161494,0.352224,0.080972,0.184608,...,0.105098,0.253432,0.273283,0.154658,0.083512,0.168486,0.147091,0.134264,0.180330,0.173260
AACS,0.694343,0.735782,0.667349,0.724165,0.728511,0.729063,0.715102,0.735271,0.672578,0.737307,...,0.681621,0.719413,0.708344,0.726225,0.707928,0.745882,0.739579,0.735378,0.695507,0.702027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDB,0.374230,0.435934,0.436083,0.436011,0.461552,0.459076,0.441082,0.552530,0.491248,0.442121,...,0.427244,0.470643,0.298607,0.410893,0.518882,0.508846,0.498620,0.486524,0.389045,0.337799
ZYG11B,0.512226,0.434024,0.449525,0.452280,0.511217,0.465069,0.497588,0.428527,0.481082,0.493436,...,0.440924,0.454441,0.445380,0.436188,0.484977,0.481925,0.470183,0.477143,0.450977,0.443132
ZYX,0.218678,0.233804,0.219681,0.231844,0.246778,0.217715,0.234620,0.266835,0.194870,0.226041,...,0.225685,0.266630,0.236318,0.253760,0.211091,0.229414,0.224694,0.216222,0.177629,0.230049
ZZEF1,0.482044,0.458847,0.482641,0.485944,0.490736,0.477654,0.482508,0.515038,0.390105,0.496053,...,0.446964,0.500840,0.509031,0.496905,0.435891,0.486330,0.483586,0.478826,0.479048,0.504242


In [23]:
ppi

Unnamed: 0,A1BG,A2M,A4GALT,AAAS,AACS,AADAT,AAGAB,AAK1,AAMDC,AAMP,...,ZSCAN9,ZSWIM7,ZW10,ZWILCH,ZWINT,ZXDB,ZYG11B,ZYX,ZZEF1,ZZZ3
A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZZEF1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
exp_matrix

Unnamed: 0_level_0,TCGA-3C-AAAU,TCGA-3C-AALI,TCGA-3C-AALJ,TCGA-3C-AALK,TCGA-4H-AAAK,TCGA-5L-AAT0,TCGA-5L-AAT1,TCGA-5T-A9QA,TCGA-A1-A0SB,TCGA-A1-A0SE,...,TCGA-S3-AA10,TCGA-S3-AA11,TCGA-S3-AA12,TCGA-S3-AA14,TCGA-S3-AA15,TCGA-S3-AA17,TCGA-W8-A86G,TCGA-XX-A899,TCGA-XX-A89A,TCGA-Z7-A8R6
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DOCK11,-3.782204,-2.692484,-2.144160,-2.235380,-2.204904,-2.174494,-2.002377,-5.744303,-4.210871,-2.672160,...,-1.830580,-4.295058,-4.116074,-2.417915,-1.185254,-1.396808,-3.512688,-0.732764,-0.401313,-3.709551
GPBP1L1,-0.270177,-0.780617,-1.100992,-0.100058,-0.270177,-0.100058,-0.180115,-1.020885,0.010005,-0.170103,...,-0.200123,0.150089,-0.360241,-0.510359,-0.490342,-0.100058,-0.160102,-0.440301,-0.200123,-0.570416
MLYCD,-0.020089,-0.652926,-0.884340,-1.398423,-0.924584,-0.793749,-0.642884,-0.140541,-0.552410,-1.307606,...,-1.307606,-0.281112,-1.126085,-1.176478,-0.733387,-1.590348,-0.954820,-0.944737,-0.733387,-2.106923
GMPS,0.420412,0.410402,0.050057,-0.110138,0.010007,-0.650940,-0.590831,0.270281,0.380381,0.720645,...,1.220938,-0.360466,-0.370483,0.350358,1.010828,0.580544,-0.721062,-0.160206,-0.260337,0.400398
CYBA,-1.050825,0.570252,2.840661,0.600258,0.940368,1.220438,1.540504,1.430483,-0.260154,-0.400251,...,1.680529,-1.371220,-1.180981,-0.500321,3.000672,1.030390,0.540239,1.390474,2.040581,0.870347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOXF2,-1.519530,-0.400458,0.275910,0.712786,0.793972,1.581962,0.570683,-4.041899,-3.679938,0.520025,...,-0.648276,-1.149341,-2.324261,1.238996,1.390350,-1.012711,-0.133181,0.651986,0.783756,0.631709
AADAT,-3.534986,-1.939074,-0.721462,-0.792672,-0.609258,-1.243888,-1.357296,-0.091190,1.109447,-1.688708,...,1.300432,-1.326202,-1.233399,-0.233190,0.424519,-1.491342,-1.522673,-1.028007,-0.751979,-1.761593
FRMD4B,-1.366206,-1.577769,-0.973797,-1.195028,-1.024041,-0.802909,-0.632128,-4.050080,0.230594,-0.883277,...,-2.052367,-1.698815,-1.255464,-1.466859,-0.531750,-1.527370,-0.672322,-0.682342,-0.351097,-2.021959
EIF5A2,0.010075,-0.575565,-1.766808,0.110821,-0.363268,-0.342954,-1.459488,-0.666576,0.000000,-0.131077,...,-0.899737,-2.386988,-3.017831,-1.102866,-0.181459,-1.787443,-2.324771,-0.454187,-0.181459,-2.428581
