# Processing ATAC-seq and DNA Methylation data
### Initial processing derived from: https://github.com/clorislili/MLFG

In [38]:
import re 

import numpy as np
import pandas as pd
from tqdm import tqdm



### Raw ATAC-seq data

In [39]:
# Directory holding every input and output file of this notebook.
DATADIR = '../'

# Tab-separated ATAC-seq table; the `sample` column carries the peak ids.
atacseq_path = DATADIR + "brca-brca_peak_Log2Counts_dedup"
atacseq_data = pd.read_csv(atacseq_path, sep='\t')
print(atacseq_data.shape)
atacseq_data.head()

(215920, 75)


Unnamed: 0,sample,TCGA-A2-A0YC-01A,TCGA-BH-A0DL-01A,TCGA-A8-A06N-01A,TCGA-C8-A130-01A,TCGA-A8-A094-01A,TCGA-AR-A0U0-01A,TCGA-AO-A03L-01A,TCGA-C8-A137-01A,TCGA-C8-A12V-01A,...,TCGA-A2-A0ET-01A,TCGA-A2-A0CX-01A,TCGA-AO-A12F-01A,TCGA-A2-A0YJ-01A,TCGA-A8-A08J-01A,TCGA-AO-A124-01A,TCGA-A2-A0SW-01A,TCGA-4H-AAAK-01A,TCGA-AQ-A04L-01A,TCGA-A2-A0YL-01A
0,BRCA_2,0.327265,0.664015,0.89019,-0.104723,0.442554,0.214903,0.786471,-0.068554,0.343639,...,-0.388585,-0.002186,-0.947975,0.320434,-0.014448,0.794401,0.524688,0.324844,0.784515,0.01539
1,BRCA_3,1.149027,1.865047,2.169195,1.264818,1.528699,1.772169,0.976686,1.960191,1.39768,...,2.640007,0.834361,0.923517,1.380733,1.449543,1.461813,1.635304,2.137842,1.298113,0.765273
2,BRCA_4,2.113571,2.470078,2.850043,1.789378,3.835523,1.869746,2.438283,1.083974,2.626652,...,1.566984,2.132028,0.607614,1.475995,2.652544,2.488973,2.027855,2.627991,1.737446,1.763204
3,BRCA_5,0.91542,0.676237,1.014768,0.710795,1.198746,1.075143,2.054559,-0.097414,0.871569,...,1.805219,2.159534,0.946661,0.716055,1.666294,2.048029,1.619529,0.666586,1.402731,0.519809
4,BRCA_6,0.217013,0.529762,1.071146,0.304052,1.152537,0.62419,1.396391,-0.068554,0.227504,...,-0.869649,1.08216,0.38228,0.766072,1.85362,1.427197,1.497779,0.099494,0.580887,-0.013765


### Raw DNAme data

In [40]:
# Full methylation table as downloaded.
methylation_data_full = pd.read_csv(DATADIR + "TCGA-BRCA.methylation450.tsv", sep = '\t')

# Keep only the columns whose name also appears in the ATAC-seq table.
shared_column_mask = methylation_data_full.columns.isin(atacseq_data.columns)
methylation_data = methylation_data_full.loc[:, shared_column_mask]

# Persist the subset, then read it back; the old row index returns as the
# 'Unnamed: 0' column. Rows containing any missing value are discarded.
methylation_data.to_csv(DATADIR + "TCGA-BRCA.methylation.tsv", sep="\t")
methylation_data = pd.read_csv(DATADIR + "TCGA-BRCA.methylation.tsv", sep='\t').dropna()

# Re-attach the probe ids (aligned on the row labels), use them as the index
# while keeping the column itself, and drop the round-trip artefact column.
methylation_data['Composite Element REF'] = methylation_data_full['Composite Element REF']
probe_ids = methylation_data['Composite Element REF']
methylation_data = methylation_data.set_index(probe_ids).drop('Unnamed: 0', axis=1)
methylation_data.head()

Unnamed: 0_level_0,TCGA-4H-AAAK-01A,TCGA-AR-A0U4-01A,TCGA-A2-A0YH-01A,TCGA-AO-A0JM-01A,TCGA-BH-A0DP-01A,TCGA-A2-A0SW-01A,TCGA-AO-A124-01A,TCGA-AO-A0JB-01A,TCGA-BH-A0E0-01A,TCGA-AO-A03L-01A,...,TCGA-A2-A0SX-01A,TCGA-BH-A0HP-01A,TCGA-A2-A0YD-01A,TCGA-S3-AA0Z-01A,TCGA-3C-AALJ-01A,TCGA-A7-A0D9-01A,TCGA-BH-A0DV-01A,TCGA-A7-A13F-01A,TCGA-BH-A1EV-01A,Composite Element REF
Composite Element REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg00000029,0.205067,0.230003,0.153617,0.134256,0.231998,0.172861,0.10819,0.104946,0.273929,0.246532,...,0.340357,0.165222,0.209104,0.188283,0.146742,0.125216,0.213518,0.116363,0.143252,cg00000029
cg00000165,0.444188,0.605254,0.248725,0.648626,0.517955,0.496663,0.497221,0.188923,0.451665,0.394817,...,0.642726,0.448543,0.266291,0.613981,0.700575,0.247129,0.345927,0.428334,0.596631,cg00000165
cg00000236,0.871755,0.885958,0.872617,0.925808,0.901752,0.927091,0.909144,0.922237,0.907376,0.918064,...,0.884333,0.925454,0.926678,0.905192,0.765546,0.910713,0.881799,0.918171,0.921108,cg00000236
cg00000289,0.545964,0.515607,0.568605,0.777486,0.696329,0.667617,0.557751,0.590143,0.579652,0.759224,...,0.666332,0.767627,0.702365,0.423248,0.466364,0.675472,0.64833,0.61901,0.711667,cg00000289
cg00000292,0.796488,0.459399,0.58042,0.352668,0.811162,0.475505,0.420457,0.564806,0.575604,0.78527,...,0.436685,0.574433,0.707471,0.837387,0.850628,0.908439,0.662061,0.840011,0.343049,cg00000292


### Gene Mapping

In [41]:
# Peak annotation table for the ATAC-seq peaks (id, chrom, chromStart, chromEnd, gene, ...).
gene_mapping = pd.read_csv(DATADIR + "brca-brca_peak.probeMap", sep='\t')
# only keep autosomes (non sex chromosomes)
gene_mapping = gene_mapping[~gene_mapping['chrom'].isin(["chrX", "chrY"])]
# Order the peaks by chromosome and start position and drop exact duplicate rows.
gene_mapping = gene_mapping.sort_values(['chrom', 'chromStart']).drop_duplicates()

# Shorten every gene annotation to its first six characters followed by '....';
# entries that are not strings (e.g. missing values) become the '.....' placeholder.
# A plain list is assigned positionally, so each shortened name stays on its own
# row even though the frame's index is no longer 0..n-1 after filtering/sorting
# (a default-indexed Series would be aligned by label and scramble the names).
gene_mapping['gene'] = [
    gene[:6] + '....' if isinstance(gene, str) else '.....'
    for gene in gene_mapping['gene']
]
gene_mapping[['id','chrom','chromStart','chromEnd','gene']].head()

Unnamed: 0,id,chrom,chromStart,chromEnd,gene
0,BRCA_2,chr1,17233,17733,DDX11L....
1,BRCA_3,chr1,180633,181133,DDX11L....
2,BRCA_4,chr1,181206,181706,DDX11L....
3,BRCA_5,chr1,183556,184056,DDX11L....
4,BRCA_6,chr1,184246,184746,DDX11L....


In [42]:
# Probe annotation for the methylation array, built in one chain:
#   1. only keep autosomes (non sex chromosomes)
#   2. order by probe id and drop exact duplicate rows
#   3. discard probes whose gene field is the "." placeholder
gene_mapping_methylation = (
    pd.read_csv(DATADIR + "illuminaMethyl450_hg38_GDC", sep='\t')
    .loc[lambda probes: ~probes['chrom'].isin(["chrX", "chrY"])]
    .sort_values(['#id'])
    .drop_duplicates()
    .loc[lambda probes: probes['gene'] != "."]
)
gene_mapping_methylation[['#id','chrom','chromStart','chromEnd','gene']].head()

Unnamed: 0,#id,chrom,chromStart,chromEnd,gene
0,cg00000029,chr16,53434200,53434201,RBL2
2,cg00000236,chr8,42405776,42405777,VDAC3
3,cg00000289,chr14,68874422,68874423,ACTN1
4,cg00000292,chr16,28878779,28878780,ATP2A1
5,cg00000321,chr8,41310283,41310284,SFRP1


In [43]:
# Attach the probe annotation to the methylation values (left join on probe id),
# order by chromosome and start position, and drop rows with any missing value
# (this removes probes that have no annotation row).
probe_annotation = gene_mapping_methylation.set_index('#id')
methylation = (
    methylation_data
    .set_index('Composite Element REF')
    .join(probe_annotation)
    .sort_values(['chrom', 'chromStart'])
    .dropna()
)
methylation

Unnamed: 0_level_0,TCGA-4H-AAAK-01A,TCGA-AR-A0U4-01A,TCGA-A2-A0YH-01A,TCGA-AO-A0JM-01A,TCGA-BH-A0DP-01A,TCGA-A2-A0SW-01A,TCGA-AO-A124-01A,TCGA-AO-A0JB-01A,TCGA-BH-A0E0-01A,TCGA-AO-A03L-01A,...,TCGA-3C-AALJ-01A,TCGA-A7-A0D9-01A,TCGA-BH-A0DV-01A,TCGA-A7-A13F-01A,TCGA-BH-A1EV-01A,gene,chrom,chromStart,chromEnd,strand
Composite Element REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg13869341,0.863537,0.908140,0.897792,0.941725,0.921435,0.914881,0.916600,0.903855,0.761811,0.935323,...,0.945299,0.971138,0.848365,0.886880,0.887436,WASH7P,chr1,15865.0,15866.0,.
cg14008030,0.713760,0.622443,0.548639,0.651462,0.754530,0.580554,0.558209,0.579566,0.722342,0.670157,...,0.663823,0.630505,0.642672,0.803315,0.631507,"MIR6859-3,WASH7P",chr1,18827.0,18828.0,.
cg12045430,0.025781,0.054903,0.029967,0.051277,0.037666,0.044323,0.018956,0.022220,0.022503,0.080894,...,0.032259,0.028483,0.046798,0.055561,0.072155,"MIR1302-9,RP11-34P13.3,WASH7P",chr1,29407.0,29408.0,.
cg20826792,0.119533,0.154726,0.145204,0.130768,0.169766,0.182809,0.157739,0.120229,0.168281,0.224178,...,0.051799,0.108951,0.214588,0.175870,0.222474,"MIR1302-9,RP11-34P13.3,WASH7P",chr1,29425.0,29426.0,.
cg00381604,0.029008,0.018851,0.016402,0.014158,0.019387,0.018051,0.017559,0.010865,0.014076,0.019340,...,0.032256,0.015090,0.020067,0.016328,0.012783,"MIR1302-9,RP11-34P13.3,WASH7P",chr1,29435.0,29436.0,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg14022794,0.167949,0.331636,0.163405,0.247585,0.289788,0.123148,0.118860,0.119279,0.334406,0.347570,...,0.508298,0.364872,0.197908,0.147242,0.298635,CACNA1B,chr9,138123005.0,138123006.0,.
cg16372751,0.301460,0.359007,0.062328,0.136824,0.177966,0.105285,0.042910,0.179836,0.065660,0.160989,...,0.247762,0.062135,0.123754,0.309982,0.158418,TUBBP5,chr9,138150057.0,138150058.0,.
cg14008164,0.820442,0.957454,0.961767,0.978765,0.971995,0.959770,0.947595,0.977511,0.976941,0.980471,...,0.904957,0.964830,0.964154,0.962915,0.974684,FAM157B,chr9,138214752.0,138214753.0,.
cg23867978,0.599583,0.925666,0.701603,0.753172,0.789591,0.732615,0.868101,0.621468,0.900059,0.844528,...,0.910268,0.870470,0.866561,0.508539,0.922752,FAM157B,chr9,138218977.0,138218978.0,.


In [44]:
# Attach the peak annotation to the ATAC-seq values (left join on peak id).
peak_annotation = gene_mapping.set_index('id')
atacseq = atacseq_data.set_index('sample').join(peak_annotation)

# Keep only the columns that also exist in the annotated methylation frame,
# then order by chromosome and start position and drop incomplete rows.
shared_column_mask = atacseq.columns.isin(methylation.columns)
atacseq = (
    atacseq
    .loc[:, shared_column_mask]
    .sort_values(['chrom', 'chromStart'])
    .dropna()
)
atacseq

Unnamed: 0_level_0,TCGA-A2-A0YC-01A,TCGA-AR-A0U0-01A,TCGA-AO-A03L-01A,TCGA-BH-A0E0-01A,TCGA-A2-A4RX-01A,TCGA-BH-A0BA-01A,TCGA-BH-A0HP-01A,TCGA-BH-A0B5-01A,TCGA-AO-A03N-01B,TCGA-BH-A0DP-01A,...,TCGA-A2-A0YJ-01A,TCGA-AO-A124-01A,TCGA-A2-A0SW-01A,TCGA-4H-AAAK-01A,TCGA-A2-A0YL-01A,gene,chrom,chromStart,chromEnd,strand
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRCA_2,0.327265,0.214903,0.786471,0.502152,0.739069,0.174714,-0.802113,0.769375,1.033614,0.153050,...,0.320434,0.794401,0.524688,0.324844,0.015390,DDX11L....,chr1,17233.0,17733.0,+
BRCA_3,1.149027,1.772169,0.976686,1.317993,1.042483,0.483563,-0.207317,2.016297,1.735418,0.215667,...,1.380733,1.461813,1.635304,2.137842,0.765273,DDX11L....,chr1,180633.0,181133.0,+
BRCA_4,2.113571,1.869746,2.438283,2.539754,1.443043,1.179219,1.146447,2.556850,4.005177,1.507031,...,1.475995,2.488973,2.027855,2.627991,1.763204,DDX11L....,chr1,181206.0,181706.0,+
BRCA_5,0.915420,1.075143,2.054559,2.652942,0.736985,-0.889863,0.426391,0.419075,1.238526,-0.109221,...,0.716055,2.048029,1.619529,0.666586,0.519809,DDX11L....,chr1,183556.0,184056.0,+
BRCA_6,0.217013,0.624190,1.396391,2.354217,0.211199,-0.889178,0.012028,0.513864,1.136212,0.490342,...,0.766072,1.427197,1.497779,0.099494,-0.013765,DDX11L....,chr1,184246.0,184746.0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRCA_115606,-0.492523,1.277071,0.758905,1.332081,1.201992,-0.199605,0.363374,0.696908,-0.458679,0.598379,...,0.821829,2.489485,0.132625,0.052925,0.644234,RP11-5....,chr9,138022278.0,138022778.0,+
BRCA_115607,-0.421970,0.802838,0.555127,1.061624,1.425078,0.384108,-0.238475,0.744250,-0.578816,-0.076925,...,0.795207,2.252733,0.155094,0.000524,-0.003475,RP11-5....,chr9,138022865.0,138023365.0,+
BRCA_115608,0.140645,1.852739,1.694982,2.202415,0.198767,0.837935,0.547877,0.914698,0.316224,0.066433,...,1.307433,-0.309568,0.326216,0.151963,1.112440,RP11-5....,chr9,138092027.0,138092527.0,+
BRCA_115609,1.735064,1.741332,1.965833,2.391806,1.956009,0.384108,1.016706,1.546509,2.448380,1.436926,...,1.127994,2.633120,1.158043,1.014306,2.352523,RP11-5....,chr9,138150044.0,138150544.0,+


In [45]:
# Index each ATAC-seq peak by its [chromStart, chromEnd] interval (both ends
# inclusive), then order the columns alphabetically.
peak_intervals = pd.IntervalIndex.from_arrays(
    atacseq['chromStart'],
    atacseq['chromEnd'],
    closed='both',
    name='chromRange',
)
atacseq.index = peak_intervals
atacseq = atacseq.sort_index(axis=1)

# Give the methylation frame a placeholder 'chromRange' column, index it by the
# probe start position, and order the columns alphabetically as well.
methylation['chromRange'] = ","
methylation = methylation.set_index('chromStart').sort_index(axis=1)

## Processing

In [11]:
# For every methylation probe, find the ATAC-seq peak interval(s) on the same
# chromosome that contain the probe's start position.
#
# Result: `temp_chromrange`, one entry per row of `methylation` (same order):
#   * a list of "start, end" strings for the matching peak(s), or
#   * the placeholder "," when no peak on that chromosome contains the probe.

# Pulls the "start, end" text of each interval out of the IntervalIndex repr.
# Raw string so the backslashes reach the regex engine untouched; the dots are
# escaped so they only match a literal decimal point.
interval_pattern = re.compile(r"\[(\d+\.0, \d+\.0)\]")

chromrange = atacseq.index
# Loop-invariant: chromosome label of every peak, in the same order as `chromrange`.
peak_chroms = atacseq['chrom'].to_numpy()

temp_chromrange = []
# `methylation` is indexed by probe start position, so zipping the index with
# the `chrom` column yields the same (position, chromosome) pairs as iterrows
# without building a full Series for every row.
probe_positions = zip(methylation.index, methylation['chrom'])
for position, probe_chrom in tqdm(probe_positions, total=len(methylation)):
    # A peak matches only if it contains the position AND is on the same chromosome.
    on_matching_peak = chromrange.contains(position) & (peak_chroms == probe_chrom)

    if on_matching_peak.any():
        matched = interval_pattern.findall(str(chromrange[on_matching_peak]))
        temp_chromrange.append(matched if matched else ",")
    else:
        temp_chromrange.append(",")

326675it [59:00, 92.26it/s]


In [12]:
# Store the per-probe peak matches and round-trip through CSV; on re-read the
# match lists come back as plain strings such as "['17233.0, 17733.0']".
methylation['chromRange'] = temp_chromrange
methylation.to_csv(DATADIR+'methylation_matched.csv')
methylation_data = pd.read_csv(DATADIR + "methylation_matched.csv")

# Drop probes that matched no peak (placeholder ","). `.copy()` makes the result
# an independent frame so the column assignment below does not write into a slice.
methylation_data = methylation_data[methylation_data['chromRange'] != ","].copy()

# Strip the quote characters left over from the stringified list. The literal
# replacement (regex=False) behaves the same on every pandas version, whereas the
# previous regex-style pattern is treated as literal text by pandas >= 2.0 and
# would silently remove nothing.
methylation_data['chromRange'] = methylation_data['chromRange'].str.replace("'", "", regex=False)

# Turn the interval index of the ATAC-seq frame into a regular 'chromRange' column.
atacseq = atacseq.reset_index()

(125826, 49)


  parsed = methylation_data['chromRange'].str.replace(r'(\')','')


                            TCGA-3C-AALJ-01A  TCGA-4H-AAAK-01A  \
chromRange                                                       
[17233.0, 17733.0]                  1.031215          0.324844   
[180633.0, 181133.0]                1.666548          2.137842   
[181206.0, 181706.0]                1.839497          2.627991   
[183556.0, 184056.0]                1.374965          0.666586   
[184246.0, 184746.0]                0.768645          0.099494   
...                                      ...               ...   
[138022278.0, 138022778.0]          0.626510          0.052925   
[138022865.0, 138023365.0]          0.002770          0.000524   
[138092027.0, 138092527.0]          1.374965          0.151963   
[138150044.0, 138150544.0]          2.469279          1.014306   
[138217351.0, 138217851.0]          0.167330         -1.315842   

                            TCGA-A2-A0SV-01A  TCGA-A2-A0SW-01A  \
chromRange                                                       
[17233.0,

In [15]:
# Cast the 'chromRange' column of both frames to NumPy byte strings ("|S") so
# the two columns hold the same kind of value.
for frame in (atacseq, methylation_data):
    frame['chromRange'] = frame['chromRange'].astype("|S")

Unnamed: 0.1,chromStart,TCGA-3C-AALJ-01A,TCGA-4H-AAAK-01A,TCGA-A2-A0SV-01A,TCGA-A2-A0SW-01A,TCGA-A2-A0SX-01A,TCGA-A2-A0T4-01A,TCGA-A2-A0T5-01A,TCGA-A2-A0T6-01A,TCGA-A2-A0T7-01A,...,TCGA-BH-A0HP-01A,TCGA-BH-A1EV-01A,TCGA-C8-A8HR-01A,TCGA-S3-AA0Z-01A,Unnamed: 0,chrom,chromEnd,chromRange,gene,strand
16,778541.0,0.045462,0.080959,0.070239,0.071469,0.066771,0.063428,0.051999,0.053982,0.052196,...,0.059373,0.059980,0.048112,0.039880,191274,chr1,778542.0,"b'[778473.0, 778973.0]'","RP11-206L10.9,RP11-206L10.2",.
17,778622.0,0.131050,0.140814,0.059830,0.116826,0.081094,0.062668,0.069935,0.078603,0.067194,...,0.079771,0.128468,0.096039,0.138826,214957,chr1,778623.0,"b'[778473.0, 778973.0]'","RP11-206L10.9,RP11-206L10.2",.
18,778641.0,0.016846,0.018983,0.014452,0.011500,0.024582,0.009875,0.017412,0.011717,0.048974,...,0.017242,0.010446,0.020900,0.014157,332347,chr1,778642.0,"b'[778473.0, 778973.0]'","RP11-206L10.9,RP11-206L10.2",.
19,778797.0,0.019049,0.020676,0.010318,0.013000,0.013940,0.010253,0.012000,0.011795,0.011695,...,0.011017,0.020778,0.022842,0.019229,21355,chr1,778798.0,"b'[778473.0, 778973.0]'","RP11-206L10.9,RP11-206L10.2",.
25,827502.0,0.021141,0.022321,0.013144,0.012484,0.012274,0.012204,0.014860,0.014767,0.013300,...,0.013751,0.016432,0.014477,0.016651,32067,chr1,827503.0,"b'[827303.0, 827803.0]'","LINC00115,LINC01128",.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326575,137619564.0,0.038326,0.027134,0.026894,0.026730,0.034716,0.032503,0.022229,0.026697,0.046435,...,0.027199,0.027169,0.024163,0.028131,379570,chr9,137619565.0,"b'[137619338.0, 137619838.0]'","EHMT1,ARRDC1-AS1",.
326655,138022528.0,0.179657,0.024322,0.033313,0.032337,0.028546,0.051576,0.439415,0.033883,0.040600,...,0.028477,0.018727,0.044723,0.018014,253742,chr9,138022529.0,"b'[138022278.0, 138022778.0]'",CACNA1B,.
326656,138022961.0,0.451888,0.453381,0.510625,0.599515,0.112738,0.475246,0.630365,0.435613,0.189542,...,0.331645,0.036379,0.169102,0.076748,221622,chr9,138022962.0,"b'[138022865.0, 138023365.0]'",CACNA1B,.
326657,138022979.0,0.592524,0.321032,0.692532,0.681245,0.118021,0.616109,0.668913,0.546414,0.100680,...,0.605807,0.108185,0.137698,0.040238,251389,chr9,138022980.0,"b'[138022865.0, 138023365.0]'",CACNA1B,.


In [16]:
# Inner-join peaks and probes on (chromRange, chrom). Columns present in both
# frames get the "_atac" / "_meth" suffix, and the result is sorted by the join
# keys. All other merge options are left at their defaults.
data = atacseq.merge(
    methylation_data,
    how="inner",
    on=["chromRange","chrom"],
    sort=True,
    suffixes=("_atac", "_meth"),
)

# Order the columns alphabetically, persist the combined table, and preview it.
ordered_columns = sorted(data.columns)
data = data.reindex(ordered_columns, axis=1)
data.to_csv(DATADIR+'data.csv')
data.head()

Unnamed: 0,TCGA-3C-AALJ-01A_atac,TCGA-3C-AALJ-01A_meth,TCGA-4H-AAAK-01A_atac,TCGA-4H-AAAK-01A_meth,TCGA-A2-A0SV-01A_atac,TCGA-A2-A0SV-01A_meth,TCGA-A2-A0SW-01A_atac,TCGA-A2-A0SW-01A_meth,TCGA-A2-A0SX-01A_atac,TCGA-A2-A0SX-01A_meth,...,chrom,chromEnd_atac,chromEnd_meth,chromRange,chromStart_atac,chromStart_meth,gene_atac,gene_meth,strand_atac,strand_meth
0,3.285399,0.526853,4.316849,0.26635,4.29838,0.289444,3.620005,0.524326,2.792367,0.614292,...,chr3,100003329.0,100003007.0,"b'[100002829.0, 100003329.0]'",100002829.0,100003006.0,"RP11-779P15.2,RP11-201E8.1,COL8A1,AC055723.1,R...","FILIP1L,CMSS1",+,.
1,-0.69566,0.382175,1.36385,0.222729,0.479393,0.25643,2.328401,0.3714,-0.004197,0.442423,...,chr10,100009142.0,100008824.0,"b'[100008642.0, 100009142.0]'",100008642.0,100008823.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
2,-0.69566,0.216018,1.36385,0.148321,0.479393,0.192806,2.328401,0.419475,-0.004197,0.454219,...,chr10,100009142.0,100008945.0,"b'[100008642.0, 100009142.0]'",100008642.0,100008944.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
3,3.20138,0.051344,3.247254,0.030604,2.793906,0.036163,3.178637,0.027852,2.430728,0.023746,...,chr10,100010228.0,100010215.0,"b'[100009728.0, 100010228.0]'",100009728.0,100010214.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
4,4.02319,0.034686,4.351152,0.052043,3.766912,0.025707,4.584341,0.023516,3.607636,0.024295,...,chr7,100015805.0,100015483.0,"b'[100015305.0, 100015805.0]'",100015305.0,100015482.0,"FAM200A,ZNF655,GS1-259H13.11,AC005020.1,GS1-25...",ZKSCAN1,+,.


### Reloading the saved DNAme and ATAC-seq combined data

In [18]:
# Read the combined table back from disk; the index written by `to_csv`
# returns as the 'Unnamed: 0' column.
combined_path = DATADIR +'data.csv'
data = pd.read_csv(combined_path)
data.head()

Unnamed: 0.1,Unnamed: 0,TCGA-3C-AALJ-01A_atac,TCGA-3C-AALJ-01A_meth,TCGA-4H-AAAK-01A_atac,TCGA-4H-AAAK-01A_meth,TCGA-A2-A0SV-01A_atac,TCGA-A2-A0SV-01A_meth,TCGA-A2-A0SW-01A_atac,TCGA-A2-A0SW-01A_meth,TCGA-A2-A0SX-01A_atac,...,chrom,chromEnd_atac,chromEnd_meth,chromRange,chromStart_atac,chromStart_meth,gene_atac,gene_meth,strand_atac,strand_meth
0,0,3.285399,0.526853,4.316849,0.26635,4.29838,0.289444,3.620005,0.524326,2.792367,...,chr3,100003329.0,100003007.0,"b'[100002829.0, 100003329.0]'",100002829.0,100003006.0,"RP11-779P15.2,RP11-201E8.1,COL8A1,AC055723.1,R...","FILIP1L,CMSS1",+,.
1,1,-0.69566,0.382175,1.36385,0.222729,0.479393,0.25643,2.328401,0.3714,-0.004197,...,chr10,100009142.0,100008824.0,"b'[100008642.0, 100009142.0]'",100008642.0,100008823.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
2,2,-0.69566,0.216018,1.36385,0.148321,0.479393,0.192806,2.328401,0.419475,-0.004197,...,chr10,100009142.0,100008945.0,"b'[100008642.0, 100009142.0]'",100008642.0,100008944.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
3,3,3.20138,0.051344,3.247254,0.030604,2.793906,0.036163,3.178637,0.027852,2.430728,...,chr10,100010228.0,100010215.0,"b'[100009728.0, 100010228.0]'",100009728.0,100010214.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",DNMBP,+,.
4,4,4.02319,0.034686,4.351152,0.052043,3.766912,0.025707,4.584341,0.023516,3.607636,...,chr7,100015805.0,100015483.0,"b'[100015305.0, 100015805.0]'",100015305.0,100015482.0,"FAM200A,ZNF655,GS1-259H13.11,AC005020.1,GS1-25...",ZKSCAN1,+,.


## Saving final DNAme and ATAC-seq files

In [21]:
def _select_modality(frame, suffix, key_columns=('Unnamed: 0', 'chrom', 'chromRange')):
    """Return the key columns plus every column ending in `suffix`, suffix removed.

    Parameters
    ----------
    frame : pd.DataFrame
        Combined table whose modality-specific columns end in `suffix`.
    suffix : str
        Column-name suffix identifying the modality (e.g. '_meth').
    key_columns : sequence of str
        Columns shared by both modalities that are copied unchanged.

    Returns
    -------
    pd.DataFrame
        `key_columns` followed by the matching columns in their original order.
    """
    # Match on the exact suffix (not a substring) and strip only that suffix, so
    # column names containing 'meth'/'atac' or an underscore elsewhere are
    # neither misclassified nor truncated.
    modality_columns = [name for name in frame.columns if name.endswith(suffix)]
    values = frame[modality_columns].rename(columns=lambda name: name[:-len(suffix)])
    # A single concat avoids fragmenting the frame with column-by-column inserts.
    return pd.concat([frame[list(key_columns)], values], axis=1)


atac_final = _select_modality(data, '_atac')
meth_final = _select_modality(data, '_meth')

In [22]:
# Preview the first five rows of the methylation table.
meth_final.head(5)

Unnamed: 0.1,Unnamed: 0,chrom,chromRange,TCGA-3C-AALJ-01A,TCGA-4H-AAAK-01A,TCGA-A2-A0SV-01A,TCGA-A2-A0SW-01A,TCGA-A2-A0SX-01A,TCGA-A2-A0T4-01A,TCGA-A2-A0T5-01A,...,TCGA-BH-A0DV-01A,TCGA-BH-A0E0-01A,TCGA-BH-A0HP-01A,TCGA-BH-A1EV-01A,TCGA-C8-A8HR-01A,TCGA-S3-AA0Z-01A,chromEnd,chromStart,gene,strand
0,0,chr3,"b'[100002829.0, 100003329.0]'",0.526853,0.26635,0.289444,0.524326,0.614292,0.433036,0.290172,...,0.529944,0.429105,0.428201,0.09959,0.538694,0.215498,100003007.0,100003006.0,"FILIP1L,CMSS1",.
1,1,chr10,"b'[100008642.0, 100009142.0]'",0.382175,0.222729,0.25643,0.3714,0.442423,0.262707,0.112821,...,0.255715,0.59713,0.177014,0.092278,0.387801,0.811644,100008824.0,100008823.0,DNMBP,.
2,2,chr10,"b'[100008642.0, 100009142.0]'",0.216018,0.148321,0.192806,0.419475,0.454219,0.202633,0.157218,...,0.179274,0.352504,0.118339,0.085383,0.386564,0.643237,100008945.0,100008944.0,DNMBP,.
3,3,chr10,"b'[100009728.0, 100010228.0]'",0.051344,0.030604,0.036163,0.027852,0.023746,0.025524,0.026518,...,0.02714,0.02219,0.019805,0.027966,0.024031,0.030325,100010215.0,100010214.0,DNMBP,.
4,4,chr7,"b'[100015305.0, 100015805.0]'",0.034686,0.052043,0.025707,0.023516,0.024295,0.02474,0.029157,...,0.025459,0.021501,0.029713,0.02564,0.02466,0.027669,100015483.0,100015482.0,ZKSCAN1,.


In [23]:
# Preview the first five rows of the ATAC-seq table.
atac_final.head(5)

Unnamed: 0.1,Unnamed: 0,chrom,chromRange,TCGA-3C-AALJ-01A,TCGA-4H-AAAK-01A,TCGA-A2-A0SV-01A,TCGA-A2-A0SW-01A,TCGA-A2-A0SX-01A,TCGA-A2-A0T4-01A,TCGA-A2-A0T5-01A,...,TCGA-BH-A0DV-01A,TCGA-BH-A0E0-01A,TCGA-BH-A0HP-01A,TCGA-BH-A1EV-01A,TCGA-C8-A8HR-01A,TCGA-S3-AA0Z-01A,chromEnd,chromStart,gene,strand
0,0,chr3,"b'[100002829.0, 100003329.0]'",3.285399,4.316849,4.29838,3.620005,2.792367,3.75741,4.386887,...,4.428868,2.923903,3.180884,3.958643,4.293276,3.200028,100003329.0,100002829.0,"RP11-779P15.2,RP11-201E8.1,COL8A1,AC055723.1,R...",+
1,1,chr10,"b'[100008642.0, 100009142.0]'",-0.69566,1.36385,0.479393,2.328401,-0.004197,0.462299,1.194493,...,1.663403,0.864947,1.659997,1.32564,0.895851,0.302204,100009142.0,100008642.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",+
2,2,chr10,"b'[100008642.0, 100009142.0]'",-0.69566,1.36385,0.479393,2.328401,-0.004197,0.462299,1.194493,...,1.663403,0.864947,1.659997,1.32564,0.895851,0.302204,100009142.0,100008642.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",+
3,3,chr10,"b'[100009728.0, 100010228.0]'",3.20138,3.247254,2.793906,3.178637,2.430728,3.479366,2.711596,...,3.513757,3.074589,3.35452,2.604256,3.589404,2.583158,100010228.0,100009728.0,"LINC01475,RP11-129J12.1,NKX2-3,SLC25A28,RP11-8...",+
4,4,chr7,"b'[100015305.0, 100015805.0]'",4.02319,4.351152,3.766912,4.584341,3.607636,4.203985,3.896241,...,4.115693,4.270799,3.94242,3.080959,3.886262,2.673944,100015805.0,100015305.0,"FAM200A,ZNF655,GS1-259H13.11,AC005020.1,GS1-25...",+


In [24]:
# Write the two per-modality tables next to the input data.
final_outputs = (
    (meth_final, 'methylation_processed.csv'),
    (atac_final, 'atacseq_processed.csv'),
)
for final_frame, filename in final_outputs:
    final_frame.to_csv(DATADIR + filename)