# Range and Chromosome grouping

In [12]:
import pandas as pd
import numpy
import json
# Directory (relative to the working directory) that holds the input CSVs
# and later receives the JSON outputs.
DATADIR = '../data/'

# Read the two input tables: methylation into `dname`, ATAC-seq into `atac`.
methylation_csv = DATADIR + 'methylation_processed.csv'
atacseq_csv = DATADIR + 'atacseq_processed.csv'
dname = pd.read_csv(methylation_csv)
atac = pd.read_csv(atacseq_csv)

In [13]:
# Remove the two 'Unnamed' columns from both tables
# (presumably row indexes saved by earlier to_csv calls; confirm upstream).
unnamed_columns = ['Unnamed: 0', 'Unnamed: 0.1']
dname = dname.drop(columns=unnamed_columns)
atac = atac.drop(columns=unnamed_columns)

In [14]:
def return_columns(df):
    """Return the column labels of *df* left after removing the annotation columns.

    The removed columns are 'chrom', 'chromRange', 'chromEnd', 'chromStart',
    'gene' and 'strand'; a KeyError is raised by pandas if any is missing.
    *df* itself is not modified.
    """
    annotation_columns = ['chrom', 'chromRange', 'chromEnd', 'chromStart', 'gene', 'strand']
    remaining = df.drop(columns=annotation_columns)
    return remaining.columns

In [15]:
# Value columns are taken from the methylation table and reused for the ATAC
# table (assumes both tables share those column names; confirm against inputs).
data_columns = list(return_columns(dname))

# Pack each row's value columns into a single array stored in one new column.
for frame, packed_column in ((dname, 'methylation values'), (atac, 'atac values')):
    frame[packed_column] = list(frame[data_columns].values)

In [16]:
# The individual value columns are now packed into one column, so drop them
# together with 'strand' and 'gene'.
columns_to_discard = data_columns + ['strand', 'gene']
dname_proc = dname.drop(columns=columns_to_discard)
atac_proc = atac.drop(columns=columns_to_discard)

In [17]:
# Order both tables by start coordinate; the grouping functions below compare
# each row with the row that follows it.
dname_proc = dname_proc.sort_values('chromStart')
atac_proc = atac_proc.sort_values('chromStart')

In [18]:
# Split each table into one DataFrame per distinct 'chromRange' value.
dnamechunks = [chunk for _, chunk in dname_proc.groupby('chromRange')]
atacseqchunks = [chunk for _, chunk in atac_proc.groupby('chromRange')]


## Range grouped processing

In [23]:
def group_by_range(chunks,data_name):
    """Flatten each per-range chunk into one record of points and gaps.

    Parameters
    ----------
    chunks : iterable of DataFrame
        Each frame must provide 'chromRange', 'chromStart', 'chromEnd' and a
        ``data_name + ' values'`` column. The first row's 'chromRange' label
        is parsed for the record's bounds.
    data_name : str
        Prefix of the values column, e.g. 'methylation' or 'atac'.

    Returns
    -------
    dict
        Maps the chunk's position in *chunks* to a record holding
        'chromStart'/'chromEnd' (the parsed range bounds), 'point<k>' (the
        values of row k as a list) and 'dist<k>_<k+1>' (next row's chromStart
        minus row k's chromEnd). ``k`` is the row's position inside the
        chunk. A row whose successor has the same chromStart is skipped, so
        its 'point<k>' and 'dist<k>_<k+1>' keys are absent.
    """
    values_column = data_name + ' values'
    agg = {}
    for agg_index, chunk in enumerate(chunks):
        # Bounds come from the textual range label of the first row.
        label_parts = chunk.iloc[0]['chromRange'].split(',')
        record = {
            'chromStart': int(float(label_parts[0][3:])),
            'chromEnd': int(float(label_parts[1][1:-3])),
        }

        chunk = chunk.reset_index(drop=True)
        final_position = len(chunk) - 1
        for position, row in chunk.iterrows():
            point_key = 'point' + str(position)

            if position == final_position:
                # The last row has no successor, so it gets a point but no gap.
                record[point_key] = list(row[values_column])
                break

            successor = chunk.iloc[position + 1]
            gap = successor['chromStart'] - row['chromEnd']

            # Rows sharing a start coordinate: only the later one is kept.
            if successor['chromStart'] == row['chromStart']:
                continue

            record[point_key] = list(row[values_column])
            record['dist' + str(position) + '_' + str(position + 1)] = gap

        agg[agg_index] = record
    return agg
        
# Build the per-range records for both assays.
agg_dname = group_by_range(chunks=dnamechunks, data_name='methylation')
agg_atac = group_by_range(chunks=atacseqchunks, data_name='atac')

In [27]:
# Split each table into one DataFrame per distinct 'chrom' value.
dname_chroms = [chunk for _, chunk in dname_proc.groupby('chrom')]
atacseq_chroms = [chunk for _, chunk in atac_proc.groupby('chrom')]

## Chromosome grouped processing

In [28]:
def _chrom_range_bounds(chrom_range):
    """Parse a 'chromRange' label into integer ``(start, end)`` bounds.

    The slices assume the label layout used throughout this file: three
    characters precede the first number, and the second number is wrapped by
    one leading and three trailing characters.
    """
    parts = chrom_range.split(',')
    return int(float(parts[0][3:])), int(float(parts[1][1:-3]))


def group_by_chromosome(chunks,data_name):
    """Group rows by chromosome and then by range into nested records.

    Parameters
    ----------
    chunks : iterable of DataFrame
        One frame per chromosome. Each must provide 'chrom', 'chromRange',
        'chromStart', 'chromEnd' and a ``data_name + ' values'`` column.
    data_name : str
        Prefix of the values column, e.g. 'methylation' or 'atac'.

    Returns
    -------
    dict
        ``{chrom: {"<start>_<end>": {'chromStart', 'chromEnd', 'data'}}}``.
        'data' holds 'point<k>' (a row's values as a list) and
        'dist<k>_<k+1>' (next row's chromStart minus this row's chromEnd).
        ``k`` restarts at 0 whenever a range is seen for the first time. A
        row whose successor has the same chromStart is skipped. The last row
        of a chunk gets a point but no distance.

    Notes
    -----
    The last row of each chunk is filed under its *own* range. Previously the
    range key computed for an earlier row was reused, which misfiled the row
    when it opened a new range and raised an error when no earlier row of the
    chunk had been processed (for example a single-row chunk).
    """
    values_column = data_name + ' values'
    agg = {}
    for chunk in chunks:
        chrom = chunk.iloc[0]['chrom']
        ranges = agg.setdefault(chrom, {})
        i = 0
        chunk = chunk.reset_index(drop = True)
        last_index = len(chunk) - 1
        for index, row in chunk.iterrows():
            is_last = index == last_index
            if not is_last:
                nextrow = chunk.iloc[index + 1]
                diff = nextrow['chromStart'] - row['chromEnd']
                # Rows sharing a start coordinate: only the later one is kept.
                if nextrow['chromStart'] == row['chromStart']:
                    continue

            # Always derive the key from the row being stored.
            start, end = _chrom_range_bounds(row['chromRange'])
            rangeKey = str(start) + '_' + str(end)

            if rangeKey not in ranges:
                # First row of a new range: restart the point numbering.
                i = 0
                ranges[rangeKey] = {'chromStart': start, 'chromEnd': end, 'data': {}}

            data = ranges[rangeKey]['data']
            data['point' + str(i)] = list(row[values_column])
            if not is_last:
                data['dist' + str(i) + '_' + str(i + 1)] = diff
            i += 1
    return agg

# Build the per-chromosome records for both assays.
chrom_agg_dname = group_by_chromosome(chunks=dname_chroms, data_name='methylation')
chrom_agg_atac = group_by_chromosome(chunks=atacseq_chroms, data_name='atac')

In [30]:
import json
# Write each aggregate to its own JSON file inside DATADIR.
json_outputs = (
    ('dname_range_grouped.json', agg_dname),
    ('atac_range_grouped.json', agg_atac),
    ('dname_chrom_grouped.json', chrom_agg_dname),
    ('atac_chrom_grouped.json', chrom_agg_atac),
)
for out_name, payload in json_outputs:
    with open(DATADIR + out_name, 'w') as fout:
        json.dump(payload, fout)
