# DSS Single Region All DML

- Only calculates DML statistics for a single region
- Saves all the DML statistics for the two groups of samples


In [None]:
import pandas as pd
from pysam import TabixFile
import rpy2
import pathlib
from rpy2.robjects.vectors import IntVector
%load_ext rpy2.ipython

## Parameters

In [None]:
# Genomic interval to test; handed as-is to TabixFile.fetch when reading each ALLC file.
region = 'chr19:0-1000000'
# One ALLC file per sample, opened through TabixFile (so presumably bgzipped and
# tabix-indexed — confirm). The order must match `group1` followed by `group2`,
# because sample names are paired with these files by position.
allc_paths = [
    '../../DG_ALLC/DG_1000_0.allc.tsv.gz',
    '../../DG_ALLC/DG_1000_1.allc.tsv.gz',
    '../../DG_ALLC/DG_1000_2.allc.tsv.gz',
    '../../IT-L23_ALLC/IT-L23_1000_0.allc.tsv.gz',
    '../../IT-L23_ALLC/IT-L23_1000_1.allc.tsv.gz',
    '../../IT-L23_ALLC/IT-L23_1000_2.allc.tsv.gz'
]
# Sample names of the two groups that are compared against each other.
group1 = ['DG_1000_0', 'DG_1000_1', 'DG_1000_2']
group2 = ['IT-L23_1000_0', 'IT-L23_1000_1', 'IT-L23_1000_2']
# Forwarded to the `smoothing` argument of DMLtest in the R cell.
smoothing = True

In [None]:
# The output file is named after the region being tested.
output_path = f'{region}.DSS.DML.hdf'
# Sample names in the same order as `allc_paths` (group1 first, then group2).
samples = group1 + group2
# Sample names are paired with the ALLC files purely by position, so a length
# mismatch would silently drop or mislabel samples further down; fail early.
if len(samples) != len(allc_paths):
    raise ValueError(
        f'Got {len(samples)} sample names (group1 + group2) but '
        f'{len(allc_paths)} ALLC paths; they must correspond one-to-one.')

## R Library

If DSS is not installed, run this code to install it:
```R
%%R

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("DSS")
```

In [None]:
%%R
# Attach DSS and bsseq. library() is used for both because require() only warns
# and returns FALSE when a package is missing, which would postpone the failure
# to the first call that needs it.
library(DSS)
library(bsseq)

## Load Data

In [None]:
# reformat allc to dss required format
def get_data(allc_paths, region=None):
    """Read one region from each ALLC file and reshape it for DSS.

    Parameters
    ----------
    allc_paths : iterable of str or path-like
        ALLC files to read; each one is opened with ``TabixFile``.
    region : str, optional
        Region string handed to ``TabixFile.fetch``. When omitted, the
        notebook-level ``region`` variable is used, so existing
        ``get_data(allc_paths)`` calls keep working unchanged.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input path, in input order, with the columns
        ``chr``, ``pos``, ``N`` and ``X``. ``N`` is taken from the 6th
        tab-separated field (``cov``) and ``X`` from the 5th (``mc``).
        A file with no record in the region yields an empty frame.

    Raises
    ------
    ValueError
        If a fetched line does not have exactly seven tab-separated fields
        or ``pos``/``mc``/``cov`` are not integers.
    """
    dss_dfs = []
    for input_path in allc_paths:
        # Resolve the fallback lazily so an empty path list never needs the
        # notebook-level variable.
        query_region = region if region is not None else globals()['region']
        records = []
        with TabixFile(str(input_path)) as f:
            # Pass the string through the `region` keyword rather than the
            # positional `reference` slot, so intent matches the pysam API.
            for line in f.fetch(region=query_region):
                chromosome, pos, _, _, mc, cov, _ = line.split('\t')
                # DSS expects total count N before methylated count X.
                records.append([chromosome, int(pos), int(cov), int(mc)])
        dss_dfs.append(pd.DataFrame(records, columns=['chr', 'pos', 'N', 'X']))
    return dss_dfs

# one DataFrame per ALLC file, in the same order as allc_paths
dss_dfs = get_data(allc_paths)

In [None]:
# Tally the rows contributed by each group across all of its samples.
rows_per_group = {'group1': 0, 'group2': 0}
for sample, dss_df in zip(samples, dss_dfs):
    group_key = 'group1' if sample in group1 else 'group2'
    rows_per_group[group_key] += len(dss_df)
group1_count = rows_per_group['group1']
group2_count = rows_per_group['group2']

if 0 in (group1_count, group2_count):
    # At least one group has no rows in this region. The R cells are expected
    # to fail in that situation, so an empty result table with the DML columns
    # is prepared here to make sure `dmls` exists anyway.
    empty_dml_columns = [
        'chr', 'pos', 'mu1', 'mu2', 'diff', 'diff.se',
        'stat', 'phi1', 'phi2', 'pval', 'fdr',
        'postprob.overThreshold'
    ]
    dmls = pd.DataFrame([], columns=empty_dml_columns)

## Create Dataset

In [None]:
%%R -i dss_dfs -i samples
# dss_dfs and samples are imported from the Python namespace by the -i flags.
# Build the BSseq object from the per-sample tables (columns chr, pos, N, X),
# using `samples` as the sample names.
BSobj = makeBSseqData(dss_dfs, samples)
# Evaluate the object on its own so its summary appears in the cell output.
BSobj

## DML test

In [None]:
%%R -i group1 -i group2 -i smoothing
# do not parallelize in R: register a single-worker backend as the default,
# which the bpparam() call below then picks up
# NOTE(review): `default` stores the previously registered backends but is
# never used to restore them afterwards — confirm whether that is intended.
default <- registered()
register(MulticoreParam(workers = 1), default = TRUE)

# Run the DML test between the two groups of samples in BSobj.
# NOTE(review): whether DMLtest accepts BPPARAM presumably depends on the
# installed DSS version — verify against the environment in use.
dml_result <- DMLtest(BSobj, 
                     group1=unlist(group1), # turn list into vector
                     group2=unlist(group2), 
                     smoothing=smoothing,
                     BPPARAM=bpparam())

## Call DML (DMS)

In [None]:
%%R -o dmls
# p.threshold=1 is the loosest cutoff, presumably so that the statistics of
# (nearly) all tested loci are kept rather than only the significant ones.
# The result is exported to the Python namespace as `dmls` by the -o flag.
dmls <- callDML(dml_result, p.threshold=1)

## Save

In [None]:
try:
    dmls.to_hdf(output_path, key='data', format="table")
except NameError:
    # `dmls` does not exist when the R cells failed before exporting it (e.g.
    # there is no CpG in the input). Leave an empty output file plus an
    # `.empty_flag` marker next to it.
    # Path.touch() replaces a shell `touch` on an interpolated string, so paths
    # with spaces or shell metacharacters are handled safely.
    for placeholder_path in (output_path, f'{output_path}.empty_flag'):
        pathlib.Path(placeholder_path).touch()
