# Purpose
The purpose of this code is to identify PMR from samples of DNA across many different cancers and test the algorithm

# Code

Tell ipython to reload modules so we can update other files

In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from methylation_utils.methylation_utils.analyzer import (
    MethylationAnalyzer,
    MethylationAnalyzerHelper,
)
from methylation_utils.methylation_utils.plot import MethPlot, ChromIdeoPlot
from methylation_utils.methylation_utils.utils import compare_bed_dfs
from methylation_utils.generic_utils import instantiate_progress_bar
from pybedtools import BedTool
import pyarrow.feather as feather

In [3]:
INCLUDE_NORMAL_SAMPLES = False

## Samples

In [4]:
SAMPLES_FILE = "data/old_data/runAll.sh.samples"
METH_DYNAMIC_FOLDER = "data/old_data/bottom/samples_meth_dynamic"
METH_REF_FILE = "data/old_data/parse450K.pl.order.lookup"

In [5]:
""" Read in the sample info and methylation reference into memory """
sample_info = pd.read_csv(SAMPLES_FILE,sep="\t")
print('Read ' + str(len(sample_info.index)) + ' samples')
meth_ref = pd.read_csv(METH_REF_FILE,sep="\t")

Read 9714 samples


In [6]:
meth_ref

Unnamed: 0,key,CpG_chrm,CpG_beg,CpG_end,probe_strand,probeID,genesUniq,geneNames,transcriptTypes,transcriptIDs,distToTSS,CGI,CGIposition
0,cg13869341,chr1,15864.0,15866.0,-,cg13869341,WASH7P,WASH7P,unprocessed_pseudogene,ENST00000488147.1,13706,,
1,cg14008030,chr1,18826.0,18828.0,-,cg14008030,MIR6859-1;WASH7P,MIR6859-1;WASH7P,miRNA;unprocessed_pseudogene,ENST00000619216.1;ENST00000488147.1,-1390;10744,,
2,cg12045430,chr1,29406.0,29408.0,-,cg12045430,MIR1302-2;MIR1302-2HG;WASH7P,MIR1302-2;MIR1302-2HG;MIR1302-2HG;WASH7P,miRNA;lncRNA;lncRNA;unprocessed_pseudogene,ENST00000607096.1;ENST00000469289.1;ENST000004...,-959;-860;-147;164,CGI:chr1:28735-29737,Island
3,cg20826792,chr1,29424.0,29426.0,-,cg20826792,MIR1302-2;MIR1302-2HG;WASH7P,MIR1302-2;MIR1302-2HG;MIR1302-2HG;WASH7P,miRNA;lncRNA;lncRNA;unprocessed_pseudogene,ENST00000607096.1;ENST00000469289.1;ENST000004...,-941;-842;-129;146,CGI:chr1:28735-29737,Island
4,cg00381604,chr1,29434.0,29436.0,-,cg00381604,MIR1302-2;MIR1302-2HG;WASH7P,MIR1302-2;MIR1302-2HG;MIR1302-2HG;WASH7P,miRNA;lncRNA;lncRNA;unprocessed_pseudogene,ENST00000607096.1;ENST00000469289.1;ENST000004...,-931;-832;-119;136,CGI:chr1:28735-29737,Island
...,...,...,...,...,...,...,...,...,...,...,...,...,...
485572,cg24238852,,,,*,cg24238852,,,,,,,
485573,cg15254640,,,,*,cg15254640,,,,,,,
485574,cg24336839,,,,*,cg24336839,,,,,,,
485575,cg11478607,,,,*,cg11478607,,,,,,,


In [7]:
sample_info

Unnamed: 0,sample,expression,expression_file,methylation,methylation_file,sample_id,project_id,composition,sample_type,tissue_type,tumor_descriptor,project_descriptor
0,TCGA-EX-A1H6-01B,72ac3cfa-31f8-4371-867d-ab897b2b7cb9.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,b04fb3ae-1075-4e4c-b519-743dd948ae45.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,fe018d31-eee8-4335-96a1-95eceabfadf1,TCGA-CESC,Not Reported,Primary Tumor,Not Reported,Not Reported,Cervical squamous cell carcinoma and endocervi...
1,TCGA-C5-A7CM-01A,8d405837-5989-4de9-a19c-4f7af4af65d3.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,d1efc9b7-7499-4f95-90bb-7613a42ce82f.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,fefa6b4c-243d-4544-accc-11515bbe516e,TCGA-CESC,Not Reported,Primary Tumor,Not Reported,Not Reported,Cervical squamous cell carcinoma and endocervi...
2,TCGA-C5-A2M1-01A,35e5936c-9722-4a49-9bb3-5c244bd10b13.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,c56b4dcd-d498-4394-9fc9-a449647021b1.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,065bb06f-5bb7-4f79-b374-97a3011f22ae,TCGA-CESC,Not Reported,Primary Tumor,Not Reported,Not Reported,Cervical squamous cell carcinoma and endocervi...
3,TCGA-LP-A5U2-01A,0aa549e1-bb9a-4489-b815-1c22817100d8.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,e51138bf-5d5c-4b32-ad9a-d8b84c2e541f.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,cbdbe0d3-b3b3-4990-be3a-9af8360889d8,TCGA-CESC,Not Reported,Primary Tumor,Not Reported,Not Reported,Cervical squamous cell carcinoma and endocervi...
4,TCGA-EK-A3GK-01A,20e72530-f9cc-44d6-81f9-bb7c49f1f7e4.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,1d7814b0-8818-4fcb-87e7-fef883f7571f.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,f9371b06-c6e5-4363-bdaa-bd85f95a01c4,TCGA-CESC,Not Reported,Primary Tumor,Not Reported,Not Reported,Cervical squamous cell carcinoma and endocervi...
...,...,...,...,...,...,...,...,...,...,...,...,...
9709,TCGA-E2-A15J-01A,723ed50c-2c78-4a3a-9f6e-9f9e8e264f6b.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,0b7f6506-a464-4635-ae6b-2820a0ab5206.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,6f68a61c-8aec-42d3-b225-22bc6da3e355,TCGA-BRCA,Not Reported,Primary Tumor,Not Reported,Not Reported,Breast invasive carcinoma
9710,TCGA-B6-A40C-01A,32c15b29-da14-4eb5-8c86-39efc94319df.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,f9dd5273-22ef-46c9-a8ad-c501825d2f3e.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,e852c16b-cd36-4551-8cc5-8b59719ca987,TCGA-BRCA,Not Reported,Primary Tumor,Not Reported,Not Reported,Breast invasive carcinoma
9711,TCGA-BH-A0B3-11B,d920c232-82ae-48f9-98d1-b94bf4a45d8d.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,b504445f-170c-44ce-8a84-70db6930397f.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,9a81af95-30f8-45b1-bbda-ca5c7a91c5c5,TCGA-BRCA,Not Reported,Solid Tissue Normal,Not Reported,Not Reported,Breast invasive carcinoma
9712,TCGA-E9-A1QZ-01A,0f168687-6f42-4d92-8f91-13efde785a0c.rna_seq.a...,/uufs/chpc.utah.edu/common/home/clementm-group...,a26af0de-1dff-4345-adf2-d3794fd33a6d.methylati...,/uufs/chpc.utah.edu/common/home/clementm-group...,ec76e770-ba6b-4294-a992-993183640ff8,TCGA-BRCA,Not Reported,Primary Tumor,Not Reported,Not Reported,Breast invasive carcinoma


In [8]:
cancer_samples = sample_info[sample_info.sample_type != "Solid Tissue Normal"]

In [9]:
cancer_samples.sample_type.unique()

array(['Primary Tumor', 'Additional - New Primary', 'Metastatic',
       'Recurrent Tumor', 'Additional Metastatic',
       'Recurrent Blood Derived Cancer - Bone Marrow',
       'Primary Blood Derived Cancer - Bone Marrow',
       'Primary Blood Derived Cancer - Peripheral Blood',
       'Recurrent Blood Derived Cancer - Peripheral Blood'], dtype=object)

## Known

In [10]:
# !wget https://zhouserver.research.chop.edu/GenomeAnnotation/hg38/PMD_coordinates_hg38.bed.gz -P files/

In [11]:
zhou_meth_data = pd.read_csv('files/PMD_coordinates_hg38.bed.gz', compression='gzip',sep="\t",header=None,names=['chrom','start','end','val','PMD','type'])

In [12]:
zho_common_pmds = []

current_chrom = None
for index, row in zhou_meth_data.iterrows():
    if row['type'] == "commonPMD":
        if current_chrom is None or current_chrom != row['chrom']:
            #Start new region
            current_chrom = row['chrom']
            zho_common_pmds.append([row['chrom'], row['start'], row['end']])
        else:
            zho_common_pmds[-1][-1] = row['end']
    else:
        current_chrom = None

In [13]:
zho_common_pmds_df = pd.DataFrame(zho_common_pmds, columns=['chrom','start','end'])
zho_common_pmds_df["length"] = zho_common_pmds_df["end"] - zho_common_pmds_df["start"]

## Analysis

### Constants

In [14]:
DEBUG = False

""" For individual samples """
METH_INTERMEDIATE_HIGH_CUTOFF = 70
METH_INTERMEDIATE_LOW_CUTOFF = 20

""" For combined samples """
# Number of samples that need to have intermediate methylation at a site for the site to be considered intermediatly methylated
if INCLUDE_NORMAL_SAMPLES:
    CUTOFF_INTERMEDIATE_VAL = len(sample_info.index) / 2
else:
    CUTOFF_INTERMEDIATE_VAL = len(cancer_samples.index) / 2


# Percent of markers that need to be intermediatly methylated in a window
CUTOFF_PCT = 0.60

In [15]:
analyzer = MethylationAnalyzer(
    sample_info if INCLUDE_NORMAL_SAMPLES else cancer_samples,
    meth_ref,
    METH_INTERMEDIATE_HIGH_CUTOFF,
    METH_INTERMEDIATE_LOW_CUTOFF,
    CUTOFF_PCT,
    sample_cutoff_val_low = CUTOFF_INTERMEDIATE_VAL,
    debug = DEBUG,
    optimize_for_humans=False,
)
analyzer.analyze(True)

IntProgress(value=0, max=9315)

IntProgress(value=0, max=485577)

In [16]:
meth_ref_sorted = analyzer.sorted_reference_data

In [17]:
all_sample_meth_vals = meth_ref.copy()
headers_to_exclude = set(all_sample_meth_vals.columns.values.tolist())
headers_to_exclude.remove("CpG_beg")
headers_to_exclude.remove("CpG_chrm")

meth_vals = {}

if INCLUDE_NORMAL_SAMPLES:
    samples = sample_info
else:
    samples = cancer_samples
        
for row_index, sample_row in samples.iterrows():

    sample_name = sample_row["sample"]
    meth_file = sample_row["methylation_file"]

    meth = np.load(meth_file).astype(float)
    meth[meth == 255] = np.nan
    meth_vals[sample_name] = meth

methylation_values_df = pd.DataFrame(meth_vals)
all_sample_meth_vals = pd.concat(
    [
        all_sample_meth_vals.reset_index(drop=True),
        methylation_values_df.reset_index(drop=True),
    ],
    axis=1,
)


all_sample_meth_vals.dropna(subset=["CpG_beg"], inplace=True)

all_samples = all_sample_meth_vals.drop(columns=headers_to_exclude)
all_samples["chrom_num"] = all_samples["CpG_chrm"].str.extract("(\d+)")
all_samples.dropna(subset=["chrom_num"], inplace=True)
all_samples["chrom_num"] = all_samples["chrom_num"].astype(int)
all_samples.reset_index()

Unnamed: 0,index,CpG_chrm,CpG_beg,TCGA-EX-A1H6-01B,TCGA-C5-A7CM-01A,TCGA-C5-A2M1-01A,TCGA-LP-A5U2-01A,TCGA-EK-A3GK-01A,TCGA-Q1-A73P-01A,TCGA-UC-A7PI-01A,...,TCGA-EW-A1P1-01A,TCGA-OL-A66L-01A,TCGA-AR-A0TR-01A,TCGA-BH-A0HA-01A,TCGA-BH-A1ES-06A,TCGA-E2-A15J-01A,TCGA-B6-A40C-01A,TCGA-E9-A1QZ-01A,TCGA-S3-AA10-01A,chrom_num
0,0,chr1,15864.0,,,,,,,,...,,,,,,,,,,1
1,1,chr1,18826.0,,,,,,,,...,,,,,,,,,,1
2,2,chr1,29406.0,,,,,,,,...,,,,,,,,,,1
3,3,chr1,29424.0,,,,,,,,...,,,,,,,,,,1
4,4,chr1,29434.0,,,,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473873,473873,chr22,50783822.0,,,,,,,,...,,,,,,,,,,22
473874,473874,chr22,50784090.0,,,,,,,,...,,,,,,,,,,22
473875,473875,chr22,50784300.0,,,,,,,,...,,,,,,,,,,22
473876,473876,chr22,50784914.0,,,,,,,,...,,,,,,,,,,22


In [18]:
all_samples

Unnamed: 0,CpG_chrm,CpG_beg,TCGA-EX-A1H6-01B,TCGA-C5-A7CM-01A,TCGA-C5-A2M1-01A,TCGA-LP-A5U2-01A,TCGA-EK-A3GK-01A,TCGA-Q1-A73P-01A,TCGA-UC-A7PI-01A,TCGA-JX-A3Q8-01A,...,TCGA-EW-A1P1-01A,TCGA-OL-A66L-01A,TCGA-AR-A0TR-01A,TCGA-BH-A0HA-01A,TCGA-BH-A1ES-06A,TCGA-E2-A15J-01A,TCGA-B6-A40C-01A,TCGA-E9-A1QZ-01A,TCGA-S3-AA10-01A,chrom_num
0,chr1,15864.0,,,,,,,,,...,,,,,,,,,,1
1,chr1,18826.0,,,,,,,,,...,,,,,,,,,,1
2,chr1,29406.0,,,,,,,,,...,,,,,,,,,,1
3,chr1,29424.0,,,,,,,,,...,,,,,,,,,,1
4,chr1,29434.0,,,,,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473873,chr22,50783822.0,,,,,,,,,...,,,,,,,,,,22
473874,chr22,50784090.0,,,,,,,,,...,,,,,,,,,,22
473875,chr22,50784300.0,,,,,,,,,...,,,,,,,,,,22
473876,chr22,50784914.0,,,,,,,,,...,,,,,,,,,,22


In [19]:
windows_df = analyzer.windows_df
windows_df["length"] = windows_df["CpG_end"] - windows_df["CpG_beg"]
windows_df["chrom_num"] = windows_df["CpG_chrm"].str.extract("(\d+)").astype(int)

windows_df

Unnamed: 0,CpG_chrm,CpG_beg,CpG_end,start_idx,end_idx,count_above_pct,count_in_window,count_in_end_tail,length,chrom_num
0,chr1,898802,898802,76,76,1,1,0,0,1
1,chr1,904054,904054,83,83,1,1,0,0,1
2,chr1,908110,908110,88,88,1,1,0,0,1
3,chr1,910429,924641,93,128,15,24,0,14212,1
4,chr1,930479,930479,142,142,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
16814,chr9,137837099,137837099,473816,473816,1,1,0,0,9
16815,chr9,137876462,137876462,473824,473824,1,1,0,0,9
16816,chr9,137877146,137919088,473826,473838,6,8,1,41942,9
16817,chr9,138045126,138045126,473852,473852,1,1,0,0,9


In [20]:
import swifter

In [21]:
def get_percent_coverge(row):
    window_samples = all_samples[
        (all_samples["chrom_num"] == row["chrom_num"])
        & (all_samples["CpG_beg"] >= row["CpG_beg"])
        & (all_samples["CpG_beg"] <= row["CpG_end"])
    ]
    window_meth_vals = window_samples.drop(columns=["CpG_chrm", "chrom_num", "CpG_beg"])
    count_with_value = window_meth_vals.notnull().sum().sum()
    total_count = window_meth_vals.size
    return np.where(total_count == 0, 0, count_with_value / total_count)



In [22]:
windows_df["coverage_percentage"] = windows_df.swifter.apply(
    get_percent_coverge, axis=1
)

Pandas Apply:   0%|          | 0/16819 [00:00<?, ?it/s]

In [23]:
windows_df

Unnamed: 0,CpG_chrm,CpG_beg,CpG_end,start_idx,end_idx,count_above_pct,count_in_window,count_in_end_tail,length,chrom_num,coverage_percentage
0,chr1,898802,898802,76,76,1,1,0,0,1,1.0
1,chr1,904054,904054,83,83,1,1,0,0,1,1.0
2,chr1,908110,908110,88,88,1,1,0,0,1,1.0
3,chr1,910429,924641,93,128,15,24,0,14212,1,0.9721059223474683
4,chr1,930479,930479,142,142,1,1,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...
16814,chr9,137837099,137837099,473816,473816,1,1,0,0,9,1.0
16815,chr9,137876462,137876462,473824,473824,1,1,0,0,9,1.0
16816,chr9,137877146,137919088,473826,473838,6,8,1,41942,9,0.7524753292869235
16817,chr9,138045126,138045126,473852,473852,1,1,0,0,9,1.0


In [24]:
zho_common_pmds_df

Unnamed: 0,chrom,start,end,length
0,chr1,0,200000,200000
1,chr1,500000,700000,200000
2,chr1,2800000,3400000,600000
3,chr1,3500000,3600000,100000
4,chr1,3900000,5800000,1900000
...,...,...,...,...
1989,chr9,131000000,131100000,100000
1990,chr9,132100000,132200000,100000
1991,chr9,133600000,133700000,100000
1992,chr9,134500000,135800000,1300000


In [25]:
feather.write_feather(meth_ref_sorted, "shared_data/meth_ref_sorted.feather" if INCLUDE_NORMAL_SAMPLES else "shared_data/meth_ref_sorted.cancer.feather")

In [26]:
feather.write_feather(all_samples, "shared_data/all_samples.feather" if INCLUDE_NORMAL_SAMPLES else "shared_data/all_samples.cancer.feather")

In [27]:
windows_df["coverage_percentage"] = windows_df["coverage_percentage"].astype(str)

In [28]:
feather.write_feather(windows_df, "shared_data/windows_df.feather" if INCLUDE_NORMAL_SAMPLES else "shared_data/windows_df.cancer.feather")

In [29]:
windows_df["coverage_percentage"] = windows_df["coverage_percentage"].astype(float)

In [30]:
feather.write_feather(zho_common_pmds_df, "shared_data/zho_common_pmds_df.feather")

In [31]:
LENGTH_CUTTOFF = 1000
COUNT_CUTTOFF = 50
COVERAGE_CUTOFF = .75

In [32]:
windows_to_analyze = windows_df[
    (windows_df["length"] > LENGTH_CUTTOFF)
    & (windows_df["count_in_window"] > COUNT_CUTTOFF)
    & (windows_df["coverage_percentage"] > COVERAGE_CUTOFF)
]

In [33]:
windows_to_analyze.sort_values(
    by=["chrom_num", "CpG_beg"], ascending=True, inplace=True
)
windows_to_analyze.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  windows_to_analyze.sort_values(


In [34]:
windows_to_analyze

Unnamed: 0,index,CpG_chrm,CpG_beg,CpG_end,start_idx,end_idx,count_above_pct,count_in_window,count_in_end_tail,length,chrom_num,coverage_percentage
0,1169,chr1,118980013,119632249,27789,27962,42,67,2,652236,1,0.898576
1,9600,chr2,44929502,45170662,256085,256255,47,75,3,241160,2,0.954641
2,10121,chr2,176098854,177076134,272094,272457,79,126,5,977280,2,0.917738
3,10266,chr2,221999366,222517706,277707,277849,33,54,0,518340,2,0.969795
4,12095,chr3,147385052,149086862,324950,325187,41,66,2,1701810,3,0.890606
5,13063,chr5,73176013,73451873,360610,360764,39,57,7,275860,5,0.912059
6,14334,chr6,100446636,104859853,402561,402762,36,51,8,4413217,6,0.836262
7,14687,chr7,1291090,1448147,414661,414840,51,82,2,157057,7,0.893054
8,14866,chr7,27110783,27144749,420688,420833,32,52,1,33966,7,0.95215
9,14874,chr7,27185507,27364567,421030,421211,53,75,13,179060,7,0.933422


In [35]:
helper = MethylationAnalyzerHelper(
    upper_cutoff=METH_INTERMEDIATE_HIGH_CUTOFF,
    lower_cutoff=METH_INTERMEDIATE_LOW_CUTOFF,
    dmr_percent_cutoff=CUTOFF_PCT,
    sample_cutoff_val_low=CUTOFF_INTERMEDIATE_VAL,
    debug=DEBUG,
    optimize_for_humans=False,
)

In [36]:
import collections

In [37]:
windows_to_analyze

Unnamed: 0,index,CpG_chrm,CpG_beg,CpG_end,start_idx,end_idx,count_above_pct,count_in_window,count_in_end_tail,length,chrom_num,coverage_percentage
0,1169,chr1,118980013,119632249,27789,27962,42,67,2,652236,1,0.898576
1,9600,chr2,44929502,45170662,256085,256255,47,75,3,241160,2,0.954641
2,10121,chr2,176098854,177076134,272094,272457,79,126,5,977280,2,0.917738
3,10266,chr2,221999366,222517706,277707,277849,33,54,0,518340,2,0.969795
4,12095,chr3,147385052,149086862,324950,325187,41,66,2,1701810,3,0.890606
5,13063,chr5,73176013,73451873,360610,360764,39,57,7,275860,5,0.912059
6,14334,chr6,100446636,104859853,402561,402762,36,51,8,4413217,6,0.836262
7,14687,chr7,1291090,1448147,414661,414840,51,82,2,157057,7,0.893054
8,14866,chr7,27110783,27144749,420688,420833,32,52,1,33966,7,0.95215
9,14874,chr7,27185507,27364567,421030,421211,53,75,13,179060,7,0.933422


In [38]:
def populate_region_stats(meth_windows):
    meth_windows["next_start"] = meth_windows.groupby("chrom_num")["CpG_beg"].shift(-1)
    meth_windows["dist_to_next"] = meth_windows["next_start"] - meth_windows["CpG_end"]

    if "intermediate_percent" in meth_windows.columns:
        meth_windows = meth_windows.drop(columns=["intermediate_percent"])

    progress_bar = instantiate_progress_bar(max=len(meth_windows.index))
    percentages = []
    dist_to_next = collections.OrderedDict()

    for index, window in meth_windows.iterrows():
        progress_bar.value += 1

        pmrs, percent, int_count, total_count = helper.check_percent_in_region(
            meth_ref_sorted,
            window["CpG_chrm"],
            window["CpG_beg"],
            window["CpG_end"],
        )
        percentages.append(percent)
    return pd.concat(
        [
            meth_windows.reset_index(drop=True),
            pd.DataFrame(percentages, columns=["intermediate_percent"]).reset_index(
                drop=True
            ),
        ],
        axis=1,
    )

In [39]:

windows_to_analyze = populate_region_stats(windows_to_analyze)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meth_windows["next_start"] = meth_windows.groupby("chrom_num")["CpG_beg"].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meth_windows["dist_to_next"] = meth_windows["next_start"] - meth_windows["CpG_end"]


IntProgress(value=0, max=28)

In [40]:
windows_to_analyze

Unnamed: 0,index,CpG_chrm,CpG_beg,CpG_end,start_idx,end_idx,count_above_pct,count_in_window,count_in_end_tail,length,chrom_num,coverage_percentage,next_start,dist_to_next,intermediate_percent
0,1169,chr1,118980013,119632249,27789,27962,42,67,2,652236,1,0.898576,,,0.626866
1,9600,chr2,44929502,45170662,256085,256255,47,75,3,241160,2,0.954641,176098854.0,130928192.0,0.626667
2,10121,chr2,176098854,177076134,272094,272457,79,126,5,977280,2,0.917738,221999366.0,44923232.0,0.626984
3,10266,chr2,221999366,222517706,277707,277849,33,54,0,518340,2,0.969795,,,0.611111
4,12095,chr3,147385052,149086862,324950,325187,41,66,2,1701810,3,0.890606,,,0.621212
5,13063,chr5,73176013,73451873,360610,360764,39,57,7,275860,5,0.912059,,,0.684211
6,14334,chr6,100446636,104859853,402561,402762,36,51,8,4413217,6,0.836262,,,0.705882
7,14687,chr7,1291090,1448147,414661,414840,51,82,2,157057,7,0.893054,27110783.0,25662636.0,0.621951
8,14866,chr7,27110783,27144749,420688,420833,32,52,1,33966,7,0.95215,27185507.0,40758.0,0.615385
9,14874,chr7,27185507,27364567,421030,421211,53,75,13,179060,7,0.933422,94656359.0,67291792.0,0.706667


# Sources

1. Zhou, W., Dinh, H.Q., Ramjan, Z. et al. DNA methylation loss in late-replicating domains is linked to mitotic cell division. Nat Genet 50, 591–602 (2018). https://doi.org/10.1038/s41588-018-0073-4

2. Dale RK, Pedersen BS, and Quinlan AR. 2011. Pybedtools: a flexible Python library for manipulating genomic datasets and annotations. Bioinformatics 27(24):3423-3424.

3. Quinlan AR and Hall IM, 2010. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics 26(6):841–842.

# Dataset
<pre>
data from https://www.nature.com/articles/s41588-018-0073-4
DNA methylation loss in late-replicating domains is linked to mitotic cell division
incredible resource here: https://zwdzwd.github.io/pmd
!wget https://zhouserver.research.chop.edu/GenomeAnnotation/hg38/PMD_coordinates_hg38.bed.gz
</pre>