In [75]:
%load_ext autoreload
%autoreload 2

import os
import subprocess
import sys
import warnings

warnings.simplefilter("ignore", FutureWarning)

import xclone_config
project_config = xclone_config
os.chdir(project_config.ROOT)

from collections import defaultdict, OrderedDict
import multiprocessing as mp
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

import matplotlib.pyplot as plt
import seaborn as sns

from data_types import ase
import plotlib
import toolkit
import util
import test_phasing
from workspace.workspace_manager import WorkspaceManager

workspace = WorkspaceManager(
    task_name="preprocessing",
    experiment_info={"sample" : "N5CC3E-T1", 
                     "modality" : "scATAC"},
    verbose=True
)
workspace.load_workspace()

sns.set()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading data

In [76]:
requirements = ["raw_snp_counts", "phasing", "blocks"]

data = {
    dtype : 
    util.pickle_load(workspace.tmp_data[dtype])
    for dtype in tqdm_notebook(
        requirements,
        "loading datasets into RAM"
    )
}

HBox(children=(IntProgress(value=0, description='loading datasets into RAM', max=3, style=ProgressStyle(descri…

Let's ensure that "X" and "y" chromosomes are dropped out.  
They are of no interest when it comes to allelic rates.

In [77]:
snp_df = data["raw_snp_counts"][["CHROM", "POS"]].to_dense()
all_snps = toolkit.extract_snps(
    snp_df[~np.isin(snp_df["CHROM"], ["X", "y"])]
)
phased_snps = toolkit.extract_snps(
    data["phasing"][
        ~np.isin(data["phasing"]["CHROM"], 
                 ["X", "y"])
    ]
)

HBox(children=(IntProgress(value=0, description='extracting snps...', max=1156695, style=ProgressStyle(descrip…

HBox(children=(IntProgress(value=0, description='extracting snps...', max=6629429, style=ProgressStyle(descrip…

In [78]:
preserve_mask = (
    ((snp_df["CHROM"] != 'X') & (snp_df["CHROM"] != 'y'))
    & np.isin(all_snps, phased_snps)
)
filtered_counts = []
for colname in tqdm_notebook(data["raw_snp_counts"].columns):
    dense_col = data["raw_snp_counts"][colname].to_dense()
    filtered_counts.append(
        dense_col[preserve_mask]\
        .astype(np.float64)\
        .reset_index(drop=True)\
        .to_sparse(fill_value=np.nan)
    )
filtered_counts_df = pd.concat(filtered_counts, axis=1)
filtered_counts_df.head()

HBox(children=(IntProgress(value=0, max=14334), HTML(value='')))

Unnamed: 0,CHROM,POS,AAACGAAAGGGAGATA-1_ad,AAACGAAAGGGAGATA-1_dp,AAACGAAAGTCGATAA-1_ad,AAACGAAAGTCGATAA-1_dp,AAACGAACAAACCCTA-1_ad,AAACGAACAAACCCTA-1_dp,AAACGAACAACGCACC-1_ad,AAACGAACAACGCACC-1_dp,...,TTTGTGTGTTTGATCG-1_ad,TTTGTGTGTTTGATCG-1_dp,TTTGTGTTCCTATCAT-1_ad,TTTGTGTTCCTATCAT-1_dp,TTTGTGTTCCTCATTA-1_ad,TTTGTGTTCCTCATTA-1_dp,TTTGTGTTCGCTAGTA-1_ad,TTTGTGTTCGCTAGTA-1_dp,TTTGTGTTCTTAATCC-1_ad,TTTGTGTTCTTAATCC-1_dp
0,1.0,14001867.0,,,,,,,,,...,,,,,,,,,,
1,1.0,14003581.0,,,,,,,,,...,,,,,,,,,,
2,1.0,14007558.0,,,,,,,,,...,,,,,,,,,,
3,1.0,14007649.0,,,,,,,,,...,,,,,,,,,,
4,1.0,14008734.0,,,,,,,,,...,,,,,,,,,,


In [79]:
data["raw_snp_counts"] = filtered_counts_df

In [80]:
data["phasing"] = data["phasing"][
    ~np.isin(data["phasing"].CHROM, ["X", "y"])
].reset_index(drop=True)

In [81]:
data["blocks"] = data["blocks"][
    ~np.isin(data["blocks"].CHROM, ["X", "y"])
].reset_index(drop=True)

In [82]:
data["blocks"].head()

Unnamed: 0,CHROM,START,END,COPY_NUMBER,BLOCK_ID
0,1,14000000,24000000,2,0
1,1,16000000,26000000,2,1
2,1,18000000,28000000,2,2
3,1,20000000,30000000,2,3
4,1,22000000,32000000,2,4


## Ensuring that alternative allele is maternal (0|1 in raw phasing results)

In [83]:
def phase_snp_counts(raw_snp_counts_df, phasing_df):

    counts_df = raw_snp_counts_df.copy()
    counts_df["PHASE"] = phasing_df.PHASE
    male_alt = counts_df.PHASE == 1
    
    print("Ensuring that non-phased SNPs were filtered out")
    assert np.all(np.isin(toolkit.extract_snps(counts_df), 
                  toolkit.extract_snps(phasing_df)))

    for barcode in tqdm_notebook(toolkit.extract_barcodes(counts_df), 
                                 desc=f"cell_barcode"):
        ad = counts_df[f"{barcode}_ad"].to_dense()
        dp = counts_df[f"{barcode}_dp"].to_dense()
        ad[male_alt] = dp[male_alt].sub(ad[male_alt], fill_value=0)
        counts_df[f"{barcode}_ad"] = ad.to_sparse()

    new_nan_stats = util.nan_fraction(counts_df.drop(columns=["PHASE"]))
    old_nan_stats = util.nan_fraction(raw_snp_counts_df)

    assert old_nan_stats == new_nan_stats

    print("{:.2f}% of non-missing read counts".format(
        100 * np.mean(~counts_df.iloc[:, 2:].isna().values.astype(bool))
    ))
    return counts_df

In [84]:
%%time
warnings.simplefilter("ignore", pd.core.common.SettingWithCopyWarning)
data["snp_counts"] = phase_snp_counts(data["raw_snp_counts"], data["phasing"])
data["snp_counts"].head()

Ensuring that non-phased SNPs were filtered out


HBox(children=(IntProgress(value=0, description='extracting snps...', max=1008454, style=ProgressStyle(descrip…

HBox(children=(IntProgress(value=0, description='extracting snps...', max=6629429, style=ProgressStyle(descrip…

HBox(children=(IntProgress(value=0, description='cell_barcode', max=7166, style=ProgressStyle(description_widt…

0.06% of non-missing read counts
CPU times: user 32min 11s, sys: 2min 9s, total: 34min 21s
Wall time: 34min 39s


Unnamed: 0,CHROM,POS,AAACGAAAGGGAGATA-1_ad,AAACGAAAGGGAGATA-1_dp,AAACGAAAGTCGATAA-1_ad,AAACGAAAGTCGATAA-1_dp,AAACGAACAAACCCTA-1_ad,AAACGAACAAACCCTA-1_dp,AAACGAACAACGCACC-1_ad,AAACGAACAACGCACC-1_dp,...,TTTGTGTGTTTGATCG-1_dp,TTTGTGTTCCTATCAT-1_ad,TTTGTGTTCCTATCAT-1_dp,TTTGTGTTCCTCATTA-1_ad,TTTGTGTTCCTCATTA-1_dp,TTTGTGTTCGCTAGTA-1_ad,TTTGTGTTCGCTAGTA-1_dp,TTTGTGTTCTTAATCC-1_ad,TTTGTGTTCTTAATCC-1_dp,PHASE
0,1.0,14001867.0,,,,,,,,,...,,,,,,,,,,1
1,1.0,14003581.0,,,,,,,,,...,,,,,,,,,,1
2,1.0,14007558.0,,,,,,,,,...,,,,,,,,,,1
3,1.0,14007649.0,,,,,,,,,...,,,,,,,,,,1
4,1.0,14008734.0,,,,,,,,,...,,,,,,,,,,1


In [85]:
util.pickle_dump(
    data["snp_counts"],
    os.path.join(
        workspace.tmp_dir,
        "snp_counts.pkl"
    )
)
# util.pickle_dump(
#     data["raw_snp_counts"],
#     os.path.join(
#         workspace.tmp_dir,
#         "raw_snp_counts.pkl"
#     )
# )
# util.pickle_dump(
#     data["phasing"],
#     os.path.join(
#         workspace.tmp_dir,
#         "phasing.pkl"
#     )
# )
# util.pickle_dump(
#     data["blocks"],
#     os.path.join(
#         workspace.tmp_dir,
#         "blocks.pkl"
#     )
# )
# workspace.add_entry("raw_snp_counts_T1", "raw_snp_counts_T1.pkl")
workspace.add_entry("snp_counts", "10mb_eagle2_phased_snp_counts.pkl")
# workspace.add_entry("phasing", "phasing.pkl")
# workspace.add_entry("blocks", "haplotype_blocks.pkl")
workspace.verify()
workspace.push()

/icgc/dkfzlsdf/analysis/B260/users/v390v/xclone/data/tmp/preprocessing/N5CC3E-T1/scATAC/snp_counts.pkl —> /icgc/dkfzlsdf/analysis/B260/users/v390v/xclone/data/processed/N5CC3E-T1/scATAC/10mb_eagle2_phased_snp_counts.pkl
