In [46]:
import pandas as pd
import os

# Root directory of the variant-filtering output. The per-chromosome statistics
# files are looked up below it as "c<chrom>/c<chrom>_overview_filtering_statistics_per_block.txt".
in_path = "/data/workspaces/lag/workspaces/lg-ukbiobank/projects/rest-multimodal/wes/genotype_variant_filtering/"

def read_stats(fn):
    """Load one per-block filtering-statistics table into a DataFrame.

    The input is parsed as tab-separated text with no header row. Its 19
    columns are labelled, in file order, with the names listed in
    ``stat_columns`` below.

    Parameters
    ----------
    fn : str, path object or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one row per record in the file.
    """
    # Labels are positional: the n-th name is attached to the n-th field.
    stat_columns = """
        chr block
        total_all total_target_regions monoallelic variants_pre
        avg_gq missing mac ab
        total_removed total_remaining
        ts_pre tv_pre ts_tv_ratio_pre
        ts_post tv_post ts_tv_ratio_post
        multiallelic_removed
    """.split()
    return pd.read_csv(fn, sep="\t", header=None, names=stat_columns)

def read_all(file_list, column_names=None):
    """Read every statistics file in ``file_list`` and stack them into one DataFrame.

    Parameters
    ----------
    file_list : iterable of str
        Paths that are handed, one by one, to ``read_stats``.
    column_names : ignored, optional
        Accepted only so that existing two-argument calls keep working.
        The column labels are fixed inside ``read_stats``; this argument is
        not used.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-file frames in ``file_list`` order.
        Each file keeps its own row index, so index labels repeat across files.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when ``file_list`` is empty.
    """
    return pd.concat([read_stats(fn) for fn in file_list])


# Chromosome labels "1" .. "22" followed by "X".
chrom_list = [str(i) for i in range(1, 23)] + ["X"]

# One statistics file per chromosome:
# <in_path>/c<chrom>/c<chrom>_overview_filtering_statistics_per_block.txt
file_list = [
    os.path.join(
        in_path,
        "c{}".format(chrom),
        "c{}_overview_filtering_statistics_per_block.txt".format(chrom),
    )
    for chrom in chrom_list
]

# `column_names` exists only as a local inside read_stats, so it is not passed
# here: referencing it at module level raises NameError on a fresh kernel.
wes_stats = read_all(file_list)


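# NOTE (former TO DO "add map function and for loop for all chromosomes"): done - read_all() now applies read_stats to every chromosome in chrom_list, superseding the single-chromosome reads below.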
#chrom2 = pd.read_csv(os.path.join(in_path,"c2", "c2_overview_filtering_statistics_per_block.txt"), sep="\t", header=None, names=column_names)
#chrom21 = pd.read_csv(os.path.join(in_path,"c21", "c21_overview_filtering_statistics_per_block.txt"), sep="\t", header=None, names=column_names)

In [47]:
def auto_QC(df):
    """
    Automated QC for success of variant filtering.

    A row (block) passes only if all three checks hold:
    1. total_target_regions - monoallelic - variants_pre == 0
       (pre-filtering counts are consistent, no truncated files)
    2. variants_pre - total_removed - total_remaining == 0
       (removed and remaining counts are consistent, no truncated files)
    3. ts_tv_ratio_post - ts_tv_ratio_pre >= 0
       (Ts/Tv ratio did not go down after filtering)

    A check that cannot be evaluated because one of its inputs is missing
    (NaN) counts as a failure, so rows with missing trailing fields are
    reported instead of slipping through check 3.

    Side effects:
    - adds the columns delta_variant_pre, delta_filter, delta_ts_tv and
      QC_pass to ``df`` in place
    - prints a summary, and the failing rows if there are any

    Returns:
    the same dataframe object, now carrying the QC columns
    """

    df["delta_variant_pre"] = df["total_target_regions"] - df["monoallelic"] - df["variants_pre"]
    df["delta_filter"] = df["variants_pre"] - df["total_removed"] - df["total_remaining"]
    df["delta_ts_tv"] = df["ts_tv_ratio_post"] - df["ts_tv_ratio_pre"]

    # Phrased as "passes only if ..." rather than "fails if ...": comparisons
    # against NaN are always False, so a missing value fails the row here,
    # whereas `delta_ts_tv < 0` would have let it pass.
    df["QC_pass"] = (
        df["delta_variant_pre"].eq(0)
        & df["delta_filter"].eq(0)
        & df["delta_ts_tv"].ge(0)
    )

    if df["QC_pass"].all():
        print("All blocks pass QC")
    else:
        print("WARNING: THESE BLOCKS DO NOT PASS QC")
        print(df[~df["QC_pass"]])

    return df

# Run the QC over the combined per-block statistics; the frame that comes back
# carries the added delta_* and QC_pass columns.
wes_stats = auto_QC(wes_stats)

All blocks pass QC
