In [8]:
import pandas as pd
import os

class WGS_GW_features:
    """Genome-wide fragment features computed from one per-fragment TSV file.

    The input table is read without a header and its nine columns are named
    ``chr, start, end, flen, readID, forward_NUC, reverse_NUC, forward_EM,
    reverse_EM``. Each ``*_feature(s)`` method derives one feature table from
    that frame and, where implemented, writes it to ``outputdir`` as
    ``<sampleid>.<feature>.csv``.

    NOTE(review): ``motif_order`` and ``all_4bp_motifs`` are prepared in
    ``__init__`` but are not used by any method of this class as written;
    presumably they are meant for ordering / completing the end-motif table.
    """

    def __init__(self,
                 input_tsv,
                 motif_order_path,
                 outputdir):
        """Load the fragment table and the motif ordering.

        Args:
            input_tsv: Path to the tab-separated, header-less fragment table
                (nine columns, see the class docstring).
            motif_order_path: Path to a CSV file containing a ``motif_order``
                column.
            outputdir: Directory that receives the feature CSV files. It is
                created on first write if it does not exist.
        """
        self.input_tsv = input_tsv
        # Sample id = file name up to its first dot
        # (e.g. "WGShg19.final_output.tsv" -> "WGShg19").
        self.sampleid = os.path.basename(input_tsv).split(".")[0]
        self.maindf = pd.read_csv(input_tsv, sep="\t", header=None)
        self.maindf.columns = [
            "chr", "start", "end", "flen", "readID",
            "forward_NUC", "reverse_NUC", "forward_EM", "reverse_EM",
        ]
        self.motif_order_path = motif_order_path
        self.motif_order = pd.read_csv(motif_order_path)["motif_order"].values
        # All 256 4-mers over A/T/G/C, in A, T, G, C nesting order.
        bases = ["A", "T", "G", "C"]
        self.all_4bp_motifs = [
            f"{i}{j}{k}{l}"
            for i in bases
            for j in bases
            for k in bases
            for l in bases
        ]
        self.outputdir = outputdir

    def _write_feature_csv(self, featuredf, suffix):
        """Write *featuredf* to ``<outputdir>/<sampleid>.<suffix>.csv``."""
        # An empty outputdir means "current directory"; nothing to create then.
        if self.outputdir:
            os.makedirs(self.outputdir, exist_ok=True)
        featuredf.to_csv(
            os.path.join(self.outputdir, f"{self.sampleid}.{suffix}.csv"),
            index=False,
        )

    #####-------------------------------------------------------------#####
    ##### Distribution of fragment lengths
    #####-------------------------------------------------------------#####
    def flen_distribution_features(self,
                                   save_feature = True):
        """Return the distribution of absolute fragment lengths in 50-350 bp.

        Args:
            save_feature: When true, also write the table to
                ``<sampleid>.flen.csv`` in ``outputdir``.

        Returns:
            A DataFrame with one row per size from 50 to 350 (inclusive) and
            columns ``size``, ``freq`` and ``count``; sizes that do not occur
            get 0. ``freq`` is relative to the fragments inside the 50-350
            window. Returns ``None`` when the input table has no rows.
        """
        flendf = self.maindf[["flen"]].copy()
        if flendf.empty:
            return None
        flendf["abs_flen"] = flendf["flen"].abs()

        flen_count = flendf["abs_flen"].value_counts().reset_index()
        flen_count.columns = ["size", "count"]
        ##### keep only fragments that are between 50 and 350 bp
        in_window = (flen_count["size"] >= 50) & (flen_count["size"] <= 350)
        # .copy() so that adding "freq" below does not write into a slice.
        flen_count = flen_count[in_window].copy()
        flen_count["freq"] = flen_count["count"] / flen_count["count"].sum()
        flen_count = flen_count.sort_values("size")

        # Left-merge onto the full size grid so every size has a row.
        output_flendf = pd.DataFrame({"size": range(50, 351)})
        output_flendf = output_flendf.merge(flen_count, on="size", how="left").fillna(0)
        output_flendf = output_flendf[["size", "freq", "count"]]
        if save_feature:
            self._write_feature_csv(output_flendf, "flen")
        return output_flendf

    #####-------------------------------------------------------------#####
    ##### 4bp end motif
    #####-------------------------------------------------------------#####
    def generate_em_feature(self,
                            save_feature = True):
        """Return the frequency of each end motif over both fragment ends.

        Forward and reverse motifs are pooled, upper-cased, and motifs that
        contain ``N`` are discarded before frequencies are computed.

        Args:
            save_feature: When true, also write the table to
                ``<sampleid>.EM.csv`` in ``outputdir``.

        Returns:
            A DataFrame with columns ``motif`` and ``freq``, or ``None`` when
            there is no motif at all in the input.
        """
        # Concatenate the two *Series*: concatenating the single-column
        # DataFrames would union their differently named columns and yield
        # two half-empty columns instead of one pooled column.
        motifs = pd.concat(
            [self.maindf["forward_EM"], self.maindf["reverse_EM"]],
            axis=0,
            ignore_index=True,
        )
        # Drop missing values and force str so .str works even when a column
        # was parsed as all-NaN floats.
        motifs = motifs.dropna().astype(str).str.upper()
        output_emdf = (
            motifs.value_counts().rename_axis("motif").reset_index(name="count")
        )
        if output_emdf.empty:
            return None
        output_emdf = output_emdf[~output_emdf["motif"].str.contains("N")].copy()
        output_emdf["freq"] = output_emdf["count"] / output_emdf["count"].sum()
        output_emdf = output_emdf[["motif", "freq"]]
        if save_feature:
            self._write_feature_csv(output_emdf, "EM")
        return output_emdf

    #####-------------------------------------------------------------#####
    ##### distribution of distance read-to-nearest nucleosome
    #####-------------------------------------------------------------#####
    def generate_nuc_feature(self,
                            save_feature):
        """Return the pooled ``reverse_NUC`` and ``forward_NUC`` values.

        Args:
            save_feature: Accepted for parity with the other feature methods;
                currently unused, nothing is written to disk.
                TODO(review): no distribution is computed or saved here yet.

        Returns:
            A single-column DataFrame named ``value`` holding all
            ``reverse_NUC`` values followed by all ``forward_NUC`` values.
        """
        # Series (not one-column DataFrames) so the two differently named
        # columns stack into one column.
        nuc_values = pd.concat(
            [self.maindf["reverse_NUC"], self.maindf["forward_NUC"]],
            axis=0,
            ignore_index=True,
        )
        return nuc_values.to_frame(name="value")

In [9]:
# Inputs for one sample: where feature files go, the per-fragment TSV to read,
# and the motif-order table.
outputdir = "/Users/hieunguyen/src/ecd_wgs_features/output"
input_tsv = "/Users/hieunguyen/src/ecd_wgs_features/output/WGShg19.final_output.tsv"
# NOTE(review): this path carries no file name, yet WGS_GW_features passes it
# to pd.read_csv and reads a "motif_order" column from it — confirm it should
# not point at a CSV file instead.
motif_order_path = "/Users/hieunguyen/src/ecd_wgs_features"

# Build the feature extractor; the constructor loads both input files.
output_obj = WGS_GW_features(input_tsv, motif_order_path, outputdir)
