In [370]:
import os
import pandas as pd 

def read_snpeff_txt(file):
    """Load a tab-delimited SnpEff text report into a DataFrame.

    When both of the first two lines of the file contain ``#``, the first
    line is skipped so that the second line is used as the column header.
    Otherwise the file is parsed from its first line.

    Parameters
    ----------
    file : str or path-like
        Path of the report to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Only the first two lines are needed to detect the layout. readline()
    # returns "" at end of file, so a one-line file no longer raises
    # IndexError, and the whole report is not read into memory just for
    # this check.
    with open(file, "r") as txt:
        first_line = txt.readline()
        second_line = txt.readline()

    has_extra_comment_line = ("#" in first_line) and ("#" in second_line)
    if has_extra_comment_line:
        return pd.read_csv(file, delimiter="\t", skiprows=1)
    return pd.read_csv(file, delimiter="\t")

def concat_snpEff_output(path, wildcard=".txt"):
    """Concatenate every matching SnpEff report found under ``path``.

    The directory tree rooted at ``path`` is walked recursively; each file
    whose name contains ``wildcard`` is read with ``read_snpeff_txt`` and the
    resulting tables are stacked row-wise.

    Parameters
    ----------
    path : str
        Directory to search.
    wildcard : str, default ".txt"
        Substring a file name must contain to be read.

    Returns
    -------
    pandas.DataFrame
        All reports stacked together, columns sorted (``sort=True``). The
        per-file row labels are kept in an ``index`` column produced by
        ``reset_index()``; an empty frame with only that column is returned
        when no file matches.
    """
    frames = []
    for root, dirs, files in os.walk(path):
        # Sorting makes the traversal (and therefore the row order)
        # reproducible; os.walk honours in-place changes to ``dirs``.
        dirs.sort()
        for filename in sorted(files):
            if wildcard in filename:
                # Join with ``root`` (the directory currently being walked),
                # not ``path``: os.walk descends into sub-directories, and
                # files found there do not live directly under ``path``.
                frames.append(read_snpeff_txt(os.path.join(root, filename)))

    if not frames:
        # pd.concat rejects an empty list; keep the historical result of
        # calling reset_index() on an empty frame.
        return pd.DataFrame().reset_index()

    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    big_df = pd.concat(frames, sort=True)
    # reset_index() without drop=True keeps the old labels in an "index"
    # column, which the calling cells remove with drop(columns=['index']).
    return big_df.reset_index()
        
def add_tag(dataframe, tag, sample):
    """Label every row of ``dataframe`` with a gene-set tag and a sample name.

    Two columns, ``"Tag"`` and ``"sample"``, are written onto the frame that
    was passed in (it is modified in place), each holding the given value
    repeated once per row. The same frame object is returned.
    """
    n_rows = len(dataframe)
    for column, value in (("Tag", tag), ("sample", sample)):
        dataframe[column] = [value] * n_rows
    return dataframe
    
    

In [373]:
# Show floats as whole numbers with thousands separators.
pd.options.display.float_format = '{:,.0f}'.format

# For every gene set: concatenate the SnpEff reports in its directory, drop
# the "index" column left behind by concat_snpEff_output, then label the rows
# with the gene-set tag and the sample they come from.
core_H3A_snpeff_annotation = add_tag(
    concat_snpEff_output("./core_gene_h3A/").drop(columns=['index']),
    "core", "H3A",
)
extended_H3A_snpeff_annotation = add_tag(
    concat_snpEff_output("./extended_gene_h3A/").drop(columns=['index']),
    "extended", "H3A",
)
core_kg_snpeff_annotation = add_tag(
    concat_snpEff_output("./core_gene_kg/").drop(columns=['index']),
    "core", "KG",
)
extended_kg_snpeff_annotation = add_tag(
    concat_snpEff_output("./extended_gene_kg/").drop(columns=['index']),
    "extended", "KG",
)

# The two non-ADME sets share one tag and are told apart by sample name.
non_adme_set1_snpeff_annotation = add_tag(
    concat_snpEff_output("./non_adme_set1/").drop(columns=['index']),
    "non-ADME", "non-ADME1",
)
non_adme_set2_snpeff_annotation = add_tag(
    concat_snpEff_output("./non_adme_set2/").drop(columns=['index']),
    "non-ADME", "non-ADME2",
)


In [374]:
# NOTE: this cell rebinds the *_snpeff_annotation names to their merged
# versions. Re-running it without first re-running the loading cell merges
# the same columns a second time.


def _read_gene_sizes(bed_path):
    """Return a (GeneId, gene_size) table from a header-less 4-column BED file.

    ``gene_size`` is ``end - start`` of each row.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    # (a warning on recent Python versions); the regex itself is unchanged.
    bed = pd.read_csv(bed_path, delimiter=r"\s", header=None,
                      names=["chr", "start", "end", "GeneId"], engine='python')
    bed["gene_size"] = bed.end - bed.start
    return bed.drop(columns=["chr", "start", "end"])


def _add_gene_lengths(annotation, gene_sizes, gene_ids, exon_lengths):
    """Merge gene size, then the gene-id table, then the exon-length table.

    All three merges use pandas' default inner join, so rows without a match
    in any of the lookup tables are dropped.
    """
    merged = pd.merge(annotation, gene_sizes, on='GeneId')
    # presumably gene_ids supplies the 'ensemble_id' column used by the last
    # merge -- verify against ensebl_genename.csv
    merged = pd.merge(merged, gene_ids, on='GeneId')
    return pd.merge(merged, exon_lengths, on='ensemble_id')


# generating gene length
bed_core_adme = _read_gene_sizes("./core.bed")
bed_extended_adme = _read_gene_sizes("./extended.bed")
bed_all_genes = _read_gene_sizes("./glist-hg19.bed")

# exon length per gene, and the table linking GeneId to the Ensembl ID
read_exons = pd.read_csv("./geneLength2.txt", delimiter=r"\s", engine='python')
geneList = pd.read_csv("./ensebl_genename.csv", engine='python')

# merging gene length, Ensembl ID and exon length into every annotation table
core_H3A_snpeff_annotation = _add_gene_lengths(
    core_H3A_snpeff_annotation, bed_core_adme, geneList, read_exons)
extended_H3A_snpeff_annotation = _add_gene_lengths(
    extended_H3A_snpeff_annotation, bed_extended_adme, geneList, read_exons)
core_kg_snpeff_annotation = _add_gene_lengths(
    core_kg_snpeff_annotation, bed_core_adme, geneList, read_exons)
extended_kg_snpeff_annotation = _add_gene_lengths(
    extended_kg_snpeff_annotation, bed_extended_adme, geneList, read_exons)
non_adme_set1_snpeff_annotation = _add_gene_lengths(
    non_adme_set1_snpeff_annotation, bed_all_genes, geneList, read_exons)
non_adme_set2_snpeff_annotation = _add_gene_lengths(
    non_adme_set2_snpeff_annotation, bed_all_genes, geneList, read_exons)

# Join all the dataframes together into one big table. A single pd.concat
# replaces DataFrame.append in a loop (append was removed in pandas 2.0);
# ignore_index=True gives the same fresh 0..n-1 index that
# reset_index(drop=True) produced.
joint_df = pd.concat(
    [core_H3A_snpeff_annotation, extended_H3A_snpeff_annotation, core_kg_snpeff_annotation,
     extended_kg_snpeff_annotation, non_adme_set1_snpeff_annotation, non_adme_set2_snpeff_annotation],
    sort=True,
    ignore_index=True,
)

# remove duplicate rows and fill NaN with 0s
joint_df = joint_df.drop_duplicates()
joint_df = joint_df.fillna(0)
joint_df.to_csv("annotation_ADME_bigtable.csv", index=False)

