In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import t
import numpy as np
import os
import itertools
from matplotlib import pyplot as plt
from tqdm.notebook import trange, tqdm
from datetime import datetime
import subprocess
import logging
import shutil

# bedtools
from pybedtools import BedTool

# pysam
import pysam

# graphical utilities
import pygwalker as pyg
import plotly.express as px
import plotly.offline as pyo


In [9]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [10]:
# Function to find and count intersection with a window
def find_intersection(bed_files, window=1000):
    bedtools_objs = [BedTool(bed_file) for bed_file in bed_files]
    intersection = bedtools_objs[0]
    for bedtool in bedtools_objs[1:]:
        intersection = intersection.intersect(bedtool, wa=True, wb=True, e=True, f=0.9, F=0.1)
    #intersection.saveas('intersection.bed')
    return intersection

# Function to find and count union with a window
def find_union(bed_files):
    bedtools_objs = [BedTool(bed_file) for bed_file in bed_files]
    union = bedtools_objs[0]
    for bedtool in bedtools_objs[1:]:
        union = union.cat(bedtool, postmerge=False)
    union = union.sort().merge()
    #union.saveas('union.bed')
    return union

# Function to find and count unique sets with a window
def find_unique(bed_file, other_bed_files, index, window=1000):
    bedtool = BedTool(bed_file)
    other_bedtools = [BedTool(bed_file) for bed_file in other_bed_files]
    unique = bedtool
    for other_bedtool in other_bedtools:
        unique = unique.subtract(other_bedtool, A=True, s=True, r=True, f=window/1000)
    #unique.saveas(f'unique_{index}.bed')
    return unique


# Function to find intersections with a window
def find_intersection_with_window(bed_files, l=300, r=20000):
    bedtools_objs = [BedTool(bed_file) for bed_file in bed_files]
    intersection = bedtools_objs[0]
    for bedtool in bedtools_objs[1:]:
        intersection = intersection.window(b=bedtool, l=l, r=r)
    intersection.saveas('intersection_with_window.bed')
    return len(intersection)

# Function to find union
def find_union(bed_files):
    bedtools_objs = [BedTool(bed_file) for bed_file in bed_files]
    union = bedtools_objs[0]
    for bedtool in bedtools_objs[1:]:
        union = union.cat(bedtool, postmerge=False)
    union = union.sort().merge()
    
    #union.saveas('union.bed')
    return union

# Function to find unique sets with a window
def find_unique_with_window(bed_file, other_bed_files, index, l=300, r=20000):
    bedtool = BedTool(bed_file)
    other_bedtools = [BedTool(bed_file) for bed_file in other_bed_files]
    unique = bedtool
    for other_bedtool in other_bedtools:
        unique = unique.window(b=other_bedtool, l=l, r=r, v=True)
    unique.saveas(f'unique_with_window_{index}.bed')
    return len(unique)

# # Paths to your BED files
# bed_files = ['file1.bed', 'file2.bed', 'file3.bed']

# # Finding the intersection of all BED files and counting intervals with a window
# intersection_count = find_intersection_with_window(bed_files, l=300, r=20000)
# print(f'Number of intervals in intersection with window: {intersection_count}')

# # Finding the union of all BED files and counting intervals
# union_count = find_union(bed_files)
# print(f'Number of intervals in union: {union_count}')

# # Finding unique sets for each file and counting intervals with a window
# for i, bed_file in enumerate(bed_files):
#     other_bed_files = [f for j, f in enumerate(bed_files) if j != i]
#     unique_count = find_unique_with_window(bed_file, other_bed_files, i + 1, l=300, r=20000)
#     print(f'Number of unique intervals in file {i+1} with window: {unique_count}')

In [11]:
def LoadRetroSomResults(directory, te_type):
    

    # List all files in the directory
    files = os.listdir(directory)
        
    # Filter files with '.svg' extension
    svg_files = [file for file in files if file.endswith('.svg')]
    
    column_names = ["strand","family","chrom","position"]
    result_df = pd.DataFrame(columns=column_names)
    
    # Divide filenames by "_" and check if they start with "strand*"
    relevant_files = []
    for filename in svg_files:
        filename = filename[filename.find('strand'):]
        
        parts = filename.split('_')
        
        
        if parts[0] == "strand1":
            parts[0] = "-"
        else:
            parts[0] = "+"
        
        parts[len(parts)-1] = parts[len(parts)-1].replace('.svg','')
        row_df = pd.DataFrame([parts], columns=column_names)      
        result_df = pd.concat([result_df,row_df], ignore_index=True)
        relevant_files.append(filename)

    
    def extract_te_type(family):
        if family.startswith('Alu'):
            return 'Alu'
        elif family.startswith('L1'):
            return 'L1'
        else:
            return None
    
    result_df['class'] = result_df['family'].apply(extract_te_type)    

    result_df = result_df.reindex(columns = [column_names[2],column_names[3],column_names[0],'class',column_names[1]])
    
    result_df = result_df[result_df['class'].str.upper().str.contains(te_type.upper())]

    return result_df 
# # Specify the directory containing the files
# directory = '../results/RetroSom/shortread/mosaic/mixedDataRetroSom/200x/v2/TITR_1/visual'

# # Retrieve relevant SVG files
# df = filter_svg_files(directory, "l1")

# df


def convertResultToBedFile(df, saveFileNameWithDir):
    temp = df
    temp['position'] = temp['position'].astype(int)
    temp['position2'] = temp['position'] +1 
    temp['name'] = '.'
    temp['score'] = None

    temp = temp
    temp = temp[['chrom','position','position2','class','score','strand','family']]

    temp.to_csv(saveFileNameWithDir, sep='\t', index=False, header=False)
    
#    return temp

def count_directories_with_string(directory, search_string):
    count = 0
    
    if not checkFileExist(directory): return count
    
    for filename in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, filename)) and search_string in filename and "NoModel" not in filename:
            count += 1
    return count


def loadVCF(file_path):
    vcf = pysam.VariantFile(file_path)
    print(file_path,countVCF(vcf))
    return vcf

def countVCF(vcf):
    return sum(1 for _ in vcf)

def filterVCF(vcf, filter_condition=None):
    filtered_variants = []
    for record in vcf.fetch():
        if record.filter.keys() == [filter_condition]:
            filtered_variants.append(record)
    return filtered_variants

def VCFtoBED(variants):
    column_names = ['chrom','position','position2','class','score','strand','family']
    bed_records = []
    for variant in variants:
        chrom = variant.chrom
        start = variant.pos - 1  # BED format uses 0-based start
        end = variant.pos + len(variant.ref) - 1
        name =  variant.alts[0].replace("<INS:ME:", "").replace(">", "").capitalize() if variant.alts else "."
        if name == "Line1":
            name = "L1"
        
        score = "0"  # BED score is optional, here we set it to 0
        strand = "."  # Strand information is optional and not present in VCF, so set to '.'
        bed_records.append([chrom, start, end, name, score, strand, "."])

    return pd.DataFrame(bed_records, columns=column_names)

def convertTEClassString(inputClass):
    if inputClass == "L1":
        return "LINE1"
    else:
        return inputClass.upper()

def checkFileExist(fileNameWithPath):    
    if os.path.exists(fileNameWithPath):
        return True
    else: 
        print(fileNameWithPath, " is not existing")
        return False
    
    
def getResultFunctionForTool(toolName,resultParentDirectory,caseSampleName,teClass):
    if toolName.strip().lower() in ["xtea-mosaic", "xtea"] :
        fileNameWithPath = os.path.join(resultParentDirectory,caseSampleName,teClass,caseSampleName+"_"+convertTEClassString(teClass)+".vcf")
        if not checkFileExist(fileNameWithPath): return None
        vcf = loadVCF(fileNameWithPath)
        filtered_vcf = filterVCF(vcf,"PASS")
        tempResultDF = VCFtoBED(filtered_vcf)
    elif toolName.strip().lower() == "melt":
        fileNameWithPath = os.path.join(resultParentDirectory,caseSampleName,convertTEClassString(teClass)+".final_comp.vcf")
        if not checkFileExist(fileNameWithPath): return None
        vcf = loadVCF(fileNameWithPath)
        filtered_vcf = filterVCF(vcf,"PASS")
        tempResultDF = VCFtoBED(filtered_vcf)
    elif toolName.strip().lower() == "retrosom":
        resultDirectory = caseSampleName + "_NoModel"
        visualDirectory = "visual"
        count = count_directories_with_string(resultParentDirectory,caseSampleName)
        if count <= 1:
            resultDirectory = os.path.join(resultParentDirectory, caseSampleName)
        else:
            resultDirectory = os.path.join(resultParentDirectory, resultDirectory)
        
        tempResultDF = LoadRetroSomResults(os.path.join(resultDirectory,visualDirectory) , teClass)
    else:
        print(toolName + "is not available for the function")
        return None
    
    return tempResultDF

def saveResultDFtoBedFileforTool(toolName, resultParentDirectory, resultFileName, resultDF):
    if toolName.strip().lower() in ["xtea-mosaic", "xtea", "melt"] :
        resultDF.to_csv(os.path.join(resultParentDirectory, resultFileName), sep='\t', index=False, header=False)
    elif toolName.strip().lower() == "retrosom":
        convertResultToBedFile(resultDF, os.path.join(resultParentDirectory, resultFileName))

    else: 
        print(toolName + "is not available for the function")
        return None
    
def getUniqueWithWindowFromBeds (bed, other_beds, l=100, r=100):
    unique = bed
    for other_bed in other_beds:
        unique = unique.window(b=other_bed, l=l, r=r, v=True)
    unique = unique.sort()
    
    return unique

def getUnionBed(beds):
    union = beds[0]
    #print(len(union))
    for bed in beds[1:]:
        union = union.cat(bed, postmerge=False)
        #print(len(union))
    union = union.sort().merge()
    
    return union

def getIntersectBed(beds, l=100, r=100):
    intersection = beds[0]
    for bed in beds[1:]:
        intersection = intersection.window(b=bed, l=l, r=r)
    return intersection

def getResultDF(toolName, teClass, depth, vaf, windowSize, resultBedDF, goldStandardDF, detailedBedFileOutputDir = "results/resultComparisons"):
    resultColumns = ['toolName','teClass','depth', 'vaf', 'windowSize','totalEvents','TP','FP', 'FN','precision','recall']
    resultdDF = pd.DataFrame(columns= resultColumns)


    # Load Bed Files : This would be loop for all the results
    caseBedFile = resultBedDF[(resultBedDF['toolName'] == toolName) &
                        (resultBedDF['depth'] == depth) &
                        (resultBedDF['class'] == teClass)]['filePath'].tolist()[0]
    caseBed = BedTool(caseBedFile)

    controlBedFile = goldStandardDF[(goldStandardDF['class'] == teClass) &
                                (goldStandardDF['backBone'] == True)]['filePath'].tolist()[0]
    controlBed = BedTool(controlBedFile)

    answerBedFiles = goldStandardDF[(goldStandardDF['class'] == teClass) &
                                (goldStandardDF['backBone'] == False)]['filePath'].tolist()
    answerBeds = [BedTool(bed_file) for bed_file in answerBedFiles]


    # False Positive
    # (case) - (control) - Union (GoldStandard)
    FP_bed = getUniqueWithWindowFromBeds(caseBed, [controlBed, getUnionBed(answerBeds)],windowSize,windowSize)
    
    # True Positive
    # (1) Unique MEI
    targetAnswerBedFiles = goldStandardDF[(goldStandardDF['class'] == teClass) & 
                                    (goldStandardDF['backBone'] == False) &
                                    (goldStandardDF['mixedRatio'] == vaf)]['filePath'].tolist()

    targetControlBedFiles = [item for item in answerBedFiles if item not in targetAnswerBedFiles]
    targetControlBedFiles.append(controlBedFile)

    if len(targetAnswerBedFiles) > 1 :
        targetAnswerBed = getUnionBed([BedTool(bed_file) for bed_file in targetAnswerBedFiles])
    else:
        targetAnswerBed = BedTool(targetAnswerBedFiles[0])

    targetControlBed = [BedTool(bed_file) for bed_file in targetControlBedFiles]
    uniqueBed = getUniqueWithWindowFromBeds(targetAnswerBed,[getUnionBed(targetControlBed)], windowSize, windowSize)

    # (2) Get True Positive
    TP_bed = getIntersectBed([getUniqueWithWindowFromBeds(caseBed, [controlBed], windowSize, windowSize), uniqueBed],
                            windowSize, windowSize)

    # False Negative
    FN_bed = getUniqueWithWindowFromBeds(caseBed, [TP_bed], windowSize, windowSize)
    
    
    # Recalcuations (ingore the previous codes for FN and FP), 6/12
    FP = len(caseBed) - len(TP_bed)
    FN = len(uniqueBed) - len(TP_bed)
    
    ## Save Bed files
    savefileName = toolName+"-"+teClass+"-"+depth+"-"+str(vaf)+"-"+str(windowSize)
    os.makedirs(detailedBedFileOutputDir, exist_ok=True)
    
    if len(TP_bed) > 0 : TP_bed.saveas(os.path.join(detailedBedFileOutputDir,savefileName+"-TP.bed"))
    
    FP_bed = getUniqueWithWindowFromBeds(caseBed, [TP_bed], windowSize, windowSize)
    if len(FP_bed) > 0 : FP_bed.saveas(os.path.join(detailedBedFileOutputDir,savefileName+"-FP.bed"))
    
    FN_bed = getUniqueWithWindowFromBeds(uniqueBed, [TP_bed], windowSize, windowSize)
    if len(FN_bed) > 0 : FN_bed.saveas(os.path.join(detailedBedFileOutputDir,savefileName+"-FN.bed"))
    
    

    # Precision and Recall
    #precision = len(TP_bed) / (len(TP_bed) + len(FP_bed))
    precision = len(TP_bed) / (len(TP_bed) + FP)
    #recall = len(TP_bed) / (len(TP_bed) + len(FN_bed))
    recall = len(TP_bed) / (len(TP_bed) + FN)
    
    # total results
    #resultdDF.loc[len(resultdDF)] = [toolName, teClass, depth, vaf, windowSize, len(caseBed), len(TP_bed), len(FP_bed), len(FN_bed), precision, recall]
    resultdDF.loc[len(resultdDF)] = [toolName, teClass, depth, vaf, windowSize, len(caseBed), len(TP_bed), FP, FN, precision, recall]
    
    # need to save each results
    
    return resultdDF
    
def checkRowCount(df, toolName, depth, TEclass):
    return len(df[(df['toolName'] == toolName) & (df['depth'] == depth) & (df['class'] == TEclass)])

def runCommand(command, outputFileWithPath = None):
    
    if isinstance(command, list):
        command = ' '.join(command) 
    
    try:
        if (outputFileWithPath is not None):
            with open(outputFileWithPath, 'w') as output_file:
                logging.info(f"Running command: {command}")
                subprocess.run(command, shell=True, check=True, stdout=output_file)
                logging.info(f"Successfully created: {outputFileWithPath}")
        else:
            logging.info(f"Running command: {command}")
            subprocess.run(command, shell=True, check=True)
            logging.info(f"Successfully executed: {outputFileWithPath}")
    except subprocess.CalledProcessError as e:
        logging.error(f"Error running command: {command}")
        logging.error(f"Return code: {e.returncode}")
        logging.error(f"Output: {e.output}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
    

def runBedToolsSlop(bedFileWithPath, chromSizeFileWithPath, resultBedFileWithPath, windowSize):
    command = (
        f"bedtools slop -i {bedFileWithPath} -g {chromSizeFileWithPath} -b {windowSize} "
        "| sort -k1,1V -k1,1 -k2,2n"
    )
    
    try:
        runCommand(command,resultBedFileWithPath)
        return True
    except Exception as e:
        return False    


def runInterveneVenn(bedFiles, title, outputDirectory, bedtools_options='f=0.5,r'): # ,r -l 1000 -r 1000'):
    command = (
        f"intervene venn -i {bedFiles} --bedtools-options {bedtools_options} --filenames --title {title} "
        f"--output {outputDirectory} --save-overlaps"
    )
    
    try:
        runCommand(command)
        return True
    except Exception as e:
        return False    
    
def runInterveneUpset(bedFiles, outputDirectory, bedtools_options='f=0.5,r'): # ,r -l 1000 -r 1000'):
    
    command = (
        f"intervene upset -i {bedFiles} --bedtools-options {bedtools_options} --filenames "
        f"--output {outputDirectory} --save-overlaps --showzero --showsize --showshiny"
    ) 
    
    try:
        runCommand(command, "shiny.fig.code")
        return True
    except Exception as e:
        return False    


def removeRLibDir(removalLibPath = "/home/jp394/R-3.5.1/library"):
    command = (
        f"Rscript -e '.libPaths(.libPaths()[.libPaths() != \"{removalLibPath}\"]); print(.libPaths())'"
    )
    
    try:
        runCommand(command)
        return True
    except Exception as e:
        return False        

def generateComparisonFigures(resultBedDF,toolNames,depths,windows,outputDirectory,teClass,resultTitle,
                              chromSizeFile = "/n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes",
                              vennOutputDir="venn",
                              upsetOutputDir="upset",
                              prefix="E",
                              removalLibPath = None):
    
    ## WindowFileGeneration
    selectedResults = resultBedDF[(resultBedDF['toolName'].isin(toolNames)) & ((resultBedDF['class'] == "L1")) & ((resultBedDF['depth'].isin(depths)))]

    selectedResults['windowSize'] = windows
    bedFiles=[]

    if os.path.exists(outputDirectory):
        
        logging.info(f"Directory exists: {outputDirectory}, removing it.")
        shutil.rmtree(outputDirectory, ignore_errors=True)

    os.makedirs(outputDirectory, exist_ok=True)
            

    resultComparisonColumns = ["toolName", "depth", "class", "windowSize", "extendedFileName", "extendedFileWithPath"]
    resultComparisonDF = pd.DataFrame(columns = resultComparisonColumns)

    # 1. Generate extended bed files
    for index, row in selectedResults.iterrows():
        directory_path = os.path.dirname(row['filePath'])
        resultFileName = row['toolName']+"_"+row['depth']+"_"+row['class']+"_"+prefix+str(row['windowSize'])+".bed"
        
        resultBedFileWithPath = os.path.join(outputDirectory, resultFileName)
        
        
        runBedToolsSlop(row['filePath'], chromSizeFile, resultBedFileWithPath, windows)
        resultComparisonDF.loc[len(resultComparisonDF)] = [toolName, depth, teClass, filePath, resultFileName, resultBedFileWithPath]

    resultBedFileList = resultComparisonDF['extendedFileName'].tolist()
    inputBedFiles = ' '.join(resultBedFileList)
    currentDir = os.getcwd()
    os.chdir(outputDirectory)

    # 2. Generate Venn Diagram
    if len(resultBedFileList) <= 6:    
        runInterveneVenn(inputBedFiles, resultTitle, vennOutputDir, bedtools_options='f=0.5,r')
    else:
        logging.error(f"Cannot create venn diagram because total number of bed files is {len(resultBedFileList)}")

    # 3. Generate Upset figure
    #runCommand("Rscript -e 'system(\"defaults write org.R-project.R force.LANG en_US.UTF-8\")'")
    if removalLibPath is not None:
        removeRLibDir(removalLibPath)

    runInterveneUpset(inputBedFiles, upsetOutputDir, bedtools_options='f=0.5,r')

    os.chdir(currentDir)


# 1. Configuration for performance check

In [12]:
sampleDepths = ["50x", "100x", "200x", "200x_UW", "200x_UW_Corrine", "200x_UW_HK", 
                "200x_NYGC", "200x_NYGC_Corrine", "200x_NYGC_HK",
                "300x", "400x", "400x_Corrine", "500x_WashU"]
teClasses = ["Alu", "L1"]
backBoneCellLineName = "HG005"
caseSampleName = "HapMapMix"

toolAndResultDirectory = {
    "RetroSom": "/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2",
    "RetroSom-v3": "/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v3",
    "xTea-mosaic": "/n/data1/bch/genetics/lee/projects/SMaHT/results/xTea/shortread/mosaic/HapMap",
    "xTea": "/n/data1/bch/genetics/lee/projects/SMaHT/results/xTea/shortread/germline/HapMap",
    "MELT": "/n/data1/bch/genetics/lee/projects/SMaHT/results/MELT/shortread/germline/HapMap"
}

goldStandardDirectory = "/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix"
goldStandardPrefix= "tier1"

cellLinesAndMixtureRatio = {
    "HG00438":0.5,
    "HG002":2,
    "HG02257":2,
    "HG02486":2,
    "HG02622":10,
    "HG005": 83.5
}


resultExcelFileName = "hapmapmix"
resultExcelPath="results/resultComparisons/HapMapMix"

current_date = datetime.now()
formatted_date = current_date.strftime('%m%d%Y')
bedFileResultSaveDirectory = "results/resultComparisons/HapMapMix/"+formatted_date+"-resultBedFiles"

# 2. Convert results to bed files under the depth

In [13]:
resultFile_description = "Generating result files"

resultFileCombinations = itertools.product(
        toolAndResultDirectory.keys(),
        teClasses,
        sampleDepths)

resultFile_progress_bar = tqdm(resultFileCombinations, desc=resultFile_description)

for toolName, teClass, sampleDepth in resultFile_progress_bar:
    
    tempResultDF = []
    resultFile_progress_bar.set_description(f"{resultFile_description}-{toolName} {teClass} {sampleDepth}")

    
    resultParentDirectory = os.path.join(toolAndResultDirectory[toolName],sampleDepth)
    count = count_directories_with_string(resultParentDirectory,caseSampleName)
    
    if count == 0: continue
    
    resultFileName = teClass+".bed"
    resultFileNameWithPath = os.path.join(resultParentDirectory, resultFileName)
    if os.path.exists(resultFileNameWithPath):
        print(resultFileNameWithPath + " is existing")
        continue
    else:
        print(resultFileNameWithPath + " is not exist. Generate the result.")
        #print(os.path.join(resultParentDirectory,caseSampleName,teClass,caseSampleName+"_"+convertTEClassString(teClass)+".vcf"))
        tempResultDF = getResultFunctionForTool(toolName,resultParentDirectory,caseSampleName,teClass)
    
    if tempResultDF is None:
        continue
    elif len(tempResultDF) >= 1 :
        print("result count: " + str(len(tempResultDF)) )
        saveResultDFtoBedFileforTool(toolName,resultParentDirectory, resultFileName, tempResultDF)
        
    else:
        print("no rows from the result file, no save")

Generating result files: 0it [00:00, ?it/s]

/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/50x/Alu.bed is existing
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/100x/Alu.bed is not exist. Generate the result.
no rows from the result file, no save
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x/Alu.bed is not exist. Generate the result.
no rows from the result file, no save
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x_UW/Alu.bed is not exist. Generate the result.
no rows from the result file, no save
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x_UW_Corrine  is not existing
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x_NYGC/Alu.bed is not exist. Generate the result.
no rows from the result file, no save
/n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap

# 3. Load Files and calculate statistics of results

In [14]:
## Load Gold Standard Sets
goldStandardColumns = ['cellLineName','mixedRatio','class', 'filePath', 'backBone', 'count']
goldStandardDF = pd.DataFrame(columns= goldStandardColumns)

for cellLine, teClass in itertools.product(cellLinesAndMixtureRatio.items(), teClasses):
    # Extract cellLineName and mixtureRatio from the tuple
    cellLineName, mixtureRatio = cellLine
    filePath=os.path.join(goldStandardDirectory,teClass,cellLineName+"-"+goldStandardPrefix+".bed")
    
    if not os.path.exists(filePath):
        print(f"The file '{filePath}' does not exist. Stopping the script.")
        # Stop the script by exiting or raising an exception
        exit
    
    backBone = False
    if cellLineName == backBoneCellLineName:
        backBone = True            
            
    print([cellLineName, mixtureRatio, teClass, filePath, backBone, len(BedTool(filePath))])
    goldStandardDF.loc[len(goldStandardDF)] = [cellLineName, mixtureRatio, teClass, filePath, backBone, len(BedTool(filePath))]


## Load Tool Files per each depth

resultBedColumns = ['toolName','depth','class', 'filePath', 'count']
resultBedDF = pd.DataFrame(columns= resultBedColumns)

for toolInfo, depth, teClass in itertools.product(toolAndResultDirectory.items(), sampleDepths, teClasses):
    toolName, resultDir = toolInfo
            
    filePath = os.path.join(resultDir,depth,teClass+".bed")
    if not os.path.exists(filePath):
        print(f"The file '{filePath}' does not exist. Continue to the next.")
        # Stop the script by exiting or raising an exception
        continue
        
    print([toolName, depth, teClass, filePath,len(BedTool(filePath))])
    
    resultBedDF.loc[len(resultBedDF)] = [toolName, depth, teClass, filePath, len(BedTool(filePath))]


vafs = list(set(goldStandardDF[goldStandardDF['backBone'] == False]['mixedRatio'].tolist()))

['HG00438', 0.5, 'Alu', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/Alu/HG00438-tier1.bed', False, 1220]
['HG00438', 0.5, 'L1', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/L1/HG00438-tier1.bed', False, 267]
['HG002', 2, 'Alu', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/Alu/HG002-tier1.bed', False, 1188]
['HG002', 2, 'L1', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/L1/HG002-tier1.bed', False, 233]
['HG02257', 2, 'Alu', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/Alu/HG02257-tier1.bed', False, 1520]
['HG02257', 2, 'L1', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/L1/HG02257-tier1.bed', False, 286]
['HG02486', 2, 'Alu', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/Alu/HG02486-tier1.bed', False, 1458]
['HG02486', 2, 'L1', '/n/data1/bch/genetics/lee/projects/SMaHT/gold_standards/HapMapMix/L1/HG02486-tier1.bed', False, 333]
['HG0262

In [15]:
goldStandardDF

Unnamed: 0,cellLineName,mixedRatio,class,filePath,backBone,count
0,HG00438,0.5,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,1220
1,HG00438,0.5,L1,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,267
2,HG002,2.0,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,1188
3,HG002,2.0,L1,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,233
4,HG02257,2.0,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,1520
5,HG02257,2.0,L1,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,286
6,HG02486,2.0,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,1458
7,HG02486,2.0,L1,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,333
8,HG02622,10.0,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,1539
9,HG02622,10.0,L1,/n/data1/bch/genetics/lee/projects/SMaHT/gold_...,False,320


In [16]:
resultBedDF

Unnamed: 0,toolName,depth,class,filePath,count
0,RetroSom,50x,Alu,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,263
1,RetroSom,50x,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,68
2,RetroSom,100x,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,64
3,RetroSom,200x,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,51
4,RetroSom,200x_UW,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,60
5,RetroSom,200x_UW_HK,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,67
6,RetroSom,200x_NYGC,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,117
7,RetroSom,200x_NYGC_HK,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,118
8,RetroSom,300x,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,75
9,RetroSom,400x,L1,/n/data1/bch/genetics/lee/projects/SMaHT/resul...,102


In [17]:
vafs

[0.5, 2.0, 10.0]

In [18]:
toolComparisonDepths = ["50x", "100x", "200x", "300x"]
tools = ['RetroSom', 'xTea', 'xTea-mosaic', 'MELT']

for targetDepth in toolComparisonDepths:
    
    if targetDepth == "300x":
        tools.remove("MELT")    
    
    generateComparisonFigures(resultBedDF,             
                          tools,
                          [targetDepth],
                          1000,
                          "results/resultComparisons/HapMapMix/tool_Comparison_"+targetDepth,
                          'L1',
                          "Tool_Compared-"+targetDepth)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 18:23:57,314 - INFO - Directory exists: results/resultComparisons/HapMapMix/tool_Comparison_50x, removing it.
2024-06-17 18:23:57,517 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/50x/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes -b 1000 | sort -k1,1V -k1,1 -k2,2n
2024-06-17 18:23:57,568 - INFO - Successfully created: results/resultComparisons/HapMapMix/tool_Comparison_50x/RetroSom_50x_L1_E1000.bed
2024-06-17 18:23:57,581 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/xTea/shortread/mosaic/HapMap/50x/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes 


Generating a 4-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 18:24:09,078 - INFO - Successfully executed: None
2024-06-17 18:24:09,083 - INFO - Running command: intervene upset -i RetroSom_50x_L1_E1000.bed xTea-mosaic_50x_L1_E1000.bed xTea_50x_L1_E1000.bed MELT_50x_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 18:24:18,709 - INFO - Successfully created: shiny.fig.code


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 18:24:18,714 - INFO - Directory ex


Generating a 4-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 18:24:28,271 - INFO - Successfully executed: None
2024-06-17 18:24:28,285 - INFO - Running command: intervene upset -i RetroSom_100x_L1_E1000.bed xTea-mosaic_100x_L1_E1000.bed xTea_100x_L1_E1000.bed MELT_100x_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 18:24:38,094 - INFO - Successfully created: shiny.fig.code


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 18:24:38,098 - INFO - Director


Generating a 3-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 18:24:45,528 - INFO - Successfully executed: None
2024-06-17 18:24:45,538 - INFO - Running command: intervene upset -i RetroSom_200x_L1_E1000.bed xTea-mosaic_200x_L1_E1000.bed xTea_200x_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 18:24:53,755 - INFO - Successfully created: shiny.fig.code


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 18:24:53,762 - INFO - Directory exists: results/resul


Generating a 3-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 18:25:00,663 - INFO - Successfully executed: None
2024-06-17 18:25:00,676 - INFO - Running command: intervene upset -i RetroSom_300x_L1_E1000.bed xTea-mosaic_300x_L1_E1000.bed xTea_300x_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 18:25:11,727 - INFO - Successfully created: shiny.fig.code


In [82]:
# RetroSomComaprison comparison 

generateComparisonFigures(resultBedDF,             
                          ['RetroSom', 'RetroSom-v3'],
                          ['200x_UW', '200x_UW_HK', '200x_NYGC', '200x_NYGC_HK'],
                          1000,
                          "results/resultComparisons/HapMapMix/retroSom_HK_Comparison",
                          'L1',
                          "HK_Compared")





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 16:19:19,837 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x_UW/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes -b 1000 | sort -k1,1V -k1,1 -k2,2n
2024-06-17 16:19:19,904 - INFO - Successfully created: results/resultComparisons/HapMapMix/retroSom_HK_Comparison/RetroSom_200x_UW_L1_E1000.bed
2024-06-17 16:19:19,922 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/RetroSom/shortread/mosaic/HapMap/v2/200x_UW_HK/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes -b 1000 | sort -k1,1V -k1,1 -k2,2n
2024-06-17 16:19:19,990 - INFO - Successfully created: result


Generating a 6-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 16:19:35,998 - INFO - Successfully executed: None
2024-06-17 16:19:36,004 - INFO - Running command: intervene upset -i RetroSom_200x_UW_L1_E1000.bed RetroSom_200x_UW_HK_L1_E1000.bed RetroSom_200x_NYGC_L1_E1000.bed RetroSom_200x_NYGC_HK_L1_E1000.bed RetroSom-v3_200x_UW_HK_L1_E1000.bed RetroSom-v3_200x_NYGC_HK_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 16:19:49,122 - INFO - Successfully created: shiny.fig.code


In [81]:
# CorrineComparison

generateComparisonFigures(resultBedDF,             
                          ['xTea-mosaic'],
                          ['200x_UW', '200x_UW_Corrine', '200x_NYGC', '200x_NYGC_Corrine'],
                          1000,
                          "results/resultComparisons/HapMapMix/xTea_Corrine_Comparison",
                          'L1',
                          "Corrine_Compared")
                        



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2024-06-17 16:16:49,579 - INFO - Directory exists: results/resultComparisons/HapMapMix/xTea_Corrine_Comparison, removing it.
2024-06-17 16:16:49,670 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/xTea/shortread/mosaic/HapMap/200x_UW/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38/hg38.chrom.sizes -b 1000 | sort -k1,1V -k1,1 -k2,2n
2024-06-17 16:16:49,725 - INFO - Successfully created: results/resultComparisons/HapMapMix/xTea_Corrine_Comparison/xTea-mosaic_200x_UW_L1_E1000.bed
2024-06-17 16:16:49,732 - INFO - Running command: bedtools slop -i /n/data1/bch/genetics/lee/projects/SMaHT/results/xTea/shortread/mosaic/HapMap/200x_NYGC/L1.bed -g /n/data1/bch/genetics/lee/reference/hg38


Generating a 3-way "venn" diagram. Please wait...


Done! Please check your results @ venn. 
Thank you for using Intervene!



2024-06-17 16:16:57,367 - INFO - Successfully executed: None
2024-06-17 16:16:57,373 - INFO - Running command: intervene upset -i xTea-mosaic_200x_UW_L1_E1000.bed xTea-mosaic_200x_NYGC_L1_E1000.bed xTea-mosaic_200x_NYGC_Corrine_L1_E1000.bed --bedtools-options f=0.5,r --filenames --output upset --save-overlaps --showzero --showsize --showshiny
1: Setting LC_CTYPE failed, using "C" 
2: Setting LC_COLLATE failed, using "C" 
3: Setting LC_TIME failed, using "C" 
4: Setting LC_MESSAGES failed, using "C" 
5: Setting LC_MONETARY failed, using "C" 
6: Setting LC_PAPER failed, using "C" 
7: Setting LC_MEASUREMENT failed, using "C" 
2024-06-17 16:17:04,355 - INFO - Successfully created: shiny.fig.code


In [14]:
# Add counts above
# Need to draw upset figure here? 

# 4. Calculate TP, FP, FN and a contingency table 

In [19]:
# Initial values
initial_values = [0, 50, 100]

# Generate values from 200 to 1000 with a step of 100
incremental_values = list(range(200, 1100, 100))

# Concatenate the lists
window_sizes = initial_values + incremental_values

teClass = "L1"

vafCheck = 0
filteredCellLinesAndMixtureRatio = {k: v for k, v in cellLinesAndMixtureRatio.items() if k != backBoneCellLineName}





all_result_combinations = []
for idx, window_size in enumerate(window_sizes, start=1):
    resultCombinations = list(itertools.product(
        toolAndResultDirectory.keys(),
        sampleDepths,
        filteredCellLinesAndMixtureRatio.values(),
        [window_size]
    ))
    if idx > 1:
        all_result_combinations += resultCombinations
    else:
        all_result_combinations = resultCombinations

results = []

calculateResults = lambda toolName, sampleDepth, vaf, window_size: (
    (results.append(getResultDF(toolName, teClass,sampleDepth,vaf, window_size, resultBedDF, goldStandardDF, bedFileResultSaveDirectory)) )
    if (globals().update(vafCheck=vaf) if vafCheck != vaf else False) is None else None
)

progress_bar = tqdm(all_result_combinations, desc="Processing combinations")
for toolName, sampleDepth, vaf, window_size in progress_bar:
    progress_bar.set_description(f"Processing: {toolName} {sampleDepth} {vaf} {window_size}")
    # Check whether they are existing or not from the resultBedDF
    if checkRowCount(resultBedDF, toolName, sampleDepth, teClass) == 0: continue
    calculateResults(toolName, sampleDepth, vaf, window_size)

# Concatenate all DataFrames into a single DataFrame
resultDF = pd.concat(results, ignore_index=True)

# Display the final DataFrame
print(resultDF)

Processing combinations:   0%|          | 0/3900 [00:00<?, ?it/s]

      toolName teClass      depth   vaf  windowSize  totalEvents  TP  FP   FN  \
0     RetroSom      L1        50x   0.5           0           68   0  68   68   
1     RetroSom      L1        50x   2.0           0           68   0  68  306   
2     RetroSom      L1        50x  10.0           0           68   1  67  126   
3     RetroSom      L1       100x   0.5           0           64   0  64   68   
4     RetroSom      L1       100x   2.0           0           64   1  63  305   
...        ...     ...        ...   ...         ...          ...  ..  ..  ...   
1219      MELT      L1    200x_UW   2.0        1000           16   0  16  251   
1220      MELT      L1    200x_UW  10.0        1000           16   4  12   97   
1221      MELT      L1  200x_NYGC   0.5        1000           95   0  95   55   
1222      MELT      L1  200x_NYGC   2.0        1000           95   8  87  243   
1223      MELT      L1  200x_NYGC  10.0        1000           95  30  65   71   

      precision    recall  

In [20]:
#formatted_date = "06122024"
resultExcelFile = os.path.join(resultExcelPath,formatted_date+'-'+resultExcelFileName+'.xlsx')
with pd.ExcelWriter(resultExcelFile, engine='xlsxwriter') as writer:
    resultDF.to_excel(writer, sheet_name='result', index=False)
    resultBedDF.to_excel(writer, sheet_name='resultBeds', index=False)
    goldStandardDF.to_excel(writer, sheet_name='goldStandards', index=False) 

print("DataFrames have been saved to " + resultExcelFile)

DataFrames have been saved to results/resultComparisons/HapMapMix/06172024-hapmapmix.xlsx


In [21]:
resultDF = pd.read_excel(resultExcelFile, sheet_name="result")


# 5. Check the results with figures

In [22]:
walker = pyg.walk(resultDF)

Box(children=(HTML(value='<div id="ifr-pyg-00061b1db2c57f7cAD3p6WMHR4thfVEG" style="height: auto">\n    <head>…

## Performance Plot

In [41]:
def create_line_plot(df, x_column, y_column, x_axis_range, colour_column, x_label, y_label, title):
    """
    Create a line plot using Plotly Express.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the data.
    - x_column (str): Column for the x-axis.
    - y_column (str): Column for the y-axis.
    - x_label (str): Label for the x-axis.
    - y_label (str): Label for the y-axis.
    - title (str): Title of the plot.

    Returns:
    - None
    """
    

    fig = px.line(df, x=x_column, y=y_column, color=colour_column, markers=True,
#                  line_group=colour_column, 
                  color_discrete_sequence=px.colors.qualitative.Set1,
                  symbol=colour_column, symbol_sequence=[1, 3, 16, 17])

    fig.update_traces(marker=dict(size=10))

    fig.update_layout(
        width=600,
        xaxis_title=x_label,
        yaxis_title=y_label,
        xaxis=dict(
            type='log',
            tickvals=x_axis_range,
            range=[-2, 3.25]
        ),
        yaxis=dict(
            range=[0, 1.1],
            tickvals=[0, 0.2, 0.4, 0.6, 0.8, 1]
        ),
        legend=dict(
            traceorder='normal',
            orientation='v',
            y=1.1,
            x=0.01,
            font=dict(size=16),
            title=dict(text=colour_column, font=dict(size=18, color='black'))
        ),
        font=dict(
            family="Arial, sans-serif",
            size=18,
            color="RebeccaPurple"
        ),
        title=title
    )

    fig.show()


In [29]:
windowSize = 1000

In [43]:
figureCombinations = itertools.product(
        toolAndResultDirectory.keys(),
        teClasses)

In [44]:
for tool, teClass in figureCombinations:
    temp_result = resultDF[(resultDF['toolName'] == tool) & (resultDF['teClass'] == teClass) & (resultDF['windowSize'] == windowSize) ]
    create_line_plot(temp_result, 'vaf', 'precision', vafs, "depth", "Level of Mosaicism (%)", f"Precision ({teClass})", f"Precision Plot-{tool}")


# Test Codes

In [None]:
toolName = "RetroSom"
teClass = "L1"
depth = "200x"
vaf = 10
windowSize = 1000


resultColumns = ['toolName','teClass','depth', 'vaf', 'windowSize','totalRows','TP','FP', 'FN','precision','recall']
resultdDF = pd.DataFrame(columns= resultColumns)


# Load Bed Files : This would be loop for all the results
caseBedFile = resultBedDF[(resultBedDF['toolName'] == toolName) &
                      (resultBedDF['depth'] == depth) &
                      (resultBedDF['class'] == teClass)]['filePath'].tolist()[0]
caseBed = BedTool(caseBedFile)

controlBedFile = goldStandardDF[(goldStandardDF['class'] == teClass) &
                            (goldStandardDF['backBone'] == True)]['filePath'].tolist()[0]
controlBed = BedTool(controlBedFile)

answerBedFiles = goldStandardDF[(goldStandardDF['class'] == teClass) &
                            (goldStandardDF['backBone'] == False)]['filePath'].tolist()
answerBeds = [BedTool(bed_file) for bed_file in answerBedFiles]


# This would be function

# print("caseBed")
# display(len(caseBed))
# print("controlBed")
# display(len(controlBed))
# print("answerBeds")
# display(len(answerBeds))

# False Positive
# (case) - (control) - Union (GoldStandard)
FP_bed = getUniqueWithWindowFromBeds(caseBed, [controlBed, getUnionBed(answerBeds)],windowSize,windowSize)
# print("FP")
# display(len(FP_bed))

# True Positive
# (1) Unique MEI
targetAnswerBedFiles = goldStandardDF[(goldStandardDF['class'] == teClass) & 
                                 (goldStandardDF['backBone'] == False) &
                                 (goldStandardDF['mixedRatio'] == vaf)]['filePath'].tolist()

targetControlBedFiles = [item for item in answerBedFiles if item not in targetAnswerBedFiles]
targetControlBedFiles.append(controlBedFile)

# print("targetAnswerBedFiles")
# print(targetAnswerBedFiles)
# print("targetControlBedFiles")
# print(targetControlBedFiles)

if len(targetAnswerBedFiles) > 1 :
    targetAnswerBed = getUnionBed([BedTool(bed_file) for bed_file in targetAnswerBedFiles])
else:
    targetAnswerBed = BedTool(targetAnswerBedFiles[0])

targetControlBed = [BedTool(bed_file) for bed_file in targetControlBedFiles]
uniqueBed = getUniqueWithWindowFromBeds(targetAnswerBed,[getUnionBed(targetControlBed)], windowSize, windowSize)

# print("uniqueBeds")
# display(len(uniqueBed))

# (2) Get True Positive
TP_bed = getIntersectBed([getUniqueWithWindowFromBeds(caseBed, [controlBed], windowSize, windowSize), uniqueBed],
                         windowSize, windowSize)
# print("TP")
# display(len(TP_bed))

# False Negative
FN_bed = getUniqueWithWindowFromBeds(caseBed, [TP_bed], windowSize, windowSize)
# print("FN")
# display(len(FN_bed))

# Precision and Recall
precision = len(TP_bed) / (len(TP_bed) + len(FP_bed))
# print("precision:",precision)

recall = len(TP_bed) / (len(TP_bed) + len(FN_bed))
# print("recall:", recall)




In [80]:
def find_intersection_with_window(bed_files, l=300, r=20000):
    bedtools_objs = [BedTool(bed_file) for bed_file in bed_files]
    intersection = bedtools_objs[0]
    for bedtool in bedtools_objs[1:]:
        intersection = intersection.window(b=bedtool, l=l, r=r)
    #intersection.saveas('intersection_with_window.bed')
    return intersection


In [74]:
bed_files = ['exampleData/testA.bed', 'exampleData/testB.bed']

In [99]:
intersection = find_intersection_with_window(bed_files, 1000, 1000)

In [100]:
len(intersection)

6

In [45]:
bed_files = ['results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_UW_MEI.bed',
            'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_NYGC_MEI.bed']

In [49]:
intersection = find_intersection(bed_files, 500)

In [50]:
len(intersection)

28

In [None]:
## 정답셋 처리 할 때는 윈도우 0
## 정답셋과 비교할 때는 가변 윈도우로 처리 (이것도 인풋 값으로 진행해서 For Loop 로 갈 것)
## 결과를 아래와 같이 저장 (마지막에 window 사이즈를 저장해 놓을 것)


def generate_tp_fp_fn_set_from_bed(case_result_df, gs_bed_array, control_result_df, na12878_vcf,
                                   window_size = 10, key_column_name = 'chrom'):
    
    
    #gs_bed_array = [set(x.file.vcf) for x in gs_vcf_array]
    gs_bed_df_array = [ x.file.bed.iloc[:,:2] for x in gs_bed_array]
    
    combined_gs_bed_df = pd.concat(gs_bed_df_array, axis=0, ignore_index=True).drop_duplicates().iloc[:,:2]
    
    print("gs_set:", len(combined_gs_bed_df))
    
    # False Positive
    # (1) - (2) - Union(3)
    #control_set = set(control_vcf)
    #case_result_df = pd.DataFrame(case_result_vcf, columns=['chrom','position'])
    
    #FP = set(case_result_vcf) - control_set - set.union(*gs_set_array)    
    temp_compareSet_df = compareTwoSets(case_result_df, control_result_df,  key_column_name, window_size).iloc[:,:2]
    #print(temp_compareSet_df)
    
    FP_temp = getLeftDFOnly(case_result_df, temp_compareSet_df)
    temp_compareSet_df = compareTwoSets(FP_temp, combined_gs_bed_df,key_column_name ,window_size).iloc[:,:2]
    FP = getLeftDFOnly(FP_temp, temp_compareSet_df)
        
    print("False Positive:", len(FP))
    
    # Unique MEI
    unique_MEI = []
    #set_unique_array = gs_set_array + [control_set]   
    set_unique_array = gs_bed_df_array + [pd.DataFrame(na12878_vcf, columns=['chrom','position'])]     #NA12878을 더해줘야 함
     
    #case_unique = set(case_result_vcf) - set.union(*set_unique_array)
    
    for gs_bed in gs_bed_array:
        
        #print(gs_bed.file.bed)
        print(gs_bed.file.name,  len(gs_bed.file.bed))
        temp = getLeftDFOnly(gs_bed.file.bed.iloc[:,:2],                              
                             getLeftDFOnly(pd.concat(set_unique_array, axis=0, ignore_index=True).drop_duplicates(),
                                           gs_bed.file.bed.iloc[:,:2])
                             )
        
        unique_MEI += [{ "Freq": gs_bed.vaf,
                         "Set": temp
        
        }]
        print("case unique(", gs_bed.vaf, "):",  len(temp))
    

    result_list = []
    
    for MEI in unique_MEI:
        
        # True Positive
        # ( (1) - (2) ) Intersect Unique MEI ( (3)_i - Union( (3) except (3)_i + (4))  )
        print("VAF:", MEI["Freq"])
        
        #TP = (set(case_result_vcf) - control_set).intersection(MEI["Set"])
        
        TP_temp = getLeftDFOnly(case_result_df, control_result_df)
        print("TP_temp:", len(TP_temp))

        TP = compareTwoSets(TP_temp, MEI["Set"], 'chrom', window_size)
    
        print("True Positive:", len(TP))

        # False Negative
        # ( (3)_i - True_Positive_i)
        
        #FN = set(case_result_vcf) - TP
        
        #FN = case_result_df if len(TP) == 0 else getLeftDFOnly(case_result_df, TP)
        FN = case_result_df if len(TP) == 0 else getLeftDFOnly(case_result_df, TP)
        print("False Negative:", len(FN))
        
        result_list += [
            { "Freq" : MEI["Freq"],
              "TP": TP,
              "FN": FN
            }
        
        ]
        
    addtional_field = {"FP": FP }
    
    result_list = [{**item, **addtional_field } for item in result_list]

    return result_list
    






In [23]:
HapMap_200x_UW_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/200x_UW/HapMapMix_NoModel/visual','L1')
convertResultToBedFile(HapMap_200x_UW_L1, 'results/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_UW_MEI.bed')


In [24]:
HapMap_200x_NYGC_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/200x_NYGC/HapMapMix_NoModel/visual','L1')
convertResultToBedFile(HapMap_200x_NYGC_L1, 'results/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_NYGC_MEI.bed')



In [26]:
HapMap_200x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/200x/HapMapMix_NoModel/visual','L1')
# convertResultToBedFile(HapMap_200x_BCM_L1, 'results/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_BCM_MEI.bed')

In [29]:
convertResultToBedFile(HapMap_200x_BCM_L1, 'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-200x_BCM_MEI.bed')

In [31]:
HapMap_50x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/50x/HapMapMix/visual','L1')
#HapMap_100x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/100x/HapMapMix_NoModel/visual','L1')
HapMap_300x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/300x/HapMapMix_NoModel/visual','L1')


In [32]:
convertResultToBedFile(HapMap_50x_BCM_L1, 'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-50x_BCM_MEI.bed')
convertResultToBedFile(HapMap_300x_BCM_L1, 'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-300x_BCM_MEI.bed')

In [51]:
HapMap_400x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/400x/HapMapMix_NoModel/visual','L1')
#HapMap_100x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/100x/HapMapMix_NoModel/visual','L1')
HapMap_500x_BCM_L1 = filter_svg_files('../results/RetroSom/shortread/mosaic/HapMap/v2/500x_WashU/HapMapMix_NoModel/visual','L1')


In [55]:
convertResultToBedFile(HapMap_400x_BCM_L1, 'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-400x_BCM_MEI.bed')
convertResultToBedFile(HapMap_500x_BCM_L1, 'results/resultComparisons/HapMapMix/RetroSomV2/HMS_HapMapMix_Illumina-500x_WashU_MEI.bed')

In [54]:
HapMap_500x_BCM_L1

Unnamed: 0,chrom,position,strand,class,family
0,chr10,115560693,+,L1,L1HS
1,chr11,117324478,+,L1,L1HS
2,chr12,128865636,+,L1,L1HS
3,chr12,28073470,+,L1,L1HS
4,chr14,41199885,+,L1,L1HS
...,...,...,...,...,...
83,chrX,126723679,-,L1,L1HS
84,chrX,33407521,-,L1,L1HS
85,chrX,34973901,-,L1,L1HS
86,chrX,72092746,-,L1,L1HS
