# Plot Missing Data

## Modules

In [1]:
import plotly.graph_objects as go
import os

## Setup

In [8]:
filter_logs = []
plot_thresholds_list = []
reads_origin_list = ["assembly", "sra", "local", "all"]


# Figure out the project directory
try:
    snakemake.input.filter_snp_log
    project_dir = os.getcwd()
    # Figure out reads origin
    print(snakemake.wildcards)
    reads_origin = snakemake.wildcards.reads_origin
    locus_name = snakemake.wildcards.locus_name
    prune = snakemake.wildcards.prune
    # Figure out locus
except NameError:
    # Testing outside snakemake
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    reads_origin = "all"
    locus_name = "chromosome"
    prune = "prune"
    
output_dir = os.path.join(project_dir, "results", "snippy_multi", reads_origin, locus_name, prune)

# Get the list of log files
try:
    filter_logs = [log for log in snakemake.input.filter_snp_log]
except NameError:
    filter_logs = []
    for filter_dir in os.listdir(output_dir):
        filter_dir_path = os.path.join(output_dir, filter_dir)
        if not os.path.isdir(filter_dir_path): continue
        for log in os.listdir(filter_dir_path):
            log_path = os.path.join(filter_dir_path, log)
            if log_path[-4:] == ".log":
                filter_logs.append(log_path)

['/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter0/snippy-multi.snps.log', '/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter10/snippy-multi.snps.log', '/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter15/snippy-multi.snps.log', '/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter20/snippy-multi.snps.log', '/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter25/snippy-multi.snps.log', '/mnt/c/Users/ktmea/Projects/plague-phylogeography/results/snippy_multi/all/chromosome/prune/filter5/snippy-multi.snps.log']


## Processing

In [17]:
# Initialize data dict
data = {
        "missing_data" : [],
        "all_variants" : [],
        "singleton_variants" : [],
        "parsimony_variants" : [],
        "singleton_variants_filter" : [],
        "parsimony_variants_filter" : [],
        "all_variants_filter" : [],
       }

all_variants_term = "Alignment length: "
singleton_variants_term = "Total singleton sites: "
parsimony_variants_term = "Parsimony informative sites: "
singleton_variants_filter_term = "Singleton sites passing missing data filter:"
parsimony_variants_filter_term = "Parsimony informative sites passing missing data filter:"
all_variants_filter_term = "Total sites passing missing data filter:"

for log in filter_logs:
    all_variants = 0
    singleton_variants = 0
    parsimony_variants = 0
    threshold = int(os.path.basename(os.path.dirname(log)).replace("filter", ""))
        
    with open(log, "r") as logfile:
        for line in logfile:
            # Get all sites count
            if all_variants_term in line:
                all_variants = int(line.split(all_variants_term)[1])
            # Get singletons count
            if singleton_variants_term in line:
                singleton_variants = line.split(singleton_variants_term)[1]
                singleton_variants = int(singleton_variants.split(" ")[0])
            # Get parsimony count
            if parsimony_variants_term in line:
                parsimony_variants = line.split(parsimony_variants_term)[1]
                parsimony_variants = int(parsimony_variants.split(" ")[0])
            # Get filtered singletons count
            if singleton_variants_filter_term in line:
                singleton_variants_filter = int(line.split(singleton_variants_filter_term)[1])
            # Get filtered parsimony count
            if parsimony_variants_filter_term in line:
                parsimony_variants_filter = int(line.split(parsimony_variants_filter_term)[1])                
            # Get all filtered count
            if all_variants_filter_term in line:
                all_variants_filter = int(line.split(all_variants_filter_term)[1])     
                
                
    data["missing_data"].append(threshold)
    data["all_variants"].append(all_variants)
    data["singleton_variants"].append(singleton_variants)
    data["parsimony_variants"].append(parsimony_variants)
    data["singleton_variants_filter"].append(singleton_variants_filter)
    data["parsimony_variants_filter"].append(parsimony_variants_filter)
    data["all_variants_filter"].append(all_variants_filter)
print(data)

{'missing_data': [0, 10, 15, 20, 25, 5], 'all_variants': [2451, 2451, 2451, 2451, 2451, 2451], 'singleton_variants': [2451, 2451, 2451, 2451, 2451, 2451], 'parsimony_variants': [0, 0, 0, 0, 0, 0], 'singleton_variants_filter': [2451, 2451, 2451, 2451, 2451, 2451], 'parsimony_variants_filter': [0, 0, 0, 0, 0, 0], 'all_variants_filter': [2451, 2451, 2451, 2451, 2451, 2451]}


## Figure

In [None]:
fig = go.Figure()

# ----------------------------------------
# All Variants (lines)
fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["all_variants"], mode='lines', name = "Total Variants", line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["singleton_variants"],mode='lines', name = "Total Singletons",line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x= data["missing_data"], y = data["parsimony_variants"], mode='lines', name = "Total Parsimony Informative", line=dict(width=5))
)

# ----------------------------------------
# Filtered Variants (bars)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["singleton_variants_filter"],  name = "Filtered Singletons",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["parsimony_variants_filter"],  name = "Filtered Parsimony Informative",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["all_variants_filter"],  name = "Filtered All",       
    ),     
)

# ----------------------------------------
# Customize Appearance
fig.update_layout(
  template="simple_white", 
  width=720,
  height=480,
  title=("<b>Variants Across Missing Data Site Thresholds</b>"),
  title_x = 0.5,
  xaxis = dict(
      title = "Missing Data Threshold Per Site (%)",
      tickvals = data["missing_data"],
      ),
  yaxis_title = "Number of Variant Sites",
)

fig.show()

In [None]:
try:
    output_plot = snakemake.output.plot
except NameError:
    output_plot = os.path.join(output_dir, "snippy-multi.snps.missing-data.html")
fig.write_html(output_plot)