# Plot Missing Data

## Modules

In [22]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

## Setup

In [8]:
filter_logs = []
plot_thresholds_list = []
reads_origin_list = ["assembly", "sra", "local", "all"]


# Figure out the project directory
try:
    snakemake.input.filter_snp_log
    project_dir = os.getcwd()
    # Figure out reads origin
    print(snakemake.wildcards)
    reads_origin = snakemake.wildcards.reads_origin
    locus_name = snakemake.wildcards.locus_name
    prune = snakemake.wildcards.prune
    # Figure out locus
except NameError:
    # Testing outside snakemake
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    reads_origin = "all"
    locus_name = "chromosome"
    prune = "full"
    
output_dir = os.path.join(project_dir, "results", "snippy_multi", reads_origin, locus_name, prune)

# Get the list of log files
try:
    filter_logs = [log for log in snakemake.input.filter_snp_log]
except NameError:
    filter_logs = []
    for filter_dir in os.listdir(output_dir):
        filter_dir_path = os.path.join(output_dir, filter_dir)
        if not os.path.isdir(filter_dir_path): continue
        for log in os.listdir(filter_dir_path):
            log_path = os.path.join(filter_dir_path, log)
            if log_path[-4:] == ".log":
                filter_logs.append(log_path)

## Processing

In [33]:
# Initialize data dict
data = {
        "missing_data" : [],
        "all_variants" : [],
        "singleton_variants" : [],
        "parsimony_variants" : [],
        "singleton_variants_filter" : [],
        "parsimony_variants_filter" : [],
        "all_variants_filter" : [],
        "total_nucleotides_filter" : [],
        "ambiguous_nucleotides_filter" : [],    
       }

all_variants_term = "Alignment length: "
singleton_variants_term = "Total singleton sites: "
parsimony_variants_term = "Parsimony informative sites: "
singleton_variants_filter_term = "Singleton sites passing missing data filter:"
parsimony_variants_filter_term = "Parsimony informative sites passing missing data filter:"
all_variants_filter_term = "Total sites passing missing data filter:"
total_nucleotides_filter_term = "Total nucleotides passing missing data filter:"
ambiguous_nucleotides_filter_term = "Ambiguous nucleotides passing missing data filter:"

for log in filter_logs:
    all_variants = 0
    singleton_variants = 0
    parsimony_variants = 0
    threshold = int(os.path.basename(os.path.dirname(log)).replace("filter", ""))
        
    with open(log, "r") as logfile:
        for line in logfile:
            # Get all sites count
            if all_variants_term in line:
                all_variants = int(line.split(all_variants_term)[1])
            # Get singletons count
            if singleton_variants_term in line:
                singleton_variants = line.split(singleton_variants_term)[1]
                singleton_variants = int(singleton_variants.split(" ")[0])
            # Get parsimony count
            if parsimony_variants_term in line:
                parsimony_variants = line.split(parsimony_variants_term)[1]
                parsimony_variants = int(parsimony_variants.split(" ")[0])
            # Get filtered singletons count
            if singleton_variants_filter_term in line:
                singleton_variants_filter = int(line.split(singleton_variants_filter_term)[1])
            # Get filtered parsimony count
            if parsimony_variants_filter_term in line:
                parsimony_variants_filter = int(line.split(parsimony_variants_filter_term)[1])                
            # Get all filtered count
            if all_variants_filter_term in line:
                all_variants_filter = int(line.split(all_variants_filter_term)[1])     
            # Get filtered nucleotides count
            if total_nucleotides_filter_term in line:
                total_nucleotides_filter = int(line.split(total_nucleotides_filter_term)[1])               
            # Get ambiguous nucleotides count
            if ambiguous_nucleotides_filter_term in line:
                ambiguous_nucleotides_filter = int(line.split(ambiguous_nucleotides_filter_term)[1])              
                            
    data["missing_data"].append(threshold)
    data["all_variants"].append(all_variants)
    data["singleton_variants"].append(singleton_variants)
    data["parsimony_variants"].append(parsimony_variants)
    data["singleton_variants_filter"].append(singleton_variants_filter)
    data["parsimony_variants_filter"].append(parsimony_variants_filter)
    data["all_variants_filter"].append(all_variants_filter)
    data["total_nucleotides_filter"].append(total_nucleotides_filter)
    data["ambiguous_nucleotides_filter"].append(ambiguous_nucleotides_filter)
print(data)

{'missing_data': [0, 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2, 20, 3, 4, 5, 6, 7, 8, 9], 'all_variants': [483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483, 483], 'singleton_variants': [245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245, 245], 'parsimony_variants': [238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238], 'singleton_variants_filter': [6, 6, 79, 79, 91, 91, 106, 106, 114, 114, 114, 123, 20, 127, 20, 35, 35, 53, 53, 68, 68], 'parsimony_variants_filter': [16, 16, 123, 123, 138, 138, 149, 149, 160, 160, 160, 172, 38, 179, 38, 58, 58, 82, 82, 100, 100], 'all_variants_filter': [22, 22, 202, 202, 229, 229, 255, 255, 274, 274, 274, 295, 58, 306, 58, 93, 93, 135, 135, 168, 168], 'total_nucleotides_filter': [1100, 1100, 10100, 10100, 11450, 11450, 12750, 12750, 13700, 13700, 13700, 14750, 2900, 15300, 2900, 4650, 4650, 6750, 6750, 

## Figure

In [36]:
fig = go.Figure()


# ----------------------------------------
# All Variants (lines)
fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["all_variants"], mode='lines', name = "Total Variants", line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["singleton_variants"],mode='lines', name = "Total Singletons",line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x= data["missing_data"], y = data["parsimony_variants"], mode='lines', name = "Total Parsimony Informative", line=dict(width=5))
)

# ----------------------------------------
# Filtered Variants (bars)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["singleton_variants_filter"],  name = "Filtered Singletons",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["parsimony_variants_filter"],  name = "Filtered Parsimony Informative",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["all_variants_filter"],  name = "Filtered All",       
    ),     
)

# ----------------------------------------
# Customize Appearance
fig.update_layout(
  template="simple_white", 
  width=720,
  height=480,
  title=("<b>Variants Across Missing Data Site Thresholds</b>"),
  title_x = 0.5,
  xaxis = dict(
      title = "Missing Data Threshold Per Site (%)",
      tickvals = data["missing_data"],
      ),
  yaxis_title = "Number of Sites",
)

fig.show()

In [37]:
output_plot = os.path.join(output_dir, "snippy-multi.snps.missing-data.html")
fig.write_html(output_plot)

In [38]:
fig = go.Figure()


# ----------------------------------------
# Filtered Nucleotides (bars)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["total_nucleotides_filter"],  name = "Filtered Nucleotides",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["ambiguous_nucleotides_filter"],  name = "Ambiguous Nucleotides",),     
)

# ----------------------------------------
# Customize Appearance
fig.update_layout(
  template="simple_white", 
  width=720,
  height=480,
  title=("<b>Nucleotides Across Missing Data Site Thresholds</b>"),
  title_x = 0.5,
  xaxis = dict(
      title = "Missing Data Threshold Per Site (%)",
      tickvals = data["missing_data"],
      ),
  yaxis_title = "Number of Nucleotides",
)

fig.show()

In [39]:
output_plot = os.path.join(output_dir, "snippy-multi.snps.ambig-nuc.html")
fig.write_html(output_plot)