# Plot Missing Data

## Modules

In [1]:
import plotly.graph_objects as go
import os

## Setup

In [2]:
filter_logs = []
plot_thresholds_list = []
reads_origin_list = ["assembly", "sra", "local", "all"]


# Figure out the project directory
try:
    snakemake.input.filter_snp_log
    project_dir = os.getcwd()
    # Figure out reads origin
    for origin in reads_origin_list:
        if origin in snakemake.input.filter_snp_log[0]:
            reads_origin = origin
            break    

except NameError:
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    reads_origin = "all"
    
output_dir = os.path.join(project_dir, "results", "snippy_multi", reads_origin)
logs_dir = os.path.join(project_dir, "results", "logs", "snippy_multi", reads_origin)
    
# Get the list of log files
try:
    filter_logs = [log for log in snakemake.input.filter_snp_log]
except NameError:
    filter_logs = [os.path.join(logs_dir, log) for log in os.listdir(logs_dir) if "filter" in log]

## Processing

In [3]:
# Initialize data dict
data = {
        "missing_data" : [],
        "all_variants" : [],
        "singleton_variants" : [],
        "parsimony_variants" : [],
        "singleton_variants_filter" : [],
        "parsimony_variants_filter" : [],
        "all_variants_filter" : [],
       }

all_variants_term = "Alignment length: "
singleton_variants_term = "Total singleton sites: "
parsimony_variants_term = "Parsimony informative sites: "
singleton_variants_filter_term = "Singleton sites passing missing data filter:"
parsimony_variants_filter_term = "Parsimony informative sites passing missing data filter:"
all_variants_filter_term = "Total sites passing missing data filter:"

for log in filter_logs:
    all_variants = 0
    singleton_variants = 0
    parsimony_variants = 0
    threshold = int(log.split("filter")[1].replace(".log",""))    
        
    with open(log, "r") as logfile:
        for line in logfile:
            # Get all sites count
            if all_variants_term in line:
                all_variants = int(line.split(all_variants_term)[1])
            # Get singletons count
            if singleton_variants_term in line:
                singleton_variants = line.split(singleton_variants_term)[1]
                singleton_variants = int(singleton_variants.split(" ")[0])
            # Get parsimony count
            if parsimony_variants_term in line:
                parsimony_variants = line.split(parsimony_variants_term)[1]
                parsimony_variants = int(parsimony_variants.split(" ")[0])
            # Get filtered singletons count
            if singleton_variants_filter_term in line:
                singleton_variants_filter = int(line.split(singleton_variants_filter_term)[1])
            # Get filtered parsimony count
            if parsimony_variants_filter_term in line:
                parsimony_variants_filter = int(line.split(parsimony_variants_filter_term)[1])                
            # Get all filtered count
            if all_variants_filter_term in line:
                all_variants_filter = int(line.split(all_variants_filter_term)[1])     
                
                
    data["missing_data"].append(threshold)
    data["all_variants"].append(all_variants)
    data["singleton_variants"].append(singleton_variants)
    data["parsimony_variants"].append(parsimony_variants)
    data["singleton_variants_filter"].append(singleton_variants_filter)
    data["parsimony_variants_filter"].append(parsimony_variants_filter)
    data["all_variants_filter"].append(all_variants_filter)
print(data)

{'missing_data': [0, 100, 25, 50, 75], 'all_variants': [355, 355, 355, 355, 355], 'singleton_variants': [215, 215, 215, 215, 215], 'parsimony_variants': [140, 140, 140, 140, 140], 'singleton_variants_filter': [0, 215, 3, 205, 215], 'parsimony_variants_filter': [5, 140, 23, 140, 140], 'all_variants_filter': [5, 355, 26, 345, 355]}


## Figure

In [12]:
fig = go.Figure()

# ----------------------------------------
# All Variants (lines)
fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["all_variants"], mode='lines', name = "Total Variants", line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x = data["missing_data"], y = data["singleton_variants"],mode='lines', name = "Total Singletons",line=dict(width=5))
)

fig.add_trace(
  go.Scatter(x= data["missing_data"], y = data["parsimony_variants"], mode='lines', name = "Total Parsimony Informative", line=dict(width=5))
)

# ----------------------------------------
# Filtered Variants (bars)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["singleton_variants_filter"],  name = "Filtered Singletons",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["parsimony_variants_filter"],  name = "Filtered Parsimony Informative",),     
)

fig.add_trace(
    go.Bar(x= data["missing_data"],y = data["all_variants_filter"],  name = "Filtered All",       
    ),     
)

# ----------------------------------------
# Customize Appearance
fig.update_layout(
  template="simple_white", 
  width=720,
  height=480,
  title=("<b>Variants Across Missing Data Site Thresholds</b>"),
  title_x = 0.5,
  xaxis = dict(
      title = "Missing Data Threshold Per Site (%)",
      tickvals = data["missing_data"],
      ),
  yaxis_title = "Number of Variant Sites",
)

fig.show()

In [13]:
try:
    output_plot = snakemake.output.plot
except NameError:
    output_plot = os.path.join(output_dir, "missing_data_chromosome.snps.html")
fig.write_html(output_plot)