In [2]:
source("utils/plot.R")

In [3]:
# Notebook-wide plot display options (inline plot width/height and scale).
options(
    repr.plot.width = 4,
    repr.plot.height = 3,
    jupyter.plot_scale = 1
)

In [4]:
# Load the parameter table and keep only the `uuid` and `sweep_mode` columns.
params <- snakemake@input$parameters %>%
    read_tsv(col_types = cols()) %>%
    select(uuid, sweep_mode)

### Selection strength regressions

In [5]:
# Selection-strength predictions, annotated with the sweep mode taken from
# `params`. The join key must be given through `by`: dplyr joins have no `on`
# argument, so `on = "uuid"` never actually selected the key column.
selstrength <- read_tsv(snakemake@input$selstrength, col_types=cols()) %>%
    inner_join(params, by="uuid") %>%
    # Convert sweep mode to the shared factor defined by the plotting helpers.
    mutate(sweep_mode=sweepmode_factor(sweep_mode))

In [6]:
# Preview 10 randomly sampled rows of the joined table.
slice_sample(selstrength, n=10)

In [7]:
# Scatter of predicted vs. true log selection coefficient, one facet per sweep
# mode, with a dashed identity line for reference.
selstrength_fig <- ggplot(
    selstrength,
    aes(
        x = true_log_selection_coefficient,
        y = predicted_log_selection_coefficient,
        colour = sweep_mode
    )
) +
    geom_point(size = 0.5) +
    geom_abline(linetype = "dashed") +
    facet_wrap(vars(sweep_mode)) +
    # Axis values are log10-scaled; label the ticks with the back-transformed value.
    scale_x_continuous(labels = function(x) 10^x) +
    scale_y_continuous(labels = function(x) 10^x) +
    labs(x = "True s", y = "Predicted s") +
    sweeps_colour +
    sweeps_theme +
    # Colour duplicates the facet variable, so the legend is dropped.
    theme(legend.position = "none")

In [8]:
# Evaluate the plot object so the notebook renders it.
selstrength_fig

### Sweep mode confusion matrices

In [9]:
# Raw sweep-mode classification results: one row per prediction, reduced to
# the true and predicted label columns.
sweepmode_raw <- snakemake@input$sweepmode %>%
    read_tsv(col_types = cols()) %>%
    select(true_label, predicted_label)

# Confusion matrix in long format. `table()` cross-tabulates the two label
# columns and `as_tibble()` turns the counts into a column `n`.
sweepmode_confmat <- sweepmode_raw %>%
    table() %>%
    as_tibble() %>%
    mutate(
        true_label = sweepmode_factor_short(true_label),
        predicted_label = sweepmode_factor_short(predicted_label)
    ) %>%
    # Normalise within each true label. Note that `percent` holds a fraction
    # in [0, 1]; only `percent_label` is scaled to a percentage.
    group_by(true_label) %>%
    mutate(
        percent = n / sum(n),
        percent_label = paste0(round(100 * percent, 1), "%")
    )

In [10]:
# Confusion-matrix heatmap: tiles are filled by the per-true-label fraction and
# annotated with the formatted percentage.
confmat_fig <- ggplot(sweepmode_confmat, aes(x=true_label, y=predicted_label)) +
    geom_tile(aes(fill=percent)) +
    geom_text(aes(label=percent_label, colour=percent<0.5)) +
    # Text is black where the fraction is below 0.5 and white otherwise. The
    # colours are named by logical level so the mapping stays correct even if
    # only one of TRUE/FALSE occurs in the data (unnamed values are assigned
    # positionally to whichever levels are present).
    scale_colour_manual(values=c("FALSE"='white', "TRUE"='black')) +
    # Reverse the y axis so the diagonal runs from top-left to bottom-right.
    scale_y_discrete(limits=rev) +
    scale_fill_distiller(palette=3, direction=1) +
    sweeps_theme +
    labs(x='True', y='Predicted') +
    theme(
        legend.position='none',
        panel.grid=element_blank(),
        panel.spacing=unit(0.3, "in")
    )

In [11]:
# Evaluate the plot object so the notebook renders it.
confmat_fig

### Sweep mode ROC curves

In [12]:
# ROC curve points; the reference label is converted to the short sweep-mode
# factor.
sweepmode_roc <- snakemake@input$sweepmode_roc %>%
    read_tsv(col_types = cols()) %>%
    mutate(reference_label = sweepmode_factor_short(reference_label))

In [13]:
# ROC curves, one line per reference label, with a dashed diagonal for
# comparison.
roc_fig <- ggplot(
    sweepmode_roc,
    aes(
        x = false_positive_rate,
        y = true_positive_rate,
        colour = reference_label
    )
) +
    geom_line() +
    geom_abline(linetype = "dashed") +
    guides(colour = guide_legend(title = "Reference")) +
    labs(x = "False positive rate", y = "True positive rate") +
    sweeps_colour +
    sweeps_theme +
    # Anchor the legend in the bottom-right corner of the panel.
    theme(
        legend.position = c(1, 0),
        legend.justification = c(1, 0),
        legend.background = element_rect(colour = "white")
    )

In [14]:
# Evaluate the plot object so the notebook renders it.
roc_fig

### Feature analysis results

In [15]:
# Convert raw feature names into an ordered factor of display labels.
#
# Names listed in `display_names` are renamed by exact match; any other value
# is passed through unchanged. Values that do not end up as one of the factor
# levels become NA.
#
# v: vector of feature names (coerced to character).
# Returns a factor with levels in plotting order.
feat_factor <- function(v) {
    display_names <- c(
        "pi"="Pi",
        "num_snps"="# SNPs",
        "num_haps"="# Haplotypes",
        "taj_D"="Tajima's D"
    )
    labels <- as.character(v)
    # Exact lookup rather than regex substitution, so a key can never match
    # inside a longer name.
    idx <- match(labels, names(display_names))
    matched <- !is.na(idx)
    labels[matched] <- unname(display_names[idx[matched]])
    factor(labels, levels=c("Pi", "# SNPs", "# Haplotypes", "H1", "H12", "H2/H1", "Tajima's D"))
}

# Convert 7-character feature-subset codes into a factor with a fixed order:
# codes containing a single "1", then codes containing a single "0", then the
# all-ones code. Codes outside this list become NA.
feature_subset_factor <- function(v) {
    single_one <- c(
        "1000000", "0100000", "0010000", "0001000",
        "0000100", "0000010", "0000001"
    )
    single_zero <- c(
        "0111111", "1011111", "1101111", "1110111",
        "1111011", "1111101", "1111110"
    )
    all_ones <- "1111111"
    factor(v, levels = c(single_one, single_zero, all_ones))
}

In [16]:
# Feature-subset code table reshaped to long format: one row per
# (feature_subset, feature) pair, with the cell value in `value`.
feat_grid <- snakemake@input$feature_analysis_code %>%
    read_tsv(col_types = cols()) %>%
    pivot_longer(cols = !feature_subset) %>%
    mutate(
        name = feat_factor(name),
        feature_subset = feature_subset_factor(feature_subset)
    )

In [17]:
# Give rows of the "1111111" subset their own value (2) so that they form a
# separate colour level when `value` is plotted as a factor.
feat_grid$value <- replace(
    feat_grid$value,
    feat_grid$feature_subset == "1111111",
    2
)

In [18]:
# Grid of crosses: features on x, feature subsets on y, coloured by `value`.
feat_grid_fig <- ggplot(feat_grid, aes(x = name, y = feature_subset)) +
    geom_point(aes(colour = as.factor(value)), shape = 4) +
    # Colours are assigned in factor-level order of `value`.
    scale_colour_manual(values = c("white", "black", "darkred")) +
    # Reverse y so the first factor level is drawn at the top.
    scale_y_discrete(limits = rev) +
    labs(y = "Feature subset") +
    sweeps_theme +
    theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "none"
    )

In [19]:
# Evaluate the plot object so the notebook renders it.
feat_grid_fig

In [20]:
# Feature-analysis results restricted to the two targets plotted below, with
# the "1111111" subset flagged as the baseline.
feats <- snakemake@input$feature_analysis %>%
    read_tsv(col_types = cols()) %>%
    filter(target %in% c("log-sel-strength", "sweep-mode")) %>%
    mutate(
        feature_subset = feature_subset_factor(feature_subset),
        target = target_factor(target),
        is_baseline = feature_subset == "1111111"
    )

In [21]:
# Lollipop chart of mean relative error per feature subset for the
# selection-strength target; the baseline subset is drawn in dark red.
selstrength_feat_plot <- feats %>%
    filter(target == "Sel. strength" & metric == "mean_relative_error") %>%
    ggplot(aes(y = feature_subset, colour = is_baseline)) +
    # Stem from x = 0 to the metric value, then the head of the lollipop.
    geom_segment(aes(xend = value, yend = feature_subset), x = 0) +
    geom_point(aes(x = value)) +
    scale_colour_manual(values = c("black", "darkred")) +
    scale_y_discrete(limits = rev) +
    labs(x = "Mean relative error", title = "Sel. strength") +
    sweeps_theme +
    theme(
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
    )

In [22]:
# Evaluate the plot object so the notebook renders it.
selstrength_feat_plot

In [23]:
# Lollipop chart of the sweep-mode metric per feature subset; the baseline
# subset is drawn in dark red.
# NOTE(review): unlike the selection-strength panel, no `metric` filter is
# applied here -- confirm that sweep-mode rows carry a single metric (the axis
# is labelled "Accuracy").
sweepmode_feat_plot <- feats %>%
    filter(target == "Sweep mode") %>%
    ggplot(aes(y = feature_subset, colour = is_baseline)) +
    # Stem from x = 0 to the metric value, then the head of the lollipop.
    geom_segment(aes(xend = value, yend = feature_subset), x = 0) +
    geom_point(aes(x = value)) +
    scale_colour_manual(values = c("black", "darkred")) +
    scale_y_discrete(limits = rev) +
    labs(x = "Accuracy", title = "Sweep mode") +
    xlim(0, 1) +
    sweeps_theme +
    theme(
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none"
    )

In [24]:
# Evaluate the plot object so the notebook renders it.
sweepmode_feat_plot

### Plot it all together

In [25]:
# Vertically align panel A (selection strength) with panel B (confusion
# matrix) along their left/right axes.
aligned <- align_plots(
    selstrength_fig,
    confmat_fig + theme(plot.margin = unit(rep(1, 4), "lines")),
    align = "v",
    axis = "lr"
)

# Middle row: confusion matrix (B) next to the ROC curves (C).
classification_fig <- plot_grid(
    aligned[[2]],
    roc_fig + theme(plot.margin = unit(rep(1, 4), "lines")),
    labels = c("B", "C")
)

# Bottom row: feature grid followed by the two per-target lollipop charts,
# labelled "D".
feature_analysis_fig <- plot_grid(
    feat_grid_fig + theme(plot.margin = unit(c(0, 0, 1, 1), "lines")),
    selstrength_feat_plot,
    sweepmode_feat_plot + theme(plot.margin = unit(c(0, 1, 0, 0), "lines")),
    nrow = 1,
    align = "h",
    axis = "tb",
    labels = c("D", NA)
)

# Stack the three rows into the final figure; only the top row gets a label
# at this level.
all_fig <- plot_grid(
    aligned[[1]],
    classification_fig,
    feature_analysis_fig,
    nrow = 3,
    labels = c("A", NA, NA)
)

In [26]:
# Write the combined figure to the Snakemake output path.
sweeps_save(snakemake@output$figure, all_fig, width=7, asp=1)

### Get metrics

In [14]:
# Selection-strength error metrics per sweep mode, in long format
# (sweep_mode, metric, value).
metrics_sel_by_sm <- selstrength %>%
    group_by(sweep_mode) %>%
    summarize(
        selstrength_rmse=rmse(true_log_selection_coefficient, predicted_log_selection_coefficient),
        selstrength_mre=mean_relative_error(true_log_selection_coefficient, predicted_log_selection_coefficient)
    ) %>%
    pivot_longer(!sweep_mode, names_to='metric', values_to='value')

# Overall sweep-mode classification accuracy.
metrics_sweepmode <- tibble(
    sweep_mode='All',
    metric='accuracy',
    value=accuracy(sweepmode_raw$true_label, sweepmode_raw$predicted_label)
)

# Selection-strength error metrics pooled over all sweep modes. The metric
# names must match those of the per-sweep-mode rows above (`selstrength_rmse`,
# not the misspelt `selstrength_rsme`) so the same metric has one name in the
# output table.
metrics_all_sel <- tibble(
    sweep_mode=c('All', 'All'),
    metric=c('selstrength_rmse', 'selstrength_mre'),
    value=c(
        rmse(selstrength$true_log_selection_coefficient, selstrength$predicted_log_selection_coefficient),
        mean_relative_error(selstrength$true_log_selection_coefficient, selstrength$predicted_log_selection_coefficient)
    )
)

# Single long table of every reported metric.
metrics <- bind_rows(metrics_sel_by_sm, metrics_all_sel, metrics_sweepmode)

In [15]:
# Write the metrics table to the Snakemake output path as TSV.
write_tsv(metrics, snakemake@output$metrics)