In [2]:
source("utils/plot.R")

In [3]:
# Default size and scale for plots rendered inline in this notebook.
options(
    repr.plot.width = 4,
    repr.plot.height = 3,
    jupyter.plot_scale = 1
)

In [4]:
# Load the parameter table, keeping only the run identifier and its sweep mode.
# NOTE(review): `params` is not referenced by any later cell visible here;
# confirm whether it is still needed.
params <- select(
    read_tsv(snakemake@input$parameters, col_types=cols()),
    uuid, sweep_mode
)

### Hard vs. soft confusion matrix

In [5]:
# Confusion matrix for the hard-vs-soft classifier in long format: one row per
# (true_label, predicted_label) pair with its count `n`, the share of that true
# label's cases it represents (`percent`, a 0-1 proportion) and a printable
# percentage string (`percent_label`).
hardsoft_confmat <- read_tsv(snakemake@input$hard_vs_soft, col_types=cols()) %>%
    select(true_label, predicted_label) %>%
    # Cross-tabulate, then flatten the table back into a tibble with count column `n`.
    table %>%
    as_tibble %>%
    mutate(
        true_label=sweepmode_factor_short(true_label),
        predicted_label=sweepmode_factor_short(predicted_label)
    ) %>%
    # Normalise within each true label, so shares sum to 1 per true label.
    group_by(true_label) %>%
    mutate(
        percent=n/sum(n),
        percent_label=paste0(round(percent*100, 1), '%')
    ) %>%
    # Drop the grouping so it does not leak into later operations on this table.
    ungroup()

In [6]:
# Print the hard-vs-soft confusion table inline for inspection.
hardsoft_confmat

In [7]:
# Heat map of the hard-vs-soft confusion matrix: true label on x, predicted
# label on y; tile fill and printed text both come from the per-true-label share.
hardsoft_confmat_fig <- ggplot(hardsoft_confmat, aes(x=true_label, y=predicted_label)) +
    geom_tile(aes(fill=percent)) +
    # Text colour switches at a share of 0.5: white at or above, black below.
    geom_text(aes(label=percent_label, colour=percent<0.5)) +
    # Colours are named explicitly: unnamed values are handed out in level
    # order, so if only TRUE occurred it would wrongly receive 'white'.
    scale_colour_manual(values=c('FALSE'='white', 'TRUE'='black')) +
    # Reverse the y axis so the first level is drawn at the top.
    scale_y_discrete(limits=rev) +
    scale_fill_distiller(palette=3, direction=1) +
    sweeps_theme +
    labs(x='True', y='Predicted') +
    theme(
        legend.position='none',
        panel.grid=element_blank(),
        panel.spacing=unit(0.3, "in")
    )

In [8]:
# Render the hard-vs-soft confusion matrix inline.
hardsoft_confmat_fig

### RNM vs. SGV confusion matrix

In [9]:
# Confusion matrix for the RNM-vs-SGV classifier in long format: one row per
# (true_label, predicted_label) pair with its count `n`, the share of that true
# label's cases it represents (`percent`, a 0-1 proportion) and a printable
# percentage string (`percent_label`).
rnmsgv_confmat <- read_tsv(snakemake@input$rnm_vs_sgv, col_types=cols()) %>%
    select(true_label, predicted_label) %>%
    # Cross-tabulate, then flatten the table back into a tibble with count column `n`.
    table %>%
    as_tibble %>%
    mutate(
        true_label=sweepmode_factor_short(true_label),
        predicted_label=sweepmode_factor_short(predicted_label)
    ) %>%
    # Normalise within each true label, so shares sum to 1 per true label.
    group_by(true_label) %>%
    mutate(
        percent=n/sum(n),
        percent_label=paste0(round(percent*100, 1), '%')
    ) %>%
    # Drop the grouping so it does not leak into later operations on this table.
    ungroup()

In [10]:
# Print the RNM-vs-SGV confusion table inline for inspection.
rnmsgv_confmat

In [11]:
# Heat map of the RNM-vs-SGV confusion matrix: true label on x, predicted
# label on y; tile fill and printed text both come from the per-true-label share.
rnmsgv_confmat_fig <- ggplot(rnmsgv_confmat, aes(x=true_label, y=predicted_label)) +
    geom_tile(aes(fill=percent)) +
    # Text colour switches at a share of 0.5: white at or above, black below.
    geom_text(aes(label=percent_label, colour=percent<0.5)) +
    # Colours are named explicitly: unnamed values are handed out in level
    # order, so if only TRUE occurred it would wrongly receive 'white'.
    scale_colour_manual(values=c('FALSE'='white', 'TRUE'='black')) +
    # Reverse the y axis so the first level is drawn at the top.
    scale_y_discrete(limits=rev) +
    scale_fill_distiller(palette=3, direction=1) +
    sweeps_theme +
    labs(x='True', y='Predicted') +
    theme(
        legend.position='none',
        panel.grid=element_blank(),
        panel.spacing=unit(0.3, "in")
    )

In [12]:
# Render the RNM-vs-SGV confusion matrix inline.
rnmsgv_confmat_fig

### Hard vs. soft ROC curve

In [13]:
# ROC points for the hard-vs-soft classifier. The reference label is converted
# to the short sweep-mode factor and only rows whose reference is 'Soft' are kept.
hardsoft_roc <- read_tsv(snakemake@input$hard_vs_soft_roc, col_types=cols())
hardsoft_roc <- mutate(hardsoft_roc, reference_label=sweepmode_factor_short(reference_label))
hardsoft_roc <- filter(hardsoft_roc, reference_label=='Soft')

In [14]:
# ROC curve for the hard-vs-soft classifier.
hardsoft_roc_fig <- ggplot(data=hardsoft_roc) +
    geom_line(mapping=aes(x=false_positive_rate, y=true_positive_rate)) +
    # Dashed diagonal drawn with geom_abline's defaults, as a reference line.
    geom_abline(linetype='dashed') +
    xlab('False positive rate') +
    ylab('True positive rate') +
    sweeps_theme

In [15]:
# Render the hard-vs-soft ROC curve inline.
hardsoft_roc_fig

### RNM vs. SGV ROC curve

In [16]:
# ROC points for the RNM-vs-SGV classifier. The reference label is converted
# to the short sweep-mode factor and only rows whose reference is 'SGV' are kept.
rnmsgv_roc <- read_tsv(snakemake@input$rnm_vs_sgv_roc, col_types=cols())
rnmsgv_roc <- mutate(rnmsgv_roc, reference_label=sweepmode_factor_short(reference_label))
rnmsgv_roc <- filter(rnmsgv_roc, reference_label=='SGV')

In [17]:
# ROC curve for the RNM-vs-SGV classifier.
rnmsgv_roc_fig <- ggplot(data=rnmsgv_roc) +
    geom_line(mapping=aes(x=false_positive_rate, y=true_positive_rate)) +
    # Dashed diagonal drawn with geom_abline's defaults, as a reference line.
    geom_abline(linetype='dashed') +
    xlab('False positive rate') +
    ylab('True positive rate') +
    sweeps_theme

In [18]:
# Render the RNM-vs-SGV ROC curve inline.
rnmsgv_roc_fig

### Feature analysis results

In [19]:
# Turn raw feature names into display names, returned as a factor with a fixed
# level order.
#   v: character vector of feature names.
# Names with no replacement are passed through unchanged; any resulting string
# that is not one of the levels becomes NA.
feat_factor <- function(v) {
    # str_replace_all applies each name -> value pair as a pattern replacement.
    display_names <- c(
        "pi"="Pi",
        "num_snps"="# SNPs",
        "num_haps"="# Haplotypes",
        "taj_D"="Tajima's D"
    )
    display_order <- c(
        "Pi", "# SNPs", "# Haplotypes", "H1", "H12", "H2/H1", "Tajima's D"
    )
    factor(str_replace_all(v, display_names), levels=display_order)
}

# Convert feature-subset codes (7-character strings of 0s and 1s) into a factor
# with a fixed level order: the seven codes with a single 1, then the seven
# codes with a single 0, then the all-ones code.
#   v: vector of subset codes; codes outside this set become NA.
feature_subset_factor <- function(v) {
    n_positions <- 7L
    # "1000000", "0100000", ..., "0000001"
    single_on <- vapply(
        seq_len(n_positions),
        function(i) paste0(as.integer(seq_len(n_positions) == i), collapse=""),
        character(1)
    )
    # Swap 0 and 1 to get "0111111", "1011111", ..., "1111110".
    single_off <- chartr("01", "10", single_on)
    all_on <- strrep("1", n_positions)
    factor(v, levels=c(single_on, single_off, all_on))
}

In [20]:
# Long-format feature grid: every column except `feature_subset` is pivoted
# into `name`/`value` pairs, then both keys are turned into ordered factors.
feat_grid <- read_tsv(snakemake@input$feature_analysis_code, col_types=cols()) %>%
    pivot_longer(cols=!feature_subset) %>%
    mutate(name=feat_factor(name)) %>%
    mutate(feature_subset=feature_subset_factor(feature_subset))

In [21]:
# Set `value` to 2 for the all-ones subset so the plot can colour it separately.
feat_grid <- mutate(
    feat_grid,
    value=replace(value, feature_subset == "1111111", 2)
)

In [22]:
# Grid of cross marks: feature name on x, feature subset on y, colour taken
# from `value` (converted to a factor).
feat_grid_fig <- ggplot(feat_grid, aes(x=name, y=feature_subset)) +
    geom_point(aes(colour=as.factor(value)), shape=4) +
    # Colours are assigned to the factor levels of `value` in order.
    scale_colour_manual(values=c('white', "black", "darkred")) +
    # Reverse the y axis so the first subset level is drawn at the top.
    scale_y_discrete(limits=rev) +
    ylab('Feature subset') +
    sweeps_theme +
    theme(
        legend.position='none',
        panel.grid.major.x=element_blank(),
        axis.title.x=element_blank(),
        axis.text.x=element_text(angle=45, hjust=1),
        axis.text.y=element_blank()
    )

In [23]:
# Render the feature-subset grid inline.
feat_grid_fig

In [24]:
# Feature-analysis results restricted to the two classification targets.
# `is_baseline` flags rows for the all-ones feature subset.
feats <- read_tsv(snakemake@input$feature_analysis, col_types=cols()) %>%
    filter(target %in% c("hard-vs-soft", "rnm-vs-sgv")) %>%
    mutate(
        feature_subset=feature_subset_factor(feature_subset),
        target=target_factor(target),
        # Evaluated after the conversion above, so this compares the factor.
        is_baseline=feature_subset=="1111111"
    )

In [25]:
# Lollipop chart of `value` (labelled "Accuracy") per feature subset for the
# hard-vs-soft target; the baseline (all-ones) subset is drawn in dark red.
hardsoft_feat_plot <- ggplot(filter(feats, target=="Hard vs. Soft")) +
    # Stem from x = 0 out to the value.
    geom_segment(x=0, aes(xend=value, y=feature_subset, yend=feature_subset, colour=is_baseline)) +
    geom_point(aes(x=value, y=feature_subset, colour=is_baseline)) +
    # Colours are named explicitly: unnamed values are handed out in level
    # order, so if only TRUE occurred the baseline would be drawn black.
    scale_colour_manual(values=c('FALSE'='black', 'TRUE'='darkred')) +
    # Reverse the y axis so the first subset level is drawn at the top.
    scale_y_discrete(limits=rev) +
    labs(x="Accuracy", title='Hard vs. Soft') +
    xlim(0, 1) +
    sweeps_theme +
    theme(
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        legend.position='none'
    )

In [26]:
# Render the hard-vs-soft feature-analysis plot inline.
hardsoft_feat_plot

In [27]:
# Lollipop chart of `value` (labelled "Accuracy") per feature subset for the
# RNM-vs-SGV target; the baseline (all-ones) subset is drawn in dark red.
rnmsgv_feat_plot <- ggplot(filter(feats, target=="RNM vs. SGV")) +
    # Stem from x = 0 out to the value.
    geom_segment(x=0, aes(xend=value, y=feature_subset, yend=feature_subset, colour=is_baseline)) +
    geom_point(aes(x=value, y=feature_subset, colour=is_baseline)) +
    # Colours are named explicitly: unnamed values are handed out in level
    # order, so if only TRUE occurred the baseline would be drawn black.
    scale_colour_manual(values=c('FALSE'='black', 'TRUE'='darkred')) +
    # Reverse the y axis so the first subset level is drawn at the top.
    scale_y_discrete(limits=rev) +
    labs(x="Accuracy", title='RNM vs. SGV') +
    xlim(0, 1) +
    sweeps_theme +
    theme(
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        legend.position='none'
    )

In [28]:
# Render the RNM-vs-SGV feature-analysis plot inline.
rnmsgv_feat_plot

### Plot it all together

In [29]:
# Panels A and B: hard-vs-soft confusion matrix next to its ROC curve, each
# with a one-line margin on all four sides.
hardsoft_fig <- local({
    even_margin <- theme(plot.margin = unit(rep(1, 4), "lines"))
    plot_grid(
        hardsoft_confmat_fig + even_margin,
        hardsoft_roc_fig + even_margin,
        labels=c('A', 'B')
    )
})

In [30]:
# Panels C and D: RNM-vs-SGV confusion matrix next to its ROC curve, each
# with a one-line margin on all four sides.
rnmsgv_fig <- local({
    even_margin <- theme(plot.margin = unit(rep(1, 4), "lines"))
    plot_grid(
        rnmsgv_confmat_fig + even_margin,
        rnmsgv_roc_fig + even_margin,
        labels=c('C', 'D')
    )
})

In [31]:
# Panel E: the feature-subset grid followed by the two per-target plots in one
# row, aligned horizontally on their top and bottom axes. Only the first panel
# is given a visible label text ("E").
feature_analysis_fig <- plot_grid(
    plotlist=list(
        feat_grid_fig + theme(plot.margin = unit(c(0, 0, 1, 1), "lines")),
        hardsoft_feat_plot,
        rnmsgv_feat_plot + theme(plot.margin = unit(c(0, 1, 0, 0), "lines"))
    ),
    labels=c("E", NA),
    nrow=1,
    align='h',
    axis='tb'
)

In [32]:
# Final figure: the three composite rows stacked vertically.
all_fig <- plot_grid(
    plotlist=list(hardsoft_fig, rnmsgv_fig, feature_analysis_fig),
    nrow=3
)

In [34]:
# Write the combined figure to the first Snakemake output path.
# NOTE(review): `sweeps_save` is not defined in the visible cells (presumably
# it comes from utils/plot.R); the meaning of `width` and `asp` should be
# confirmed there.
sweeps_save(snakemake@output[[1]], all_fig, width=7, asp=1)