In [2]:
source("utils/plot.R")

In [46]:
# Fold a raw site frequency spectrum (SFS) into a minor-allele spectrum.
#
# Args:
#   sfs_raw: table with a `num_alternate` column (alternate-allele count of a
#            frequency class) and a `num_sites` column (sites in that class).
#   N:       total number of sampled allele copies; classes with
#            `num_alternate` equal to 0 or N are treated as monomorphic.
#
# Returns:
#   One row per minor-allele count, with columns `num_minor`, `num_sites`
#   (summed over the classes that fold onto the same minor count) and
#   `freq` (`num_minor / N`).
#
# Uses dplyr-style verbs (filter/mutate/group_by/summarize), which must
# already be attached when this is called.
get_folded_sfs <- function(sfs_raw, N) {
    # Discard classes where the alternate allele is absent or fixed
    polymorphic <- filter(sfs_raw, num_alternate != 0, num_alternate != N)

    # The minor allele is whichever of alternate/reference is rarer
    folded <- mutate(
        polymorphic,
        num_minor=pmin(num_alternate, N - num_alternate)
    )

    # Counts k and N - k collapse onto the same minor-allele class
    per_class <- summarize(
        group_by(folded, num_minor),
        num_sites=sum(num_sites)
    )

    mutate(per_class, freq=num_minor/N)
}

In [96]:
# Genome length in megabases; used to express the empirical SFS per Mb.
GENOME_SIZE_IN_MB <- 136826056/1e6

# Number of equal-width frequency bins the folded SFS is collapsed into
# for plotting.
NUM_SFS_BINS <- 15

# Empirical SFS: space-delimited file without a header, read as two columns
# named `num_sites` and `num_alternate` (in that order).
empirical_sfs <- read_delim(
    snakemake@input$empirical,
    col_types=cols(num_alternate=col_number(), num_sites=col_number()),
    delim=" ", col_names=c('num_sites', 'num_alternate')) %>%
    # 205 homozygous individuals, each representing a homozygous line
    get_folded_sfs(N=410) %>%
    # Divide by megabases in genome to get SFS per megabase
    mutate(num_sites = num_sites/GENOME_SIZE_IN_MB)

# Simulated SFS: tab-separated file with a header; column types are guessed.
# NOTE(review): unlike the empirical SFS, these counts are not divided by a
# genome size here -- presumably the simulated file is already per Mb; confirm.
simulated_sfs <- read_tsv(snakemake@input$simulated, col_types=cols()) %>%
    # In simulations, the sample size is just 205, since we aren't sampling homozygous individuals
    get_folded_sfs(N=205)

sfs <- bind_rows(Empirical=empirical_sfs, Simulated=simulated_sfs, .id='Dataset') %>%
    # Bin edges are computed on the pooled frequencies of both datasets, so
    # a given bin index refers to the same frequency interval in each.
    mutate(bin=cut(freq, breaks=NUM_SFS_BINS, labels=FALSE)) %>%
    group_by(Dataset, bin) %>%
    summarize(total_sites = sum(num_sites)) %>%
    # summarize() leaves the result grouped by Dataset. Drop the grouping and
    # scale by the fixed bin count rather than a per-dataset max(bin), so a
    # dataset with an empty top bin still lands on the same x positions.
    ungroup() %>%
    # Map bin 1 to 0 and the last bin to 0.5 on the plotting axis
    mutate(bin_freq = (bin - 1)*0.5/(NUM_SFS_BINS - 1))

In [97]:
# Plot the binned SFS of each dataset as a line with point markers, coloured
# by dataset, then save the figure to the first Snakemake output path.
# `sweeps_colour`, `sweeps_theme` and `sweeps_save` are not defined in this
# section -- presumably they come from utils/plot.R.
fig <- ggplot(sfs, aes(x=bin_freq, y=total_sites, colour=Dataset)) +
    geom_line() +
    # Shape is mapped on the point layer only
    geom_point(aes(shape=Dataset), size=2) +
    # Folded frequencies lie in [0, 0.5]
    xlim(0, 0.5) +
    # One marker shape per dataset
    scale_shape_manual(values=c(1, 4)) +
    sweeps_colour +
    labs(x="Frequency", y="Sites per Mb") +
    sweeps_theme +
    # Added after sweeps_theme so these legend settings take precedence
    theme(legend.position='top', legend.title=element_blank())

sweeps_save(snakemake@output[[1]], fig, width=3)