In [2]:
%%bash

# Make sure the directory for the command files exists (-p: create parents, no error if present).
# NOTE(review): presumably the same directory as snakemake.output[0] used in the last cell - confirm.
mkdir -p output/empirical-windows/commands

In [3]:
import re
from pathlib import Path

import pandas as pd

from utils.project_parameters import locus_size

## Plan what windows to take

### Parameters

In [4]:
# Chromosome sizes come from Release 5 of the _D. melanogaster_ genome:
# https://www.ncbi.nlm.nih.gov/assembly/GCF_000001215.2/
# Chromosome 4 (tiny) and X (a sex chromosome) are deliberately left out,
# so no genomewide windows are generated for them.

chrom_sizes = {
    '2L': 23011544,
    '2R': 21146708,
    '3L': 24543557,
    '3R': 27905053,
    # '4': 1351857,
    # 'X': 22422827,
}

# Known sweeps: position of each sweep on its chromosome. The sweep windows
# built in the next cell are centered on these coordinates.

known_pos = {
    'ace': 9069408,
    'chkov': 21150000,
    'cyp': 8072884,
}

# Chromosome arm of each known sweep (same keys as known_pos).
known_sweep_chroms = {
    'ace': '3R',
    'chkov': '3R',
    'cyp': '2R',
}

Start a DataFrame with the known sweep loci:

In [5]:
# One record per known sweep: a window reaching locus_size//2 to each side of
# the sweep position, checked to end before the end of its chromosome.
half_window = locus_size//2
records = []

for sweep_name, sweep_center in known_pos.items():
    sweep_chrom = known_sweep_chroms[sweep_name]
    window_start = int(sweep_center - half_window)
    window_end = int(sweep_center + half_window)
    assert window_end < chrom_sizes[sweep_chrom]
    records.append((
        f"sweep-{sweep_name}_{sweep_chrom}_{window_start}_{window_end}",
        sweep_name,
        sweep_chrom,
        window_start,
        window_end,
        sweep_center,
    ))

sweeps_df = pd.DataFrame.from_records(
    records,
    columns=['identifier', 'locus', 'chrom', 'start', 'end', 'center'],
)

In [6]:
# Display the table of sweep windows built above.
sweeps_df

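### Add tiled genomewide windows

Windows spanning `locus_size` bp are laid out along each chromosome, each one starting `locus_size//5` bp after the previous one (so consecutive windows overlap).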

In [7]:
# Despite its name, this is the stride between the starts of consecutive
# windows; neighbouring windows therefore share locus_size - stride positions.
windows_overlaps = locus_size//5

new_windows = []

for chrom_name, chrom_length in chrom_sizes.items():
    window_start = 1
    # Keep tiling for as long as the window still ends before the chromosome end.
    while window_start + locus_size < chrom_length:
        window_end = window_start + locus_size
        new_windows.append([
            f"genomewide_{chrom_name}_{window_start}_{window_end}",
            chrom_name,
            window_start,
            window_end,
            (window_start + window_end)//2,
        ])
        window_start = window_start + windows_overlaps

genomewide_df = pd.DataFrame.from_records(
    new_windows,
    columns=['identifier', 'chrom', 'start', 'end', 'center'],
)

In [8]:
# Display the table of tiled genomewide windows built above.
genomewide_df

In [9]:
# Stack sweep windows on top of the genomewide ones; genomewide rows have no
# 'locus' value, so that column is filled with NaN for them.
all_window_tables = [sweeps_df, genomewide_df]
df = pd.concat(all_window_tables)

### Prepare commands

In [10]:
# Shell command template for one window. The upper-case tokens (CHROM, START,
# END, ACE?, NAME) are placeholders filled in by get_command(); stderr of the
# resulting command is discarded.
template = " ".join([
    "vcftools",
    "--gzvcf output/dgrp2/imputed.vcf.gz",
    "--chr CHROM",
    "--from-bp START",
    "--to-bp END",
    "--mac 1",
    "--012 ACE?",
    '--out "output/empirical-windows/genotypes/NAME"',
    "2>/dev/null",
])

In [11]:
def get_command(template, name, chrom, start, end, ace=False, locus=None):
    """Fill the placeholders of a command template for a single window.

    The tokens ``NAME``, ``CHROM``, ``START``, ``END`` and ``ACE?`` in
    ``template`` are substituted in a single pass, so text inserted for one
    token is never rescanned for another (a ``name`` containing ``END``, for
    instance, is left intact).

    Parameters
    ----------
    template : str
        Command text containing the placeholder tokens.
    name : str
        Replacement for ``NAME``.
    chrom : str
        Replacement for ``CHROM``.
    start, end
        Replacements for ``START`` and ``END``; converted with ``str()``.
    ace : bool, optional
        When true, ``ACE?`` becomes an ``--exclude-positions`` option pointing
        at ``output/resistant-lines/ace-genotype.012.pos``; otherwise it is
        replaced by the empty string.
    locus : optional
        Unused; accepted so existing callers keep working.

    Returns
    -------
    str
        The template with every placeholder substituted.
    """
    if ace:
        ace_repl = "--exclude-positions output/resistant-lines/ace-genotype.012.pos"
    else:
        ace_repl = ""
    substitutions = {
        'NAME': name,
        'CHROM': chrom,
        'START': str(start),
        'END': str(end),
        'ACE?': ace_repl,
    }
    # re.escape is required because 'ACE?' contains a regex metacharacter.
    placeholder = re.compile("|".join(re.escape(token) for token in substitutions))
    return placeholder.sub(lambda match: substitutions[match.group(0)], template)

In [12]:
# Write one `<identifier>.command` file per window into the Snakemake output directory.
# NOTE(review): `snakemake` is not defined anywhere in this notebook; presumably it is
# injected when the notebook is run through Snakemake - confirm.
commands_dir = Path(snakemake.output[0])
# Create the directory that is actually written to, rather than relying on the
# hard-coded path of the %%bash cell matching snakemake.output[0].
commands_dir.mkdir(parents=True, exist_ok=True)

for ix, row in df.iterrows():
    # The ace exclusion list is applied to every window on chromosome 3R.
    command = get_command(
        template, row.identifier, row.chrom, row.start, row.end, ace=row.chrom=='3R', locus=row.locus
    )
    # Build the command first so a failure does not leave an empty file behind.
    with open(commands_dir/f"{row.identifier}.command", "w") as f:
        f.write(command)