In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import allel

### Genotypes

In [None]:
# Load the VCF named by the Snakemake rule; the cells below index the result
# by field name (e.g. "calldata/GT", "variants/POS").
vcf_path = snakemake.input["vcf"]
vcf = allel.read_vcf(vcf_path)

In [None]:
# Build the per-site allele-count table (one row per variant) with the columns
# that are written out per chromosome below: position, x, n, folded.
#
# Only the first allele of every genotype call is kept, i.e. each individual
# contributes a single haplotype.
first_alleles = vcf["calldata/GT"][:, :, 0]

# scikit-allel encodes missing calls as negative allele indices. Summing the
# raw indices would let every missing call subtract from `x` while still being
# counted in `n`, and would count an allele with index 2 twice. Count the
# haplotypes explicitly instead.
is_called = first_alleles >= 0
is_alternate = first_alleles > 0

# NOTE(review): a site where every call is missing ends up with n == 0;
# confirm whether such rows should be filtered out before they reach the
# downstream tool.
variants = pd.DataFrame({
    "chrom": vcf["variants/CHROM"],
    "position": vcf["variants/POS"],
    "x": is_alternate.sum(axis=1),  # haplotypes carrying a non-reference allele
    "n": is_called.sum(axis=1),     # haplotypes with a non-missing call
    "folded": 1,
})

In [None]:
# Write one tab-separated table per chromosome into the directory of the first
# declared per-chromosome output. The chromosome is carried by the file name,
# so the column itself is dropped from the table.
first_output = Path(snakemake.output["input_per_chrom"][0])
save_folder = first_output.parent
for chrom in pd.unique(variants["chrom"]):
    on_chrom = variants["chrom"] == chrom
    chrom_data = variants[on_chrom].drop(columns="chrom")
    out_path = save_folder / f"clr_{chrom}.tsv"
    chrom_data.to_csv(out_path, sep="\t", index=False)

### Genome-wide SFS

In [None]:
# NOTE: this rebinds `variants` (the per-site DataFrame built earlier) to a
# GenotypeArray; re-running the per-chromosome export after this cell would
# operate on the wrong object.
genotype_calls = vcf["calldata/GT"]
variants = allel.GenotypeArray(genotype_calls)

In [None]:
# Per-variant allele counts, folded into a minor-allele-count spectrum.
allele_counts = variants.count_alleles()
folded_sfs = allel.sfs_folded(allele_counts)

In [None]:
# Tabulate the spectrum: entry i of `folded_sfs` becomes the row with
# num_minor == i.
minor_counts = range(len(folded_sfs))
sfs_df = pd.DataFrame(dict(num_minor=minor_counts, num_sites=folded_sfs))

At this point, each SNP can have between 0 and 205 minor allele copies in our total sample of 410 haploid genomes. However, our real sample is 205 haploid genomes, since every individual is supposed to be homozygous. Therefore, any SNP with an odd minor allele count must include at least one heterozygous call, and is therefore less trustworthy. We get rid of all of them. In the end, we produce a folded SFS designed for a haploid sample of size 205, where each SNP can have between 0 and 102 minor allele copies.

Finally, for SweepFinder, the number of sites in each class has to be expressed as a proportion of all sites. It also requires a row for 0 minor alleles with 0 sites in it.

In [None]:
# Keep only the even minor-allele counts, halve them to express the spectrum
# per haploid individual, and turn site counts into proportions of the
# retained sites.
has_even_count = sfs_df["num_minor"].mod(2).eq(0)
even_rows = sfs_df[has_even_count].reset_index(drop=True)
retained_sites = even_rows["num_sites"].sum()

# NOTE(review): the row for 0 minor alleles keeps whatever count the spectrum
# holds; nothing here forces it to 0 - confirm the input cannot contribute
# sites to that class.
sfs_df = even_rows.assign(
    num_minor=(even_rows["num_minor"] / 2).astype(int),
    num_sites=even_rows["num_sites"] / retained_sites,
)

In [None]:
# Write the spectrum as headerless, tab-separated rows (count, proportion).
sfs_path = snakemake.output["genomewide_sfs"]
sfs_df.to_csv(
    sfs_path,
    sep="\t",
    index=False,
    header=False,
)