In [15]:
from pathlib import Path
from random import sample
import tarfile
from io import TextIOWrapper
from collections import Counter

import tqdm.notebook
import numpy as np
import pandas as pd

In [3]:
from utils.conversion import ms_to_numpy
from utils.prepare_data import save_data

In [12]:
# Locate the neutral-simulation folder inside the configured additional-simulations directory.
additional_sim_root = Path(snakemake.config["additional_sim_folder"])
neutral_sim_folder = next(additional_sim_root.glob('*neutral*'), None)
if neutral_sim_folder is None:
    # Without a default, next() would surface as an uninformative StopIteration.
    raise FileNotFoundError(
        f"No entry matching '*neutral*' found in {additional_sim_root}"
    )

# Each .tar.gz archive under genotypes/ bundles several simulation output files.
genotypes_files = list(neutral_sim_folder.glob('genotypes/*.tar.gz'))

# Count the non-directory members across all archives; this total is used later
# as the denominator when normalising the site counts.
num_ms_files = 0
for ms_file in genotypes_files:
    with tarfile.open(ms_file) as f:
        num_ms_files += sum(1 for member in f.getmembers() if not member.isdir())

if num_ms_files == 0:
    # Fail here with a clear message rather than later, when this is used as a divisor.
    raise ValueError(
        f"No simulation files found in the archives under {neutral_sim_folder / 'genotypes'}"
    )

In [11]:
# Accumulate a spectrum over every simulation file in every archive.
# Keys are the per-row sums of each genotype array (presumably the number of
# alternate alleles at a site -- confirm against ms_to_numpy); values are how
# many rows produced that sum.
sfs = Counter()
for file in tqdm.notebook.tqdm(genotypes_files):
    with tarfile.open(file) as f:
        members = f.getmembers()
        for member in members:
            if member.isdir():
                continue
            # extractfile() yields a binary stream; wrap it as text for the parser
            # and close the wrapper as soon as this member has been parsed.
            with TextIOWrapper(f.extractfile(member)) as fileobj:
                positions, genotypes = ms_to_numpy(fileobj)
            sfs.update(genotypes.sum(axis=1))

In [16]:
# Split the spectrum into parallel key/value sequences. Reading keys and values
# directly (instead of unpacking zip(*sfs.items())) also works when the counter
# is empty, yielding an empty table rather than a ValueError on unpacking.
keys = tuple(sfs.keys())
values = tuple(sfs.values())
# We divide the counts by the number of ms files; each file is 1Mb, so we're getting the number of sites per megabase
sfs_df = pd.DataFrame({
    'num_alternate': keys,
    'num_sites': np.array(values) / num_ms_files,
}).sort_values(by='num_alternate')

In [17]:
# Show the resulting table as the cell output for visual inspection.
sfs_df

In [18]:
# Persist the table via the project's save_data helper to the first entry of
# snakemake.output (`snakemake` is not defined in this notebook; presumably it
# is injected by Snakemake when the notebook is run as part of a rule).
save_data(sfs_df, snakemake.output[0])