In [2]:
%%bash

# Create every output sub-directory used by the empirical-window pipeline.
for subdir in ms features npy npy-log-scale; do
    mkdir -p "output/empirical-windows/${subdir}"
done

In [3]:
from pathlib import Path

In [4]:
# All values below are read from the `snakemake` object, which is not defined
# in this notebook's visible cells.

# Wildcard naming the window handled by this run.
NAME = snakemake.wildcards['window']

# Input files.
genotype_file = snakemake.input['genotypes']
NORM_STATS = snakemake.input['normalization_stats']

# Output files, unpacked in the order: ms, features, stats.
ms_file, feature_file, stats_file = (
    snakemake.output[key] for key in ('ms', 'features', 'stats')
)

### Convert to ms format

In [5]:
from utils.conversion import vcftools_to_ms

In [6]:
# Convert the genotype input to ms format.
# NOTE(review): presumably reads `genotype_file` and writes `ms_file` --
# confirm against utils.conversion.vcftools_to_ms.
vcftools_to_ms(genotype_file, ms_file)

### Calculate features

In [7]:
from utils.feature_calculation import get_windows, calculate_features
from utils.project_parameters import locus_size, data_dimension, default_summary_statistics, smallest_window

In [19]:
# The window name is underscore-delimited; its third field is parsed as the
# integer start position handed to get_windows.
start_field = NAME.split('_')[2]
start = int(start_field)

window_sizes, center_pos_dict = get_windows(
    locus_size,
    data_dimension,
    start_pos=start,
    smallest_window=smallest_window,
)

In [20]:
# Compute the project's default summary statistics from the ms file, using the
# window sizes and center positions obtained from get_windows.
# NOTE(review): presumably writes the features to `feature_file` and the
# statistics to `stats_file` -- confirm against
# utils.feature_calculation.calculate_features.
calculate_features(
    ms_file=ms_file,
    summary_statistics=default_summary_statistics,
    center_pos=center_pos_dict,
    window_sizes=window_sizes,
    output_file=feature_file,
    output_stats=stats_file,
)

### Normalize features

In [10]:
import numpy as np

from utils.project_parameters import summary_statistic_order
from utils.feature_calculation import get_normalization_stats, normalize_features

def _save_arrays(arrays, outdir):
    """Save every array in ``arrays`` as ``<outdir>/<data_id>.npy``.

    Parameters
    ----------
    arrays : mapping
        Maps a data id to the array to store; iterated with ``.items()``.
    outdir : str or pathlib.Path
        Destination directory. It is created (including parents) when it does
        not exist yet, so saving does not depend on the directory having been
        created by an earlier cell.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    for data_id, array in arrays.items():
        np.save(f"{outdir}/{data_id}.npy", array)


norm_stats_dict = get_normalization_stats(NORM_STATS, summary_statistic_order)

# Normalization without log scaling, saved to the configured output directory.
normalized = normalize_features(feature_file, summary_statistic_order, norm_stats_dict,
                                reshape=True, convert_to_uint8=True, log_scale=False)
_save_arrays(normalized, snakemake.params['outdir'])

# Also do log-scale normalization; results go to the sibling
# 'npy-log-scale' directory next to the configured output directory.
normalized_log = normalize_features(feature_file, summary_statistic_order, norm_stats_dict,
                                    reshape=True, convert_to_uint8=True, log_scale=True)
log_scale_outdir = Path(snakemake.params['outdir']).parent/'npy-log-scale'
_save_arrays(normalized_log, log_scale_outdir)