In [2]:
import random
import sys
from itertools import product
from os import listdir

import numpy as np

from inheritance_states import AutosomalInheritanceStates
from input_output import WGSData, write_to_file, pull_families
from transition_matrices import AutosomalTransitionMatrix
from genotypes import Genotypes
from losses import LazyLoss
from viterbi import viterbi_forward_sweep_autosomes, viterbi_backward_sweep_autosomes
from mask import mask_states

In [3]:
# Run parameters. These are fixed values for interactive use; nothing is
# actually parsed from the command line in this notebook.
chrom = '15'                                  # chromosome label embedded in the data file names
m = 4                                         # family size handed to pull_families and the model classes
ped_file = '../data/160826.ped.quads.ped'     # pedigree file
data_dir = '../split_gen_miss'                # directory holding the per-chromosome data files
batch_size = None                             # forwarded to pull_families
batch_num = None                              # not referenced by the cells visible here
batch_offset = None                           # forwarded to pull_families

# set up filenames
sample_file = '%s/chr.%s.gen.samples.txt' % (data_dir, chrom)
coord_file = '%s/chr.%s.gen.coordinates.npy' % (data_dir,  chrom)

# Match on 'chr.<chrom>.' including the trailing dot. A bare 'chr.<chrom>'
# substring test would let chrom '1' pick up chr.10 ... chr.19 files as well.
# NOTE(review): assumes genotype files share the 'chr.<chrom>.' prefix used by
# the sample and coordinate files above - confirm against the data directory.
chrom_tag = 'chr.%s.' % chrom
gen_files = sorted(f for f in listdir(data_dir) if chrom_tag in f and 'gen.npz' in f)


In [4]:
# Select the families with m members; later cells unpack each entry of the
# result as a (family key, individuals) pair.
families_of_this_size = pull_families(
    sample_file,
    ped_file,
    m,
    batch_size,
    batch_offset,
)

families with sequence data 2055
families of size 4: 2049
families pulled 4: 2049


In [5]:
# Transition costs: four entries of 10 followed by 2*(m-2) entries of 500.
# NOTE(review): presumably one cost per component of an inheritance state
# (4 + 2*(m-2) = 8 for m = 4) - confirm against AutosomalTransitionMatrix.
shift_costs = [10] * 4
shift_costs.extend([500] * (2 * (m - 2)))

# The objects below are built in a fixed order; keep it, since the captured
# output of this cell follows the same sequence.

# inheritance states for a family of m members
inheritance_states = AutosomalInheritanceStates(m)

# transition matrix over those states, weighted by shift_costs
transition_matrix = AutosomalTransitionMatrix(
    inheritance_states,
    shift_costs,
)

# genotypes for a family of m members
genotypes = Genotypes(m)

# loss function built from the states and genotypes
loss = LazyLoss(
    m,
    inheritance_states,
    genotypes,
)

# handle used later to pull processed WGS data for individual families
wgs_data = WGSData(
    data_dir,
    gen_files,
    coord_file,
    sample_file,
    chrom,
)

inheritance states (64, 8)
transitions (64, 18)
genotypes (256, 4)
perfect matches (110, 4)
chrom shape only SNPs (2164322,)


In [6]:
# Family to analyse, identified by the key stored as the first element of each
# entry in families_of_this_size.
# To analyse a randomly chosen family instead, use:
#   fkey, inds = random.choice(families_of_this_size)
target_family = ('AU0012.AU001203.AU001204', 'AU001201', 'AU001202')

# Stop at the first matching entry rather than building a full filtered list,
# and fail with a message that names the missing key instead of a bare
# IndexError from indexing an empty list.
selected = next((x for x in families_of_this_size if x[0] == target_family), None)
if selected is None:
    raise LookupError('family %s not found in families_of_this_size' % (target_family,))

fkey, inds = selected
print('family', fkey, inds)

family ('AU0012.AU001203.AU001204', 'AU001201', 'AU001202') ['AU001201', 'AU001202', 'AU001204', 'AU001203']


In [None]:
# Fetch the data for the individuals of the selected family.
(
    family_genotypes,
    family_snp_positions,
    mult_factor,
) = wgs_data.pull_data_for_individuals(inds)
print('Data pulled')

# Forward pass of the Viterbi algorithm over the family's data.
v_cost = viterbi_forward_sweep_autosomes(
    family_genotypes,
    family_snp_positions,
    mult_factor,
    inheritance_states,
    transition_matrix,
    loss,
)

# Backward pass: derive the final states from the forward-pass result.
final_states = viterbi_backward_sweep_autosomes(
    v_cost,
    inheritance_states,
    transition_matrix,
)
