In [1]:
import pickle
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py

%matplotlib inline

In [2]:
#compute mendelian errors (biallelic)
def compute_mendelian_errors(mother, father, offspring):
    num_errors = 0
    num_ofs_problems = 0
    if len(mother.union(father)) == 1:
        # Mother and father are homo and the same
        for ofs in offspring:
            if len(ofs) == 2:
                # Offspring is het
                num_errors += 1
                num_ofs_problems += 1
            elif len(ofs.intersection(mother)) == 0:
                # Offspring is homo, but opposite from parents
                num_errors += 2
                num_ofs_problems += 1
    elif len(mother) == 1 and len(father) == 1:
        # Mother and father are homo and different
        for ofs in offspring:
            if len(ofs) == 1:
                # Homo, should be het
                num_errors += 1
                num_ofs_problems += 1
    elif len(mother) == 2 and len(father) == 2:
        # Both are het, individual offspring can be anything
        pass
    else:
        # One is het, the other is homo
        homo = mother if len(mother) == 1 else father
        for ofs in offspring:
            if len(ofs) == 1 and ofs.intersection(homo):
                # homo, but not including the allele from parent that is homo
                num_errors += 1
                num_ofs_problems += 1
    return num_errors, num_ofs_problems

In [3]:
#compute probabilities

In [4]:
h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r')
calldata_genotype = h5_3L['/3L/calldata/genotype']
MQ0 = h5_3L['/3L/variants/MQ0']
num_alleles = h5_3L['/3L/variants/num_alleles']
is_snp = h5_3L['/3L/variants/is_snp']
POS = h5_3L['/3L/variants/POS']
samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples']))

In [5]:
def get_family_indexes(samples_hdf5, cross_pd):
    offspring = []
    for i, individual in cross_pd.T.iteritems():
        index = samples_hdf5.index(individual.id)
        if individual.function == 'parent':
            if individual.sex == 'M':
                father = index
            else:
                mother = index
        else:
            offspring.append(index)
    return {'mother': mother, 'father': father, 'offspring': offspring}

In [6]:
samples = pd.read_csv('samples.tsv', sep='\t')
cross_pd = samples[samples['cross'] == 'cross-29-2']
family_indexes = get_family_indexes(samples_hdf5, cross_pd)

In [28]:
def acceptable_position_to_genotype():
    for i, genotype in enumerate(calldata_genotype):
        if is_snp[i] and num_alleles[i] == 2:
            if len(np.where(genotype == -1)[0]) > 1:
                # Missing data
                continue
            yield i

def acumulate(fun):
    acumulator = {}
    for res in fun():
        if res is not None:
            acumulator[res[0]] = res[1]
    return acumulator

In [29]:
mother_index = family_indexes['mother']
father_index = family_indexes['father']
offspring_indexes = family_indexes['offspring']
all_errors = {}


def get_mendelian_errors():
    for i in acceptable_position_to_genotype():
        genotype = calldata_genotype[i]
        mother = set(genotype[mother_index])
        father = set(genotype[father_index])
        offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes]
        my_mendelian_errors = compute_mendelian_errors(mother, father, offspring)
        yield POS[i], my_mendelian_errors
        #if i % 100000 == 0:
        #    print(POS[i])
        #if i > 100000:
        #    break
        #if mendelian_errors[0] > 0:
        #    #print(POS[i], True, mother, father, offspring)
        #    yield POS[i], my_mendelian_errors
        #else:
        #    yield None

mendelian_errors = acumulate(get_mendelian_errors)

106259
259481
348831
545001
653431
797284


In [30]:
pickle.dump(mendelian_errors, open('mendelian_errors.pickle', 'wb'))

In [31]:
mendelian_errors = pickle.load(open('mendelian_errors.pickle', 'rb'))

In [33]:
len(mendelian_errors), len(list(filter(lambda x: x[0] > 0,mendelian_errors.values())))

(60949, 4248)

In [12]:
def get_stats():
    error_mq0 = []
    ok_mq0 = []
    for i in acceptable_position_to_genotype():
        my_mendelian_errors = mendelian_errors.get(POS[i], (0, 0))
        if mendelian_errors[0] > 0:
            error_mq0.append(MQ0[i])
        else:
            ok_mq0.append(MQ0[i])
    return {
        'MQ0': {
            'errors': error_mq0,
            'OK': ok_mq0
        }
    }

In [None]:
mendelian_stats = get_stats()
fig, axs = plt.subplots(1, 1, squeeze=False, fig_size=(16, 9))
ax = axs[0][0]
MQ0_stats = mendelian_stats['MQ0']
ax.set_title('MQ0')
ax.boxplot([MQ0_stats['errors'], MQ0_stats['OK']])

In [None]:
#Stats by position

In [13]:
import numpy as np
a = np.array([[-1, -1]])
np.where(a==-1)

(array([0, 0]), array([0, 1]))