In [1]:
import pickle
import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py

%matplotlib inline

In [2]:
#compute mendelian errors (biallelic)
def compute_mendelian_errors(mother, father, offspring):
    num_errors = 0
    num_ofs_problems = 0
    if len(mother.union(father)) == 1:
        # Mother and father are homo and the same
        for ofs in offspring:
            if len(ofs) == 2:
                # Offspring is het
                num_errors += 1
                num_ofs_problems += 1
            elif len(ofs.intersection(mother)) == 0:
                # Offspring is homo, but opposite from parents
                num_errors += 2
                num_ofs_problems += 1
    elif len(mother) == 1 and len(father) == 1:
        # Mother and father are homo and different
        for ofs in offspring:
            if len(ofs) == 1:
                # Homo, should be het
                num_errors += 1
                num_ofs_problems += 1
    elif len(mother) == 2 and len(father) == 2:
        # Both are het, individual offspring can be anything
        pass
    else:
        # One is het, the other is homo
        homo = mother if len(mother) == 1 else father
        for ofs in offspring:
            if len(ofs) == 1 and ofs.intersection(homo):
                # homo, but not including the allele from parent that is homo
                num_errors += 1
                num_ofs_problems += 1
    return num_errors, num_ofs_problems

In [3]:
#compute probabilities

In [4]:
h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r')
calldata_genotype = h5_3L['/3L/calldata/genotype']
MQ0 = h5_3L['/3L/variants/MQ0']
num_alleles = h5_3L['/3L/variants/num_alleles']
is_snp = h5_3L['/3L/variants/is_snp']
POS = h5_3L['/3L/variants/POS']
samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples']))

In [5]:
def get_family_indexes(samples_hdf5, cross_pd):
    offspring = []
    for i, individual in cross_pd.T.iteritems():
        index = samples_hdf5.index(individual.id)
        if individual.function == 'parent':
            if individual.sex == 'M':
                father = index
            else:
                mother = index
        else:
            offspring.append(index)
    return {'mother': mother, 'father': father, 'offspring': offspring}

In [6]:
samples = pd.read_csv('samples.tsv', sep='\t')
cross_pd = samples[samples['cross'] == 'cross-29-2']
family_indexes = get_family_indexes(samples_hdf5, cross_pd)

In [7]:
def acceptable_position_to_genotype():
    for i, genotype in enumerate(calldata_genotype):
        if is_snp[i] and num_alleles[i] == 2:
            if len(np.where(genotype == -1)[0]) > 1:
                # Missing data
                continue
            yield i

def acumulate(fun):
    acumulator = {}
    for res in fun():
        if res is not None:
            acumulator[res[0]] = res[1]
    return acumulator

In [8]:
mother_index = family_indexes['mother']
father_index = family_indexes['father']
offspring_indexes = family_indexes['offspring']
all_errors = {}


def get_mendelian_errors():
    for i in acceptable_position_to_genotype():
        genotype = calldata_genotype[i]
        mother = set(genotype[mother_index])
        father = set(genotype[father_index])
        offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes]
        mendelian_errors = compute_mendelian_errors(mother, father, offspring)
        if i % 10000 == 0:
            print(POS[i])
        if mendelian_errors[0] > 0:
            #print(POS[i], True, mother, father, offspring)
            yield POS[i], mendelian_errors
        else:
            yield None

mendelian_errors = acumulate(get_mendelian_errors)

106259
259481
348831
545001
653431
797284
1015091
1136562
1235632
1311525
1390799
1718716
1866239
1949380
2029964
2108877
2182476
2261237
2352302
2421115
2484642
2551290
2616920
2681504
2767553
2957077
3025191
3096162
3156766
3232421
3302197
3394622
3471575
3551139
3633356
3706782
3846533
3925870
4006051
4073932
4142423
4294788
4362543
4414670
4920246
5162614
5234027
5302690
5376137
5450573
5517180
5659141
5733285
5802923
5871510
5944102
6009332
6205296
6267887
6329963
6388365
6469163
6540508
6607613
6665994
6771707
6826195
6927324
7165021
7213924
7344662
7467127
7528462
7582564
7624854
7666064
7871060
7921685
7973386
8019282
8063476
8116466
8162422
8210422
8307787
8355076
8404911
8439014
8474576
8542599
8579592
8626950
8704574
8752917
8835767
8982647
9027563
9071774
9114147
9157921
9196431
9292985
9336401
9373726
9407536
9449863
9483366
9693825
9827984
9941466
10044890
10082057
10118936
10208107
10282433
10360910
10403572
10441893
10474284
10506787
10542480
10616873
10653157
10720274


37125522
37143598
37161459
37263103
37282469
37326870
37347533
37367797
37410901
37450671
37478545
37499339
37579713
37613825
37636057
37661358
37704287
37723465
37744466
37768230
37790516
37809869
37828915
37869561
37920759
37940811
37959169
37983565
38010288
38031660
38057759
38081916
38104116
38132845
38151197
38194876
38285131
38313285
38367980
38408988
38429656
38448222
38478387
38496803
38518921
38539870
38562150
38581863
38600050
38621558
38650924
38728585
38765770
38820835
38839024
38857147
38876289
38895017
38915070
38933385
38951457
39030190
39048300
39067041
39085012
39106460
39127563
39170382
39194900
39216858
39235743
39276191
39298365
39319995
39339750
39357692
39375882
39395094
39417372
39438503
39474371
39492410
39535414
39555346
39580068
39616472
39666997
39693840
39721975
39741422
39762201
39813308
39880257
39920019
39957659
39976767
39999122
40016422
40073744
40094213
40130131
40149780
40168567
40221141
40238606
40290874
40310790
40401725
40437316
40454839
40471930
4

In [9]:
pickle.dump(mendelian_errors, open('mendelian_errors.pickle', 'wb'))

In [10]:
mendelian_errors = pickle.load(open('mendelian_errors.pickle', 'rb'))

In [11]:
len(mendelian_errors)

541688

In [12]:
def get_stats():
    error_mq0 = []
    ok_mq0 = []
    for i in acceptable_position_to_genotype():
        my_mendelian_errors = mendelian_errors.get(POS[i], (0, 0))
        if mendelian_errors[0] > 0:
            error_mq0.append(MQ0[i])
        else:
            ok_mq0.append(MQ0[i])
    return {
        'MQ0': {
            'errors': error_mq0,
            'OK': ok_mq0
        }
    }

In [None]:
mendelian_stats = get_stats()
fig, axs = plt.subplots(1, 1, squeeze=False, fig_size=(16, 9))
ax = axs[0][0]
MQ0_stats = mendelian_stats['MQ0']
ax.set_title('MQ0')
ax.boxplot([MQ0_stats['errors'], MQ0_stats['OK']])

In [13]:
import numpy as np
a = np.array([[-1, -1]])
np.where(a==-1)

(array([0, 0]), array([0, 1]))