Skip to content

Commit

Permalink
Optimized the impute2 parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
legaultmarc committed Apr 23, 2015
1 parent 17ece2c commit cb59666
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 51 deletions.
58 changes: 22 additions & 36 deletions gepyto/formats/impute2.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,54 +209,40 @@ def _compute_dosage(line, prob_threshold=0, is_chr23=False,
sex_vector=None):
"""Computes dosage from probabilities (IMPUTE2)."""
# Type check
type(line) is line
assert type(line) is _Line

# Constructing the Pandas data frame
data = pd.DataFrame({
"d1": line.probabilities[:, 0],
"d2": line.probabilities[:, 1],
"d3": line.probabilities[:, 2],
})
# We set the dosage as being the expected number of "a2" alleles for
# all samples.
# Given that the rows represent p_aa, p_ab, p_bb, we have that
# E(#b) = 2 * p_bb + p_ab, the dosage.
dosage = 2 * line.probabilities[:, 2] + line.probabilities[:, 1]

# Probability filtering. We mask samples with low probabilities.
if prob_threshold > 0:
dosage[~np.any(line.probabilities > prob_threshold, axis=1)] = np.nan

# Compute the maf.
mac = np.nansum(dosage)
maf = mac / (2 * np.sum(~np.isnan(dosage)))

# Getting the maximal probabilities for each sample
data["dmax"] = data.max(axis=1)
# If maf > 0.5, we need to flip.
major, minor = (line.a1, line.a2)
if maf > 0.5:
maf = 1 - maf
dosage = 2 - dosage # 0 -> 2, 1 -> 1, 2 -> 0.
major, minor = minor, major

# Computing the A1 frequency
if is_chr23:
# TODO: Implement this, please
raise NotImplementedError("dosage for chromosome 23 is not yet "
"supported (because dosage is computed "
"differently for males on chromosome 23)")

else:
sub_data = data[data.dmax >= prob_threshold][["d1", "d2", "d3"]]
count = Counter(sub_data.idxmax(axis=1))
total_count = (sum(count.values()) * 2)
a1_count = ((count["d1"] * 2) + count["d2"])
a1_freq = a1_count / total_count

# Computing the dosage
minor_allele = "d1"
minor = line.a1
major = line.a2
maf = a1_freq
minor_count = a1_count
if a1_freq >= 0.5:
minor_allele = "d3"
minor = line.a2
major = line.a1
maf = 1 - a1_freq
minor_count = total_count - a1_count
data["dosage"] = (data[minor_allele] + (data.d2 / 2)) * 2

# Setting values to NaN when max prob is lower than threshold
data.loc[data.dmax < prob_threshold, :] = np.nan

return (data.dosage.values, {
return (dosage, {
"major": major,
"minor": minor,
"maf": maf,
"minor_allele_count": minor_count,
"minor_allele_count": mac,
"name": line.name,
"chrom": line.chrom,
"pos": line.pos,
Expand Down
41 changes: 26 additions & 15 deletions gepyto/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
from ..formats import impute2


def compare_vectors(v1, v2):
try:
np.testing.assert_array_almost_equal(v1, v2)
except AssertionError:
return False
return True


def compare_lines(l1, l2):
"""Compare line tuples. """
# Compare the first 4 elements (snpid, chrom, pos, a1, a2).
Expand All @@ -37,16 +45,16 @@ def compare_dosages(self, d1, d2):
v2, info2 = d2

for k in info1:
if k != "maf":
self.assertEqual(info1[k], info2[k])
else:
if k == "maf":
self.assertAlmostEqual(info1[k], info2[k])
elif k == "minor_allele_count":
self.assertEqual(int(info1[k]), int(info2[k]))
else:
self.assertEqual(info1[k], info2[k])

if v1.dtype is np.dtype(float):
# Checking the dosage values (nan != nan in numpy)
v1_nan = np.isnan(v1)
v2_nan = np.isnan(v2)
return ((v1 == v2) | (v1_nan & v2_nan)).all()
return compare_vectors(v1, v2)

if v1.dtype.char == "U" or v1.dtype.char == "S":
return (v1 == v2).all()
Expand Down Expand Up @@ -115,30 +123,33 @@ def setUp(self):
"T",
np.array([[1, 0, 0], [0.1, 0.3, 0.6], [0.1, 0.35, 0.55],
[0, 1, 0]]),
# AA, TT, TT, AT
# 0, 1.5, 1.45, 1
)

self.dosage_snp1 = (
np.array([0., 0.002, 1.003]),
{"minor": "G", "major": "A", "maf": 1 / 6.0, "name": "rs12345",
{"minor": "G", "major": "A", "maf": 1.005 / 6.0, "name": "rs12345",
"chrom": "1", "pos": 1231415, "minor_allele_count": 1}
)

self.dosage_snp2 = (
np.array([0.130, 0.099, 2]),
{"minor": "C", "major": "T", "maf": 2 / 6.0, "name": "rs23456",
{"minor": "C", "major": "T", "maf": 2.229 / 6.0, "name": "rs23456",
"chrom": "1", "pos": 3214569, "minor_allele_count": 2}
)

self.dosage_indel = (
np.array([0.130, 1, 2]),
{"minor": "TC", "major": "T", "maf": 0.5, "name": "rs23457",
np.array([1.87, 1, 0]),
{"minor": "T", "major": "TC", "maf": 2.87 / 6.0, "name": "rs23457",
"chrom": "1", "pos": 3214570, "minor_allele_count": 3}
)

self.dosage_snp3_thresh_0 = (
np.array([2, 0.5, 0.55, 1]),
{"minor": "A", "major": "T", "maf": 3 / 8.0, "name": "rs1234567",
"chrom": "1", "pos": 1234567, "minor_allele_count": 3},
np.array([0, 1.5, 1.45, 1]),
{"minor": "T", "major": "A", "maf": 3.95 / 8.0,
"name": "rs1234567", "chrom": "1", "pos": 1234567,
"minor_allele_count": 3},
)

self.dosage_snp3_thresh_9 = (
Expand Down Expand Up @@ -258,7 +269,7 @@ def test_dosage_maf(self):

# Checking for probability threshold of 0.9
with impute2.Impute2File(self.f2.name, "dosage",
prob_threshold=0.9) as f:
prob_threshold=0.9) as f:
self.assertTrue(compare_dosages(
self,
f.readline(),
Expand Down Expand Up @@ -294,7 +305,7 @@ def test_hard_call(self):
raise Exception()

with impute2.Impute2File(self.f.name, "hard_call",
prob_threshold=0.9) as f:
prob_threshold=0.9) as f:
for i, line in enumerate(f):
if i == 0:
self.assertTrue(compare_dosages(
Expand Down

0 comments on commit cb59666

Please sign in to comment.