Optimized the impute2 parser.

legaultmarc · Apr 23, 2015 · cb59666 · cb59666
1 parent 17ece2c
commit cb59666
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 51 deletions.
diff --git a/gepyto/formats/impute2.py b/gepyto/formats/impute2.py
@@ -209,54 +209,40 @@ def _compute_dosage(line, prob_threshold=0, is_chr23=False,
                     sex_vector=None):
     """Computes dosage from probabilities (IMPUTE2)."""
     # Type check
-    type(line) is line
+    assert type(line) is _Line
 
-    # Constructing the Pandas data frame
-    data = pd.DataFrame({
-        "d1": line.probabilities[:, 0],
-        "d2": line.probabilities[:, 1],
-        "d3": line.probabilities[:, 2],
-    })
+    # We set the dosage as being the expected number of "a2" alleles for
+    # all samples.
+    # Given that the rows represent p_aa, p_ab, p_bb, we have that
+    # E(#b) = 2 * p_bb + p_ab, the dosage.
+    dosage = 2 * line.probabilities[:, 2] + line.probabilities[:, 1]
+
+    # Probability filtering. We mask samples with low probabilities.
+    if prob_threshold > 0:
+        dosage[~np.any(line.probabilities > prob_threshold, axis=1)] = np.nan
+
+    # Compute the maf.
+    mac = np.nansum(dosage)
+    maf = mac / (2 * np.sum(~np.isnan(dosage)))
 
-    # Getting the maximal probabilities for each sample
-    data["dmax"] = data.max(axis=1)
+    # If maf > 0.5, we need to flip.
+    major, minor = (line.a1, line.a2)
+    if maf > 0.5:
+        maf = 1 - maf
+        dosage = 2 - dosage  # 0 -> 2, 1 -> 1, 2 -> 0.
+        major, minor = minor, major
 
-    # Computing the A1 frequency
     if is_chr23:
         # TODO: Implement this, please
         raise NotImplementedError("dosage for chromosome 23 is not yet "
                                   "supported (because dosage is computed "
                                   "differently for males on chromosome 23)")
 
-    else:
-        sub_data = data[data.dmax >= prob_threshold][["d1", "d2", "d3"]]
-        count = Counter(sub_data.idxmax(axis=1))
-        total_count = (sum(count.values()) * 2)
-        a1_count = ((count["d1"] * 2) + count["d2"])
-        a1_freq = a1_count / total_count
-
-    # Computing the dosage
-    minor_allele = "d1"
-    minor = line.a1
-    major = line.a2
-    maf = a1_freq
-    minor_count = a1_count
-    if a1_freq >= 0.5:
-        minor_allele = "d3"
-        minor = line.a2
-        major = line.a1
-        maf = 1 - a1_freq
-        minor_count = total_count - a1_count
-    data["dosage"] = (data[minor_allele] + (data.d2 / 2)) * 2
-
-    # Setting values to NaN when max prob is lower than threshold
-    data.loc[data.dmax < prob_threshold, :] = np.nan
-
-    return (data.dosage.values, {
+    return (dosage, {
         "major": major,
         "minor": minor,
         "maf": maf,
-        "minor_allele_count": minor_count,
+        "minor_allele_count": mac,
         "name": line.name,
         "chrom": line.chrom,
         "pos": line.pos,

diff --git a/gepyto/tests/test_formats.py b/gepyto/tests/test_formats.py
@@ -21,6 +21,14 @@
 from ..formats import impute2
 
 
+def compare_vectors(v1, v2):
+    try:
+        np.testing.assert_array_almost_equal(v1, v2)
+    except AssertionError:
+        return False
+    return True
+
+
 def compare_lines(l1, l2):
     """Compare line tuples. """
     # Compare the first 4 elements (snpid, chrom, pos, a1, a2).
@@ -37,16 +45,16 @@ def compare_dosages(self, d1, d2):
     v2, info2 = d2
 
     for k in info1:
-        if k != "maf":
-            self.assertEqual(info1[k], info2[k])
-        else:
+        if k == "maf":
             self.assertAlmostEqual(info1[k], info2[k])
+        elif k == "minor_allele_count":
+            self.assertEqual(int(info1[k]), int(info2[k]))
+        else:
+            self.assertEqual(info1[k], info2[k])
 
     if v1.dtype is np.dtype(float):
         # Checking the dosage values (nan != nan in numpy)
-        v1_nan = np.isnan(v1)
-        v2_nan = np.isnan(v2)
-        return ((v1 == v2) | (v1_nan & v2_nan)).all()
+        return compare_vectors(v1, v2)
 
     if v1.dtype.char == "U" or v1.dtype.char == "S":
         return (v1 == v2).all()
@@ -115,30 +123,33 @@ def setUp(self):
             "T",
             np.array([[1, 0, 0], [0.1, 0.3, 0.6], [0.1, 0.35, 0.55],
                       [0, 1, 0]]),
+            # AA, TT, TT, AT
+            # 0, 1.5, 1.45, 1
         )
 
         self.dosage_snp1 = (
             np.array([0., 0.002, 1.003]),
-            {"minor": "G", "major": "A", "maf": 1 / 6.0, "name": "rs12345",
+            {"minor": "G", "major": "A", "maf": 1.005 / 6.0, "name": "rs12345",
              "chrom": "1", "pos": 1231415, "minor_allele_count": 1}
         )
 
         self.dosage_snp2 = (
             np.array([0.130, 0.099, 2]),
-            {"minor": "C", "major": "T", "maf": 2 / 6.0, "name": "rs23456",
+            {"minor": "C", "major": "T", "maf": 2.229 / 6.0, "name": "rs23456",
              "chrom": "1", "pos": 3214569, "minor_allele_count": 2}
         )
 
         self.dosage_indel = (
-            np.array([0.130, 1, 2]),
-            {"minor": "TC", "major": "T", "maf": 0.5, "name": "rs23457",
+            np.array([1.87, 1, 0]),
+            {"minor": "T", "major": "TC", "maf": 2.87 / 6.0, "name": "rs23457",
              "chrom": "1", "pos": 3214570, "minor_allele_count": 3}
         )
 
         self.dosage_snp3_thresh_0 = (
-            np.array([2, 0.5, 0.55, 1]),
-            {"minor": "A", "major": "T", "maf": 3 / 8.0, "name": "rs1234567",
-             "chrom": "1", "pos": 1234567, "minor_allele_count": 3},
+            np.array([0, 1.5, 1.45, 1]),
+            {"minor": "T", "major": "A", "maf": 3.95 / 8.0,
+             "name": "rs1234567", "chrom": "1", "pos": 1234567,
+             "minor_allele_count": 3},
         )
 
         self.dosage_snp3_thresh_9 = (
@@ -258,7 +269,7 @@ def test_dosage_maf(self):
 
         # Checking for probability threshold of 0.9
         with impute2.Impute2File(self.f2.name, "dosage",
-                                      prob_threshold=0.9) as f:
+                                 prob_threshold=0.9) as f:
             self.assertTrue(compare_dosages(
                 self,
                 f.readline(),
@@ -294,7 +305,7 @@ def test_hard_call(self):
                     raise Exception()
 
         with impute2.Impute2File(self.f.name, "hard_call",
-                                      prob_threshold=0.9) as f:
+                                 prob_threshold=0.9) as f:
             for i, line in enumerate(f):
                 if i == 0:
                     self.assertTrue(compare_dosages(