Merge pull request #60 from klarman-cell-observatory/boli

Updated VDJdata
lilab-bcb · Mar 16, 2021 · cb09a9c · cb09a9c
2 parents 7200eca + ef6e0d7
commit cb09a9c
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 4 deletions.
diff --git a/ext_modules/fast_funcs.pyx b/ext_modules/fast_funcs.pyx
@@ -47,7 +47,7 @@ cpdef tuple convert_10x_vdj_to_vdjdata(str[:] tokens, int[:, :] mats_int, str[:,
 
 
     for i in range(nstr - 1):
-        tracing.append({"None": 0})
+        tracing.append({"None": 0, "": 0})
 
     for i in range(tokens.size):
         if bpos < 0 or bview[bpos] != tokens[i]:
@@ -69,12 +69,13 @@ cpdef tuple convert_10x_vdj_to_vdjdata(str[:] tokens, int[:, :] mats_int, str[:,
             strval = mats_str[i, j]
             value = strmap.get(strval, -1)
             if value < 0:
-                value = len(strmap)
+                value = len(strmap) - 1
                 strmap[strval] = value
             matview[j + delta, bpos, fpos] = value
         fmap[chain] = num + 1
 
     for i in range(nstr - 1):
+        del tracing[i]["None"]
         strmap = tracing[i]
         strarr = np.empty(len(strmap), dtype = np.object)
         strview = strarr

diff --git a/pegasusio/multimodal_data.py b/pegasusio/multimodal_data.py
@@ -195,6 +195,11 @@ def get_chain(self, chain: str) -> pd.DataFrame:
         assert self._unidata is not None and isinstance(self._unidata, VDJData)
         return self._unidata.get_chain(chain)
 
+    def construct_clonotype(self, min_umis: int = 2) -> None:
+        """ Surrogate function for VDJData: forwards min_umis to the current VDJData's construct_clonotype. """
+        assert self._unidata is not None and isinstance(self._unidata, VDJData)
+        self._unidata.construct_clonotype(min_umis=min_umis)
+
     def set_aside(self, params: List[str] = None) -> None:
         """ Surrogate function for CITESeqData and CytoData """
         assert self._unidata is not None and (isinstance(self._unidata, CITESeqData) or isinstance(self._unidata, CytoData))

diff --git a/pegasusio/vdj_data.py b/pegasusio/vdj_data.py
@@ -95,6 +95,37 @@ def get_chain(self, chain: str) -> pd.DataFrame:
         return df
 
 
+    def construct_clonotype(self, min_umis: int = 2) -> None:  # fills obs["has_clonotype"] and obs["clonotype"] from per-chain "umis"/"cdr3" columns
+        if self.get_modality() == "tcr":
+            df1 = self.get_chain("TRA")
+            df2 = self.get_chain("TRB")
+            idx = (df1["umis"] >= min_umis) & (df2["umis"] >= min_umis)  # both TRA and TRB must reach min_umis
+            self.obs["has_clonotype"] = idx
+            self.obs["clonotype"] = ""  # default: empty string for barcodes without a clonotype
+            self.obs.loc[idx, "clonotype"] = df1.loc[idx, "cdr3"] + "|" + df2.loc[idx, "cdr3"]  # "<TRA cdr3>|<TRB cdr3>"
+
+            df3 = self.get_chain("TRD")
+            df4 = self.get_chain("TRG")
+            idx2 = (df3["umis"] >= min_umis) & (df4["umis"] >= min_umis) & (~idx)  # TRD/TRG pair, only for barcodes not already covered by TRA/TRB
+            if idx2.sum() > 0:  # skip the assignments when no barcode qualifies
+                self.obs.loc[idx2, "has_clonotype"] = True
+                self.obs.loc[idx2, "clonotype"] = df3.loc[idx2, "cdr3"] + "|" + df4.loc[idx2, "cdr3"]  # "<TRD cdr3>|<TRG cdr3>"
+        else:
+            assert self.get_modality() == "bcr"  # only "tcr" and "bcr" modalities are handled here
+            df1 = self.get_chain("IGH")
+            df2 = self.get_chain("IGK")
+            df3 = self.get_chain("IGL")
+            idx1 = df1["umis"] >= min_umis
+            idx2 = (df2["umis"] >= df3["umis"]) & (df2["umis"] >= min_umis)  # IGK is chosen when its umis are >= IGL's (ties go to IGK)
+            idx3 = (df2["umis"] < df3["umis"]) & (df3["umis"] >= min_umis)  # IGL is chosen only when it has strictly more umis than IGK
+            idxkh = idx1 & idx2
+            idxlh = idx1 & idx3
+            self.obs["has_clonotype"] = idxkh | idxlh
+            self.obs["clonotype"] = ""  # default: empty string for barcodes without a clonotype
+            self.obs.loc[idxkh, "clonotype"] = df2.loc[idxkh, "cdr3"] + "|" + df1.loc[idxkh, "cdr3"]  # "<IGK cdr3>|<IGH cdr3>"
+            self.obs.loc[idxlh, "clonotype"] = df3.loc[idxlh, "cdr3"] + "|" + df1.loc[idxlh, "cdr3"]  # "<IGL cdr3>|<IGH cdr3>"
+
+
     def from_anndata(self, data: anndata.AnnData, genome: str = None, modality: str = None) -> None:
         raise ValueError("Cannot convert an AnnData object to a VDJData object!")
 

diff --git a/pegasusio/vdj_utils.py b/pegasusio/vdj_utils.py
@@ -33,10 +33,10 @@ def load_10x_vdj_file(input_csv: str, genome: str = None, modality: str = None)
     except ModuleNotFoundError:
         print("No module named 'pegasusio.cylib.funcs'")
 
-    df = pd.read_csv(input_csv)
+    df = pd.read_csv(input_csv, na_filter = False) # Otherwise, '' will be converted to NaN
     idx = df["productive"] == (True if df["productive"].dtype.kind == "b" else "True")
     df = df[idx]
-    df.sort_values(by = "barcode", inplace = True, kind = "mergesort") # sort barcode and make sure it is stable
+    df.sort_values(by = ["barcode", "umis"], ascending = [True, False], inplace = True, kind = "mergesort") # sort barcode and make sure it is stable
 
     feature_name = [x for x in df["chain"].value_counts().index if x != "Multi"][0]
     modal = None