Skip to content

Commit

Permalink
Merge pull request #60 from klarman-cell-observatory/boli
Browse files Browse the repository at this point in the history
Updated VDJdata
  • Loading branch information
bli25 committed Mar 16, 2021
2 parents 7200eca + ef6e0d7 commit cb09a9c
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 4 deletions.
5 changes: 3 additions & 2 deletions ext_modules/fast_funcs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ cpdef tuple convert_10x_vdj_to_vdjdata(str[:] tokens, int[:, :] mats_int, str[:,


for i in range(nstr - 1):
tracing.append({"None": 0})
tracing.append({"None": 0, "": 0})

for i in range(tokens.size):
if bpos < 0 or bview[bpos] != tokens[i]:
Expand All @@ -69,12 +69,13 @@ cpdef tuple convert_10x_vdj_to_vdjdata(str[:] tokens, int[:, :] mats_int, str[:,
strval = mats_str[i, j]
value = strmap.get(strval, -1)
if value < 0:
value = len(strmap)
value = len(strmap) - 1
strmap[strval] = value
matview[j + delta, bpos, fpos] = value
fmap[chain] = num + 1

for i in range(nstr - 1):
del tracing[i]["None"]
strmap = tracing[i]
strarr = np.empty(len(strmap), dtype = np.object)
strview = strarr
Expand Down
5 changes: 5 additions & 0 deletions pegasusio/multimodal_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,11 @@ def get_chain(self, chain: str) -> pd.DataFrame:
assert self._unidata is not None and isinstance(self._unidata, VDJData)
return self._unidata.get_chain(chain)

def construct_clonotype(self, min_umis: int = 2) -> None:
    """ Surrogate function for VDJData

    Forwards the call to ``construct_clonotype`` of the currently selected
    unimodal data object.

    Parameters
    ----------
    min_umis: ``int``, default ``2``
        Passed through unchanged as the ``min_umis`` keyword argument.
    """
    unidata = self._unidata
    # Only a selected VDJData object supports clonotype construction.
    assert unidata is not None and isinstance(unidata, VDJData)
    unidata.construct_clonotype(min_umis=min_umis)

def set_aside(self, params: List[str] = None) -> None:
""" Surrogate function for CITESeqData and CytoData """
assert self._unidata is not None and (isinstance(self._unidata, CITESeqData) or isinstance(self._unidata, CytoData))
Expand Down
31 changes: 31 additions & 0 deletions pegasusio/vdj_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,37 @@ def get_chain(self, chain: str) -> pd.DataFrame:
return df


def construct_clonotype(self, min_umis: int = 2) -> None:
    """ Annotate ``self.obs`` with a per-row clonotype built from paired chains.

    Two columns are written to ``self.obs``:

    * ``has_clonotype``: boolean, True where a clonotype could be assigned.
    * ``clonotype``: two ``cdr3`` values joined by ``"|"``, or ``""`` where
      no clonotype was assigned.

    For the ``"tcr"`` modality the clonotype is ``TRA|TRB`` when both chains
    have at least ``min_umis`` UMIs; rows not covered by that rule fall back
    to ``TRD|TRG`` under the same threshold.

    For the ``"bcr"`` modality the clonotype is ``<light>|IGH``, where the
    light chain is IGK when its UMI count is >= that of IGL, and IGL
    otherwise; both the chosen light chain and IGH need at least
    ``min_umis`` UMIs.

    Parameters
    ----------
    min_umis: ``int``, default ``2``
        Minimum ``umis`` value a chain must reach to take part in a clonotype.

    Raises
    ------
    AssertionError
        If the modality is neither ``"tcr"`` nor ``"bcr"``.
    """
    if self.get_modality() == "tcr":
        alpha = self.get_chain("TRA")
        beta = self.get_chain("TRB")
        ab_mask = (alpha["umis"] >= min_umis) & (beta["umis"] >= min_umis)
        self.obs["has_clonotype"] = ab_mask
        self.obs["clonotype"] = ""
        ab_label = alpha.loc[ab_mask, "cdr3"] + "|" + beta.loc[ab_mask, "cdr3"]
        self.obs.loc[ab_mask, "clonotype"] = ab_label

        delta = self.get_chain("TRD")
        gamma = self.get_chain("TRG")
        # Gamma-delta pairs are only used for rows without an alpha-beta clonotype.
        gd_mask = (delta["umis"] >= min_umis) & (gamma["umis"] >= min_umis) & (~ab_mask)
        if gd_mask.any():
            self.obs.loc[gd_mask, "has_clonotype"] = True
            gd_label = delta.loc[gd_mask, "cdr3"] + "|" + gamma.loc[gd_mask, "cdr3"]
            self.obs.loc[gd_mask, "clonotype"] = gd_label
        return

    assert self.get_modality() == "bcr"
    heavy = self.get_chain("IGH")
    kappa = self.get_chain("IGK")
    lambda_ = self.get_chain("IGL")

    heavy_ok = heavy["umis"] >= min_umis
    # Ties between the two light chains go to kappa (>= versus <).
    kappa_ok = (kappa["umis"] >= lambda_["umis"]) & (kappa["umis"] >= min_umis)
    lambda_ok = (kappa["umis"] < lambda_["umis"]) & (lambda_["umis"] >= min_umis)

    kappa_heavy = heavy_ok & kappa_ok
    lambda_heavy = heavy_ok & lambda_ok

    self.obs["has_clonotype"] = kappa_heavy | lambda_heavy
    self.obs["clonotype"] = ""
    kh_label = kappa.loc[kappa_heavy, "cdr3"] + "|" + heavy.loc[kappa_heavy, "cdr3"]
    self.obs.loc[kappa_heavy, "clonotype"] = kh_label
    lh_label = lambda_.loc[lambda_heavy, "cdr3"] + "|" + heavy.loc[lambda_heavy, "cdr3"]
    self.obs.loc[lambda_heavy, "clonotype"] = lh_label


def from_anndata(self, data: anndata.AnnData, genome: str = None, modality: str = None) -> None:
raise ValueError("Cannot convert an AnnData object to a VDJData object!")

Expand Down
4 changes: 2 additions & 2 deletions pegasusio/vdj_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ def load_10x_vdj_file(input_csv: str, genome: str = None, modality: str = None)
except ModuleNotFoundError:
print("No module named 'pegasusio.cylib.funcs'")

df = pd.read_csv(input_csv)
df = pd.read_csv(input_csv, na_filter = False) # Otherwise, '' will be converted to NaN
idx = df["productive"] == (True if df["productive"].dtype.kind == "b" else "True")
df = df[idx]
df.sort_values(by = "barcode", inplace = True, kind = "mergesort") # sort barcode and make sure it is stable
df.sort_values(by = ["barcode", "umis"], ascending = [True, False], inplace = True, kind = "mergesort") # sort barcode and make sure it is stable

feature_name = [x for x in df["chain"].value_counts().index if x != "Multi"][0]
modal = None
Expand Down

0 comments on commit cb09a9c

Please sign in to comment.