In sample_background - added the option to return a pd.DataFrame.

kmayerb · Jun 29, 2021 · a020d77 · a020d77
1 parent 469b5f1
commit a020d77
Showing 1 changed file with 32 additions and 18 deletions.
diff --git a/tcrsampler/sampler.py b/tcrsampler/sampler.py
@@ -364,7 +364,8 @@ def build_background( self,
 
     self.ref_dict = d
 
-  def sample_background(self,v,j,n=1, d= None, depth = 1, seed =1, use_frequency= True ):
+  def sample_background(self, v, j, n=1, d=None, depth=1, seed=1, use_frequency=True,
+                        return_df=False, return_df_cols=['v_reps', 'j_reps', 'cdr3', 'subject']):
     """
     Parameters
     ----------
@@ -374,57 +375,70 @@ def sample_background(self,v,j,n=1, d= None, depth = 1, seed =1, use_frequency=
       j-gene name e.g., 'TRBJ1-1*01'
     n : int
       number of cdr3 samples to draw for given v,j
-    d : dict 
+    d : dict
       Dictionary for sampling, generated in by .build_background
-    depth : 
+    depth :
 
     seed : int
       random number generating seed
     use_frequency : bool
-      If True, uses frequency for sampling proportionaly. If False, uses raw counts. 
+      If True, uses frequency for sampling proportionally. If False, uses raw counts.
+    return_df : bool
+      If True, returns a pandas DataFrame (containing the columns listed in return_df_cols).
+      If False, returns a list of cdr3 samples.
+    return_df_cols : list
+      List of columns to be included in the returned dataframe,
+      from the background dataframe (only if return_df=True).
+      Options are: ['v_reps', 'j_reps', 'cdr3', 'subject', 'count', 'freq']
 
     Returns
     -------
-    r: list
+    r: list or pd.DataFrame
 
-    Example 
+    Example
     -------
     >>> sample_background(v ='TRBV10-1*01', j ='TRBJ1-1*01',n=1, d= None, depth = 1, seed =1, use_frequency= True )
     """
     if d is None:
       d = self.ref_dict
 
-
     if use_frequency:
       col = 'freq'
     else:
       col = 'count'
-    
+
     assert isinstance(v, str)
     assert isinstance(j, str)
     assert isinstance(d, dict)
     assert isinstance(depth, int)
     assert isinstance(seed, int)
 
     try:
-      subdf = d[(v,j)]
-  
+      subdf = d[(v, j)]
+
       selection_probability = \
-        subdf[ col ] / np.sum(subdf[ col ])
+        subdf[col] / np.sum(subdf[col])
+
+      np.random.seed(seed)
 
-      np.random.seed(seed) 
-
       probabalistic_selection_index = \
-        np.random.choice( range(subdf.shape[0]),
-        size = n * depth,
-        p=selection_probability)
+        np.random.choice(range(subdf.shape[0]),
+                         size=n * depth,
+                         p=selection_probability)
+
+      if return_df:
+        r = subdf.iloc[probabalistic_selection_index,][return_df_cols]
+      else:
+        r = subdf.iloc[probabalistic_selection_index,]['cdr3'].to_list()
 
-      r = subdf.iloc[probabalistic_selection_index,]['cdr3'].to_list()
       return r
 
     except KeyError:
       warnings.warn(f"({v},{j} gene usage not available")
-      r = [None]
+      if return_df:
+        r = pd.DataFrame(columns=return_df_cols)
+      else:
+        r = [None]
       return r