Skip to content

Commit

Permalink
In sample_background - added the option to return a pd.DataFrame.
Browse files Browse the repository at this point in the history
  • Loading branch information
liel-cohen committed Jun 29, 2021
1 parent 469b5f1 commit a020d77
Showing 1 changed file with 32 additions and 18 deletions.
50 changes: 32 additions & 18 deletions tcrsampler/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,8 @@ def build_background( self,

self.ref_dict = d

def sample_background(self, v, j, n=1, d=None, depth=1, seed=1, use_frequency=True,
                      return_df=False, return_df_cols=None):
    """
    Draw CDR3 samples from the background rows stored for one (v, j) gene pair.

    Parameters
    ----------
    v : str
        v-gene name e.g., 'TRBV10-1*01'
    j : str
        j-gene name e.g., 'TRBJ1-1*01'
    n : int
        number of cdr3 samples to draw for given v,j
    d : dict
        Dictionary for sampling, keyed by (v, j) tuples, as generated by
        .build_background. If None, self.ref_dict is used.
    depth : int
        Multiplier applied to n; a total of n * depth rows are drawn.
    seed : int
        random number generating seed
    use_frequency : bool
        If True, sampling weights come from the 'freq' column. If False,
        they come from the 'count' column. Either way the weights are
        normalised to sum to one before sampling.
    return_df : bool
        If True, returns a pandas DataFrame (containing the columns indicated
        in return_df_cols). If False, returns a list of cdr3 samples.
    return_df_cols : list, optional
        Columns of the background DataFrame to include in the returned
        DataFrame (only used if return_df=True). Defaults to
        ['v_reps', 'j_reps', 'cdr3', 'subject'] when None.
        Options are: ['v_reps', 'j_reps', 'cdr3', 'subject', 'count', 'freq']

    Returns
    -------
    r : list or pd.DataFrame
        The sampled cdr3 values (list) or the sampled rows (DataFrame).
        If the (v, j) pair is not present in d, a warning is issued and
        [None] (or an empty DataFrame with return_df_cols as columns) is
        returned instead.

    Raises
    ------
    KeyError
        If a required column ('freq'/'count', 'cdr3', or an entry of
        return_df_cols) is missing from the background DataFrame.

    Notes
    -----
    Calls np.random.seed(seed), i.e. NumPy's global random state is reseeded
    on every call.

    Example
    -------
    sample_background(v='TRBV10-1*01', j='TRBJ1-1*01', n=1, d=None,
                      depth=1, seed=1, use_frequency=True)
    """
    if d is None:
        d = self.ref_dict

    # A fresh list per call: a list literal as a default argument would be
    # shared (and mutable) across every call of the method.
    if return_df_cols is None:
        return_df_cols = ['v_reps', 'j_reps', 'cdr3', 'subject']

    col = 'freq' if use_frequency else 'count'

    assert isinstance(v, str)
    assert isinstance(j, str)
    assert isinstance(d, dict)
    assert isinstance(depth, int)
    assert isinstance(seed, int)

    # Only the dictionary lookup is guarded, so that a missing column further
    # down is reported as such instead of as an unavailable gene pair.
    try:
        subdf = d[(v, j)]
    except KeyError:
        warnings.warn(f"({v},{j}) gene usage not available")
        if return_df:
            return pd.DataFrame(columns=return_df_cols)
        return [None]

    selection_probability = subdf[col] / np.sum(subdf[col])

    np.random.seed(seed)
    probabalistic_selection_index = np.random.choice(
        subdf.shape[0],
        size=n * depth,
        p=selection_probability,
    )

    sampled = subdf.iloc[probabalistic_selection_index]
    if return_df:
        return sampled[return_df_cols]
    return sampled['cdr3'].to_list()


Expand Down

0 comments on commit a020d77

Please sign in to comment.