## Dependencies

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.interpolate as interp
import statsmodels.stats.multitest as smm
from statsmodels.nonparametric.smoothers_lowess import lowess
import random

## Functions

### LOESS-related functions

In [None]:
# function to calculate per-residue score using loess
def loess_v3(x_obs, y_obs, span, x_out=None, interp_how='quadratic', it=0, loess_kws=None, interp_kws=None):
    """Smooth observed scores with LOWESS and evaluate the fit on an output grid.

    Parameters
    ----------
    x_obs : array-like
        Observed x positions. Converted to a float array, so lists, NumPy
        arrays and pandas Series (with any index) are all accepted.
    y_obs : array-like
        Observed values, one per entry of ``x_obs``.
    span : float
        Forwarded to ``lowess`` as ``frac``.
    x_out : array-like, optional
        Positions at which the smoothed curve is reported. Defaults to ``x_obs``.
    interp_how : str
        ``'statsmodels'`` lets ``lowess`` evaluate the fit directly at ``x_out``.
        Any other value is used as the ``kind`` of ``scipy.interpolate.interp1d``:
        the fit is computed at the observed positions, averaged per unique
        position, and then interpolated onto ``x_out``.
    it : int
        Forwarded to ``lowess`` as ``it``.
    loess_kws : dict, optional
        Extra keyword arguments for ``lowess``. Only used when
        ``interp_how == 'statsmodels'``; the interpolation path always calls
        ``lowess`` with ``missing='drop'``. Defaults to
        ``dict(missing='raise', return_sorted=False)``.
    interp_kws : dict, optional
        Extra keyword arguments for ``interp1d``. Defaults to
        ``dict(fill_value='extrapolate')``.

    Returns
    -------
    pandas.DataFrame
        Columns ``xvals`` (``x_out`` sorted ascending), ``y_loess`` (smoothed
        value at each position) and ``type`` (``'loess'`` when the position is
        one of ``x_obs``, ``'interp'`` otherwise).
    """
    # Build the default dicts per call instead of sharing one mutable default.
    if loess_kws is None:
        loess_kws = dict(missing='raise', return_sorted=False)
    if interp_kws is None:
        interp_kws = dict(fill_value='extrapolate')

    # np.asarray accepts lists as well as arrays/Series and discards any pandas index.
    x_obs = np.asarray(x_obs, dtype=float)
    x_out = x_obs if x_out is None else np.asarray(x_out, dtype=float)

    df_loess = pd.DataFrame({'xvals': x_out}).sort_values(by='xvals').reset_index(drop=True)

    if interp_how == 'statsmodels':
        df_loess['y_loess'] = lowess(endog=y_obs, exog=x_obs, xvals=df_loess['xvals'].to_numpy(),
                                     frac=span, it=it, **loess_kws)
    else:
        # Fit at the observed positions only; loess_kws is intentionally not applied here.
        fitted = lowess(endog=y_obs, exog=x_obs, xvals=x_obs, frac=span, it=it, missing='drop')
        # interp1d needs one y per x, so collapse repeated positions to their mean fit.
        df_interp = (pd.DataFrame({'x_obs': x_obs, 'y_obs_loess': fitted})
                     .groupby('x_obs', as_index=False)
                     .agg({'y_obs_loess': 'mean'}))
        fx_interp = interp.interp1d(x=df_interp['x_obs'], y=df_interp['y_obs_loess'],
                                    kind=interp_how, **interp_kws)
        df_loess['y_loess'] = fx_interp(df_loess['xvals'])

    df_loess['type'] = np.where(df_loess['xvals'].isin(x_obs), 'loess', 'interp')
    return df_loess

# function to randomize sgRNA scores and calculate loess to generate null distribution
def randomize(input_df, loess_col, n, aa_len, span=None, seed=None):
    """Build a null distribution of smoothed scores by shuffling scores across sgRNAs.

    For each of ``n`` iterations the values in ``input_df[loess_col]`` are
    shuffled, re-paired with ``input_df['Residue']``, smoothed with
    ``loess_v3`` (``interp_how='quadratic'``) and evaluated at positions
    ``1..aa_len``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``'Residue'`` column and the column named by ``loess_col``.
    loess_col : str
        Name of the score column to shuffle.
    n : int
        Number of randomizations.
    aa_len : int
        Number of positions in the output grid (1 through ``aa_len`` inclusive).
    span : float, optional
        Span passed to ``loess_v3``. Defaults to ``20 / aa_len``, the value this
        function always used before the parameter existed. Pass the span used
        for the observed fit to keep the null comparable to it.
    seed : optional
        When given, shuffling uses a private ``random.Random(seed)`` so the
        result is reproducible. When ``None`` the module-level ``random``
        generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per grid position and one column (labelled ``0..n-1``) per
        randomization, holding the smoothed shuffled scores.
    """
    if span is None:
        span = 20 / aa_len
    rng = random if seed is None else random.Random(seed)

    # Loop-invariant inputs are prepared once rather than on every iteration.
    scores = input_df[loess_col].tolist()
    positions = np.arange(1, aa_len + 1, 1)

    null_fits = []
    for _ in range(n):
        shuffled = rng.sample(scores, len(scores))  # a full-length sample is a shuffle
        fit = loess_v3(input_df['Residue'], shuffled,
                       x_out=positions, span=span, interp_how='quadratic')
        null_fits.append(fit[['y_loess']])
    return pd.concat(null_fits, ignore_index=True, axis=1)

# function to calculate adjusted p-value of per-residue scores
def calculate_sig(loess_actual, randomized_loess, n, alpha=0.05, method='fdr_bh'):
    """Compute one-tailed empirical p-values per position and correct them for multiple testing.

    Parameters
    ----------
    loess_actual : pandas.DataFrame
        Observed fit; must contain ``'xvals'`` and ``'y_loess'`` columns.
    randomized_loess : pandas.DataFrame
        Null fits, one column per randomization. Rows are matched to
        ``loess_actual`` by index label.
    n : int
        Number of randomizations used as the p-value denominator. Must be
        positive.
    alpha : float
        Family-wise/FDR level passed to ``multipletests`` (default 0.05).
    method : str
        Correction method passed to ``multipletests`` (default ``'fdr_bh'``,
        Benjamini-Hochberg).

    Returns
    -------
    pandas.DataFrame
        Columns ``xvals``, ``y_loess``, ``obs_gt`` (count of null values
        strictly greater than the observed value), ``1t_pval``
        (``obs_gt / n``; this is exactly 0 when no null value exceeds the
        observed one), ``sig`` (reject flag) and ``corr_pval`` (corrected
        p-value).

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of randomizations")

    pval_df = loess_actual[['xvals', 'y_loess']].copy()
    # Row-wise comparison: each null column is compared with the observed value at that position.
    exceeds_observed = randomized_loess.gt(pval_df['y_loess'], axis=0)
    pval_df['obs_gt'] = exceeds_observed.sum(axis=1)
    pval_df['1t_pval'] = pval_df['obs_gt'] / n

    corrected = smm.multipletests(pval_df['1t_pval'], alpha=alpha, method=method,
                                  is_sorted=False, returnsorted=False)
    pval_df['sig'] = corrected[0]
    pval_df['corr_pval'] = corrected[1]
    return pval_df

### miscellaneous functions

In [None]:
# function to combine ABE and CBE sgRNA scores
def _missense_zscores(library_df, score_col):
    """Z-score one library's scores over all its rows and keep only the missense rows."""
    # The z-score uses every row of the library (not just missense), as before.
    zscores = np.asarray(stats.zscore(np.asarray(library_df[score_col], dtype=float)))
    # Select positionally so the result does not depend on whether stats.zscore
    # hands back an index-aligned Series or a plain array, nor on index uniqueness.
    is_missense = (library_df['Mutation_type'] == 'Missense').to_numpy()
    missense = library_df.loc[is_missense, ['Residue']].copy()
    missense['zscore_lfc'] = zscores[is_missense]
    return missense


def combine_and_zscore(ABE, CBE, score_col='lfc_over_unsorted_adjusted_param1'):
    """Z-score each library separately and stack the missense sgRNAs of both.

    Parameters
    ----------
    ABE, CBE : pandas.DataFrame
        Per-sgRNA tables. Each must contain ``'Residue'``, ``'Mutation_type'``
        and the column named by ``score_col``.
    score_col : str
        Column holding the score to standardize
        (default ``'lfc_over_unsorted_adjusted_param1'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Residue`` and ``zscore_lfc`` for rows whose ``Mutation_type``
        is ``'Missense'``; ABE rows first, then CBE rows. Each library is
        standardized over all of its rows before filtering. Original row labels
        are kept, so the index may contain duplicates.
    """
    return pd.concat([_missense_zscores(ABE, score_col),
                      _missense_zscores(CBE, score_col)])

## Example: running the linear clustering analysis

In [None]:
# Fill in both placeholders before running this cell. Each DataFrame needs the
# columns read by combine_and_zscore: 'Residue', 'Mutation_type' and
# 'lfc_over_unsorted_adjusted_param1'.
ABE_df = None  # TODO: df containing sgRNA scores from ABE library
CBE_df = None  # TODO: df containing sgRNA scores from CBE library
if ABE_df is None or CBE_df is None:
    raise ValueError("Assign the ABE and CBE sgRNA score DataFrames to ABE_df and CBE_df before running this cell")
combined_zscore = combine_and_zscore(ABE_df, CBE_df)

In [None]:
# Fill in the three placeholders before running this cell.
aa_len = None  # TODO: number of residues protein contains
n_iterations = None  # TODO: number of randomization iterations
span = None  # TODO: span of sliding window (passed to lowess as frac)
if aa_len is None or n_iterations is None or span is None:
    raise ValueError("Set aa_len, n_iterations and span before running this cell")

# The null distribution is built by random shuffling; seed it so a re-run gives
# the same p-values. The seed value itself is arbitrary.
random.seed(0)

# Observed per-residue scores, evaluated at every residue 1..aa_len.
loess = loess_v3(x_obs=combined_zscore['Residue'].astype(float),y_obs=combined_zscore['zscore_lfc'].astype(float),x_out=np.arange(1,aa_len+1,1),span=span,interp_how='quadratic')
# NOTE: randomize() fits the shuffled scores with span=20/aa_len by default;
# use span = 20/aa_len above if the null should match the observed fit.
bg_loess = randomize(combined_zscore,'zscore_lfc',n_iterations,aa_len)
pval = calculate_sig(loess,bg_loess,n_iterations)