In [1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import sys
import re
import glob
import suftware as su
from scipy.stats import norm
from scipy.stats import poisson

e = np.exp(1)
pi = np.pi


def pseudo_log(x, base=e):
    """
    Return the logarithm of (x + 0.5) in the requested base.

    The 0.5 offset acts as a pseudocount so that x = 0 yields a finite value.
    `x` may be a scalar or anything NumPy can broadcast; `base` defaults to
    the module-level constant `e`, i.e. the natural logarithm.
    """
    shifted = x + .5
    return np.log(shifted) / np.log(base)

In [2]:
def compute_activities(bc_df, bootstrap=False, seed=None):
    """
    Compute log_psi for every splice site in a barcode-level count table.

    The 'ex_ct' and 'tot_ct' counts are summed over all barcodes that share
    the same value of 'ss', and log_psi is evaluated on those sums as
    pseudo_log(ex_ct) - pseudo_log(tot_ct).

    Parameters
    ----------
    bc_df : pd.DataFrame
        One row per barcode. Must provide 'ss', 'ex_ct' and 'tot_ct'.
        The caller's frame is not modified.
    bootstrap : bool
        If True, each barcode's counts are multiplied by an independent
        draw from a Poisson distribution with mean 1 before summing.
        If False, all barcodes get weight 1.
    seed : int or None
        If not None, passed to np.random.seed() before the weights are
        drawn. Note this reseeds NumPy's global generator. seed=0 is a
        valid seed.

    Returns
    -------
    pd.DataFrame
        Indexed by 'ss', with a single column 'log_psi'.
    """

    # Seed random number generator. Test against None explicitly: a plain
    # truthiness check would silently ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    # Copy bc_df so original isn't changed
    bc_df = bc_df.copy()

    # Compute per-barcode weights for bootstrapping
    num_bcs = len(bc_df)
    if bootstrap:
        weights = poisson.rvs(mu=1.0, size=num_bcs)
    else:
        weights = np.ones(num_bcs)

    # Multiply the count columns that feed log_psi through by the weights
    count_cols = ['ex_ct', 'tot_ct']
    for col in count_cols:
        bc_df[col] = bc_df[col] * weights

    # Marginalize by splice site. Only the count columns are summed, so
    # non-numeric columns (e.g. barcode sequences) are never aggregated.
    ss_df = bc_df.groupby('ss')[count_cols].sum()

    # log of the (pseudocounted) ratio of summed exon-inclusion to total counts
    ss_df['log_psi'] = pseudo_log(ss_df['ex_ct']) - pseudo_log(ss_df['tot_ct'])

    # Remove unnecessary columns
    return ss_df[['log_psi']]

In [3]:
def process_data(in_file,
                 min_ct_per_bc=1,
                 min_num_barcodes=10,
                 num_resamps=100,
                 min_sigma=1E-2,
                 report_every=10,
                 estimate_mi=True):
    """
    Load one tab-delimited results file, filter it, and estimate log_psi
    (with bootstrap uncertainties) for every retained splice site.

    Parameters
    ----------
    in_file : str
        Path to a tab-delimited file whose first column is used as the
        index. Must contain the columns 'ss', 'bc', 'tot_ct' and 'ex_ct'.
        Columns 'mis_ct' and 'lib_ct' are dropped if present.
    min_ct_per_bc : int
        A barcode is kept only if tot_ct + ex_ct >= min_ct_per_bc.
    min_num_barcodes : int
        A splice site is kept only if at least this many (filtered)
        barcodes are linked to it.
    num_resamps : int
        Number of bootstrap resamples; resample n is seeded with seed=n.
    min_sigma : float
        Lower bound applied to 'dlog_psi' when estimate_mi is True.
    report_every : int
        Print a progress dot every `report_every` resamples. A value of 0
        (or None) disables progress dots.
    estimate_mi : bool
        If True, also compute 'I[y;x]' and 'dI[y;x]' (requires suftware).

    Returns
    -------
    ss_df : pd.DataFrame
        Indexed by splice site, with columns 'log_psi' and 'dlog_psi'
        (sample standard deviation, ddof=1, across bootstrap resamples).
    stats_dict : dict
        Keys 'num_ss', 'pct_ss', 'num_bc', 'pct_bc', 'sum_tot_ct',
        'sum_ex_ct', plus 'I[y;x]' and 'dI[y;x]' when estimate_mi is True.
    """

    # Create dict to record statistics
    stats_dict = {}

    # Load file
    in_df = pd.read_csv(in_file, delimiter='\t', index_col=0)
    print(f'Processing {in_file}')

    # Remove extraneous columns; tolerate files that do not have them
    in_df = in_df.drop(columns=['mis_ct', 'lib_ct'], errors='ignore')

    # Only keep barcodes whose combined counts reach the minimum
    keep_bc = (in_df['tot_ct'] + in_df['ex_ct'] >= min_ct_per_bc)
    bc_df = in_df[keep_bc]

    # Compute the number of barcodes per splice site
    ss_per_bc = bc_df[['ss', 'bc']].groupby('ss').count()

    # Only keep splice sites that have at least min_num_barcodes barcodes
    ss_to_keep = ss_per_bc[ss_per_bc['bc'] >= min_num_barcodes].index

    # Remove barcodes linked to unused splice sites
    bc_df = bc_df[bc_df['ss'].isin(ss_to_keep)]

    # Record filtering statistics
    stats_dict['num_ss'] = len(ss_to_keep)
    stats_dict['pct_ss'] = 100*len(ss_to_keep)/len(ss_per_bc)
    stats_dict['num_bc'] = len(bc_df)
    stats_dict['pct_bc'] = 100*len(bc_df)/len(in_df)
    stats_dict['sum_tot_ct'] = bc_df["tot_ct"].sum()
    stats_dict['sum_ex_ct'] = bc_df["ex_ct"].sum()

    # Get best estimate of activities. Copy so that adding columns below
    # never writes into an object owned by compute_activities.
    ss_df = compute_activities(bc_df, bootstrap=False).copy()

    # Get bootstrap resampled estimates
    print('Doing bootstrap reampling:', end='')
    resampled_dfs = []
    for n in range(num_resamps):
        # Guard on report_every so that 0 disables dots instead of raising
        if report_every and n > 0 and n % report_every == 0:
            print('.', end='')
        resampled_dfs.append(compute_activities(bc_df, bootstrap=True, seed=n))
    print('')

    # Compute std across resamples for each column in ss_df.
    # Iterate over a snapshot of the columns since new ones are added.
    for col in list(ss_df.columns):
        vals = np.array([df[col].values for df in resampled_dfs]).T
        ss_df['d' + col] = vals.std(axis=1, ddof=1)

    # Get number of splice sites
    num_ss = len(ss_df)

    if estimate_mi:
        # Floor the uncertainties at min_sigma. The floored values are
        # stored back into ss_df explicitly: earlier code achieved the same
        # result implicitly by writing through the array returned by
        # `.values`, which is not writable under pandas copy-on-write.
        sigma = np.maximum(ss_df['dlog_psi'].to_numpy(), min_sigma)
        ss_df['dlog_psi'] = sigma

        # Conditional entropy: mean over splice sites of the differential
        # entropy (in bits) of a Gaussian with standard deviation sigma
        H_contributions = 0.5*np.log2(2*e*pi*sigma**2)
        H_ygx = np.mean(H_contributions)
        dH_ygx = np.std(H_contributions, ddof=1)/np.sqrt(num_ss)

        # Entropy of the log_psi distribution, from suftware's density estimate.
        # NOTE(review): the sign flip is kept as-is; presumably it matches
        # suftware's sign convention for 'entropy' -- confirm.
        y = ss_df['log_psi'].values
        p_y = su.DensityEstimator(y)
        density_stats = p_y.get_stats()
        H_y = -density_stats.loc['posterior mean', 'entropy']
        dH_y = density_stats.loc['posterior RMSD', 'entropy']

        # Report final mutual information value; uncertainties add in quadrature
        stats_dict['I[y;x]'] = H_y - H_ygx
        stats_dict['dI[y;x]'] = np.sqrt(dH_y**2 + dH_ygx**2)

    return ss_df, stats_dict

This script cleans the `results` files from Wong et al. 2018. 
1. Barcodes with no reads in either tot_ct or ex_ct (i.e., tot_ct + ex_ct = 0) are removed.
2. Splice sites with fewer than 10 associated barcodes are removed.
3. The best estimate of log_psi for each splice site is computed as the log of the ratio ex_ct/tot_ct, where each of these quantities is first summed across all barcodes for that splice site and a pseudocount of 0.5 is added to each sum.
4. The standard error of these estimates is computed by bootstrap resampling all barcodes, then re-computing these ratios for each splice site. Note that this is NOT ideal; ideally we would bootstrap resample for each splice site individually. 
5. Sample statistics, including intrinsic information, are then recorded.

In [None]:
# NOTE: in_dir is a machine-specific absolute path; point it at your local
# copy of the raw results files before running this cell.
in_dir = '/Users/jkinney/Dropbox/15_mpathic/20_mpathic_redo/20.08.16_mpsa_raw_data'
out_dir = '20.08.16_mpsa_data'

# Make sure the output directory exists, then clear any previous outputs
os.makedirs(out_dir, exist_ok=True)
for f in glob.glob(f'{out_dir}/*.*'):
    os.remove(f)

# Get list of input files and sort them
in_files = sorted(glob.glob(f'{in_dir}/results.*.txt'))
print(f'-> {len(in_files)} files to process.')

# Columns (in order) written to stats.csv
stats_cols = ['num_bc', 'pct_bc', 'num_ss', 'pct_ss',
              'sum_tot_ct', 'sum_ex_ct', 'I[y;x]', 'dI[y;x]']

# File names look like results.<locus>_9nt_<librep>.txt
name_pattern = re.compile(r"results\.(?P<locus>[^_]+)_9nt_(?P<librep>.+)\.txt")

# Collect one stats record per sample; the DataFrame is built once at the
# end rather than grown row by row (DataFrame.append no longer exists in
# pandas >= 2.0).
stats_records = []

# For each input file
for in_file in in_files:

    # Determine name of sample
    m = name_pattern.fullmatch(os.path.basename(in_file))
    if m is None:
        print(f'Skipping {in_file}: file name does not match the expected pattern.\n')
        continue
    name = f"{m.group('locus')}_{m.group('librep')}"

    # Process sample
    ss_df, stats_dict = process_data(in_file, num_resamps=100)
    stats_dict['name'] = name
    stats_records.append(stats_dict)

    # Save cleaned data from sample
    out_file = f'{out_dir}/{name}.csv'
    ss_df.to_csv(out_file)
    print(f'Output written to {out_file}.\n')

# Save dataframe containing sample stats. Passing `columns` fixes the column
# order and keeps this working even when no input files were found.
stats_df = pd.DataFrame(stats_records, columns=['name'] + stats_cols).set_index('name')
stats_df.to_csv(f'{out_dir}/stats.csv')
stats_df

-> 21 files to process.
Processing /Users/jkinney/Dropbox/15_mpathic/20_mpathic_redo/20.08.16_mpsa_raw_data/results.brca2_9nt_lib1_rep1.txt
Doing bootstrap reampling:.........
Output written to 20.08.16_mpsa_data/brca2_lib1_rep1.csv.

Processing /Users/jkinney/Dropbox/15_mpathic/20_mpathic_redo/20.08.16_mpsa_raw_data/results.brca2_9nt_lib1_rep2.txt
Doing bootstrap reampling:.........