# Process MPSA data

In [1]:
# Standard imports
import numpy as np
import pandas as pd
from scipy import stats
import re
import sys

# My functions
sys.path.insert(0,'../')
import utils

# Useful variables
# Euler's number, computed as exp(1)
e = np.exp(1)
# Shorthand for pandas' multi-index slicing helper
idx = pd.IndexSlice
# Shorthand for inserting a new axis when indexing arrays.
# NOTE(review): `_` is also the name IPython uses for the last cell output;
# confirm that shadowing it here is intended.
_ = np.newaxis

In [2]:
# Read the motif spreadsheet (first column becomes the index) and keep only
# its 'iupac' column, so motifs can be looked up by name
motifs = pd.read_excel('iupac_motifs.xlsx', index_col=0).loc[:, 'iupac']
motifs

name
wt           AGGA/GUAAGU
cons         NCAG/GUAAGU
neg          NCAG/GGAAGA
ris_iupac    ANGA/GUHDNN
hyp_iupac    NAGA/GUNNNN
ris_min      ANGA/GUADGN
hyp_min      RAGA/GURNGN
ris_max      ANGA/GUHDNN
hyp_max      NANN/GUNNNN
Name: iupac, dtype: object

In [3]:
# Process SMN2 MPSA data
#
# For each input CSV this cell normalizes every replicate column so that the
# median PSI of the consensus sequences is 100, records the per-sequence
# median PSI across replicates, and estimates a log-scale standard error.
# The combined table is written to 'mpsa_data_smn2.csv'.

# Set file names
filenames = ['psi_smn2_dmso.csv',
             'psi_smn2_rg.csv',
             'psi_smn2_nvs.csv']
filenames = ['mpsa/'+f for f in filenames]
print(f'Processing files: {filenames}')

# Names of the median-PSI columns and of their matching uncertainty columns
y_cols = ['smn2_dmso', 'smn2_rg', 'smn2_nvs']
dy_cols = [col+'_log_std' for col in y_cols]

# Minimal (normalized) PSI: replicate values below this are excluded from
# the log-scale uncertainty estimate (they are masked out, not clipped)
min_psi = 1E-2

# Load SMN2 data
for i, filename in enumerate(filenames):
    # e.g. 'mpsa/psi_smn2_dmso.csv' -> 'smn2_dmso'
    col = filename.split('/')[-1].split('.')[0][4:]
    tmp_df = pd.read_csv(filename, sep=',')

    # Rename splice sites: replace T with U, then insert '/' after the
    # fourth character so labels match the layout used in `motifs`
    tmp_df['ss'] = [s.replace('T','U') for s in tmp_df['ss']]
    tmp_df['ss'] = [s[:4]+'/'+s[4:] for s in tmp_df['ss']]
    tmp_df = tmp_df.set_index('ss')

    # Normalize each replicate by the median PSI of the consensus sequences
    cons_ix = utils.motif_to_ix(motifs['cons'], tmp_df.index)
    cons_psi = tmp_df[cons_ix].median(axis=0)
    tmp_df.loc[:,:] = 100*tmp_df.values/cons_psi.values

    # Make header
    if i==0:
        smn2_df = pd.DataFrame(index=tmp_df.index)

    # Take median PSIs across columns
    smn2_df[col] = tmp_df.median(axis=1)

    # Standard error of log(PSI) across replicates, using only the entries
    # that are >= min_psi. Logs are taken only of those entries (the others
    # are replaced by NaN first), so zero or negative PSIs no longer trigger
    # RuntimeWarnings; the masked entries never contribute to the result.
    # Rows with no valid entry end up as NaN.
    psis = tmp_df.values
    mask = psis >= min_psi
    log_psis = np.log(np.where(mask, psis, np.nan))
    std_log = np.std(log_psis, where=mask, axis=1)
    num_entries = mask.sum(axis=1)
    # Wrap in a Series so assignment aligns on the splice-site index
    smn2_df[col+'_log_std'] = pd.Series(std_log/np.sqrt(num_entries),
                                        index=tmp_df.index)

# Normalize AGAIN by median PSI (only the PSI columns; the log-scale
# uncertainties are unaffected by a constant rescaling)
cons_ix = utils.motif_to_ix(motifs['cons'], smn2_df.index)
cons_psi = smn2_df[cons_ix][y_cols].median(axis=0)
smn2_df.loc[:,y_cols] = 100*smn2_df.loc[:,y_cols].values/cons_psi.values

# Verify that median PSI of consensus sequences is 100.0 (up to rounding)
assert np.allclose(smn2_df[cons_ix][y_cols].median(axis=0), 100), \
    'Median consensus PSI is not 100 after renormalization'

# Set sequences
seqs = smn2_df.index.values

# Set known motif indices and view quantities
wt_ix = utils.motif_to_ix(motifs['wt'], seqs)
cons_ix = utils.motif_to_ix(motifs['cons'], seqs)
null_ix = utils.motif_to_ix(motifs['neg'], seqs)

print(f'Num wildtype seqs: {sum(wt_ix)} ')
print(f'Num cons seqs: {sum(cons_ix)} ')
print(f'Num null seqs: {sum(null_ix)} ')

print(f'N: {len(smn2_df)}')

# Save the processed table and preview it
file_name = 'mpsa_data_smn2.csv'
smn2_df.to_csv(file_name)
smn2_df.head()

Processing files: ['mpsa/psi_smn2_dmso.csv', 'mpsa/psi_smn2_rg.csv', 'mpsa/psi_smn2_nvs.csv']
Num wildtype seqs: 1 
Num cons seqs: 4 
Num null seqs: 4 
N: 285


  std_log = np.std(np.log(psis), where=mask, axis=1)
  std_log = np.std(np.log(psis), where=mask, axis=1)
  std_log = np.std(np.log(psis), where=mask, axis=1)


Unnamed: 0_level_0,smn2_dmso,smn2_dmso_log_std,smn2_rg,smn2_rg_log_std,smn2_nvs,smn2_nvs_log_std
ss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAAA/GUAAGU,94.816183,0.038251,74.969468,0.043137,81.631149,0.056442
AACA/GUAAGU,86.257933,0.043625,72.047758,0.056629,90.199186,0.06454
AAGA/GUAAAU,0.119696,0.279578,0.0,0.533347,0.384295,0.314739
AAGA/GUAACU,0.208651,0.125712,0.0,0.479768,0.209434,0.23757
AAGA/GUAAGA,0.496079,0.27021,23.553215,0.141298,76.988843,0.059458


In [4]:
# Peek at `tmp_df`, the per-replicate table left over from the last iteration
# of the SMN2 loading loop (its values were normalized in place by that loop).
# This relies on the previous cell having been run first.
tmp_df.head()

Unnamed: 0_level_0,smn2_select_lib1_rep1,smn2_select_lib1_rep2,smn2_select_lib1_rep3,smn2_select_lib2_rep1,smn2_select_lib2_rep2,smn2_select_lib2_rep3,smn2_select_lib3_rep1,smn2_select_lib3_rep2,smn2_select_lib3_rep3
ss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAAA/GUAAGU,64.830254,69.567808,67.147859,99.780946,79.730733,95.209341,100.346012,98.623278,80.62714
AACA/GUAAGU,66.785946,76.057126,61.279862,103.180544,91.150037,103.534293,106.940212,89.089796,72.656092
AAGA/GUAAAU,0.135608,0.379568,0.385956,0.302992,0.414734,0.536284,1.588637,0.0,0.06304
AAGA/GUAACU,0.37106,0.334552,0.897556,0.177374,0.206034,0.080658,0.206858,0.0,0.441645
AAGA/GUAAGA,54.357579,80.418442,58.896132,53.999697,84.281899,85.899626,63.275333,78.432938,76.041931


In [5]:
# Process ELP1 MPSA data
#
# For each input CSV this cell normalizes every replicate column so that the
# median PSI of the consensus sequence(s) is 100 and records the per-sequence
# median PSI across replicates. The combined table is written to
# 'mpsa_data_elp1.csv'.

# Set file names
filenames = ['psi_elp1_dmso.csv',
             'psi_elp1_rg.csv',
             'psi_elp1_nvs.csv',]
filenames = ['mpsa/'+f for f in filenames]
print(f'Processing files: {filenames}')

# Create container to hold data
psi_df = pd.DataFrame()

# Load ELP1 data
locus = 'elp1'
locus_filenames = [name for name in filenames if locus in name]
for i, filename in enumerate(locus_filenames):
    # e.g. 'mpsa/psi_elp1_dmso.csv' -> 'elp1_dmso'
    col = filename.split('/')[-1].split('.')[0][4:]
    tmp_df = pd.read_csv(filename, sep=',')

    # Rename splice sites: replace T with U, then prepend 'A' and insert '/'
    # after the first three original characters, so that labels have four
    # characters before the '/' like the entries in `motifs`
    tmp_df['ss'] = [s.replace('T','U') for s in tmp_df['ss']]
    tmp_df['ss'] = ['A'+s[:3]+'/'+s[3:] for s in tmp_df['ss']]
    tmp_df = tmp_df.set_index('ss')

    # Normalize each replicate by the median PSI of the consensus sequence(s)
    cons_ix = utils.motif_to_ix(motifs['cons'], tmp_df.index)
    cons_psi = tmp_df[cons_ix].median(axis=0)
    tmp_df.loc[:,:] = 100*tmp_df.values/cons_psi.values

    # Take median PSIs across columns
    if i==0:
        out_df = pd.DataFrame(index=tmp_df.index)
    out_df[col] = tmp_df.median(axis=1)

# Normalize AGAIN by median PSI
cons_ix = utils.motif_to_ix(motifs['cons'], out_df.index)
cons_psi = out_df[cons_ix].median(axis=0)
out_df.loc[:,:] = 100*out_df.values/cons_psi.values

# Verify that median PSI of consensus sequences is 100.0. A tolerance-based
# comparison is used because exact float equality can fail from rounding
# when the median is taken over more than one consensus row.
assert np.allclose(out_df[cons_ix].median(axis=0), 100), \
    'Median consensus PSI is not 100 after renormalization'

# Merge into psi_df (outer join on the splice-site index)
psi_df = pd.merge(left=psi_df,
                  left_index=True,
                  right=out_df,
                  right_index=True,
                  how='outer')

# Set sequences
seqs = psi_df.index.values

# Set known motif indices and view quantities
wt_ix = utils.motif_to_ix(motifs['wt'], seqs)
cons_ix = utils.motif_to_ix(motifs['cons'], seqs)
null_ix = utils.motif_to_ix(motifs['neg'], seqs)

print(f'Num wildtype seqs: {sum(wt_ix)} ')
print(f'Num cons seqs: {sum(cons_ix)} ')
print(f'Num null seqs: {sum(null_ix)} ')
print(f'N: {len(psi_df)}')

# Save the processed table and preview it
file_name = 'mpsa_data_elp1.csv'
psi_df.to_csv(file_name)
psi_df.head()

Processing files: ['mpsa/psi_elp1_dmso.csv', 'mpsa/psi_elp1_rg.csv', 'mpsa/psi_elp1_nvs.csv']
Num wildtype seqs: 1 
Num cons seqs: 1 
Num null seqs: 0 
N: 30807


Unnamed: 0_level_0,elp1_dmso,elp1_rg,elp1_nvs
ss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAAA/GCAAAA,0.0,0.025349,0.108121
AAAA/GCAAAG,0.0,0.0,0.0
AAAA/GCAAGA,0.0,0.0,0.0
AAAA/GCAAGG,0.0,0.004373,0.0
AAAA/GCAAUC,0.0,0.0,0.0
